ingestify 0.3.3-py3-none-any.whl → 0.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ingestify/__init__.py CHANGED
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
  from .infra import retrieve_http
  from .source_base import Source, DatasetResource

- __version__ = "0.3.3"
+ __version__ = "0.4.0"
@@ -58,8 +58,7 @@ class DatasetStore:
  self.event_bus.dispatch(event)

  def save_ingestion_job_summary(self, ingestion_job_summary):
- self.dataset_repository.session.add(ingestion_job_summary)
- self.dataset_repository.session.commit()
+ self.dataset_repository.save_ingestion_job_summary(ingestion_job_summary)

  def get_dataset_collection(
  self,
@@ -271,6 +270,7 @@ class DatasetStore:
  metadata=metadata,
  created_at=now,
  updated_at=now,
+ last_modified_at=None,  # Not known at this moment
  )
  revision = self.add_revision(dataset, files, revision_source, description)

@@ -298,8 +298,8 @@ class DatasetStore:
  )

  loaded_file = LoadedFile(
- _stream=get_stream if lazy else get_stream(file),
- **asdict(file),
+ stream_=get_stream if lazy else get_stream(file),
+ **file.model_dump(),
  )
  files[file.file_id] = loaded_file
  return FileCollection(files, auto_rewind=auto_rewind)
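Note: the switch from `asdict(file)` to `file.model_dump()` follows the wider move in 0.4.0 from dataclass-backed domain models to plain pydantic models (see the `BaseModel` change further down). A minimal sketch of the equivalence, using a made-up `FileInfo` model rather than ingestify's actual `File`:

```python
from datetime import datetime, timezone
from pydantic import BaseModel

class FileInfo(BaseModel):  # made-up stand-in for ingestify's File model
    file_id: str
    modified_at: datetime

f = FileInfo(file_id="events.json", modified_at=datetime.now(timezone.utc))

# In pydantic v2, model_dump() fills the role dataclasses.asdict() played before:
# a plain dict of field name -> value that can be splatted into another
# constructor, as in LoadedFile(stream_=..., **file.model_dump()) above.
print(f.model_dump())
```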
@@ -21,8 +21,13 @@ class IngestionEngine:
  def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
  self.loader.add_ingestion_plan(ingestion_plan)

- def load(self, dry_run: bool = False, provider: Optional[str] = None):
- self.loader.collect_and_run(dry_run=dry_run, provider=provider)
+ def load(
+ self,
+ dry_run: bool = False,
+ provider: Optional[str] = None,
+ source: Optional[str] = None,
+ ):
+ self.loader.collect_and_run(dry_run=dry_run, provider=provider, source=source)

  def list_datasets(self, as_count: bool = False):
  """Consider moving this to DataStore"""
@@ -29,7 +29,12 @@ class Loader:
  def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
  self.ingestion_plans.append(ingestion_plan)

- def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
+ def collect_and_run(
+ self,
+ dry_run: bool = False,
+ provider: Optional[str] = None,
+ source: Optional[str] = None,
+ ):
  # First collect all selectors, before discovering datasets
  selectors = {}
  for ingestion_plan in self.ingestion_plans:
@@ -42,6 +47,13 @@ class Loader:
  )
  continue

+ if source is not None:
+ if ingestion_plan.source.name != source:
+ logger.info(
+ f"Skipping {ingestion_plan} because source doesn't match '{source}'"
+ )
+ continue
+
  static_selectors = [
  selector
  for selector in ingestion_plan.selectors
@@ -60,6 +72,7 @@ class Loader:

  # TODO: consider making this lazy and fetch once per Source instead of
  # once per IngestionPlan
+ # TODO: Log exception when `discover_selectors` fails
  all_selectors = ingestion_plan.source.discover_selectors(
  ingestion_plan.dataset_type
  )
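Note: `load()` and `collect_and_run()` gain a `source` argument that, like the existing `provider` filter, skips every `IngestionPlan` whose `source.name` doesn't match. A rough usage sketch; how the engine is obtained is outside this diff, and the source name is made up:

```python
from typing import Optional

def load_single_source(engine, source: Optional[str] = None, dry_run: bool = False) -> None:
    """Illustrative helper: `engine` is an already-configured IngestionEngine."""
    # Plans whose source.name differs are skipped with a log line; the
    # provider filter still works the same way and can be combined with this.
    engine.load(dry_run=dry_run, source=source)

# e.g. load_single_source(engine, source="statsbomb", dry_run=True)
# The equivalent CLI flag added below in ingestify/cmdline.py is: --source statsbomb
```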
ingestify/cmdline.py CHANGED
@@ -58,7 +58,14 @@ def cli():
  help="bucket",
  type=str,
  )
- @click.option("--debug", "debug", required=False, help="Debugging enabled", type=bool)
+ @click.option(
+ "--debug",
+ "debug",
+ required=False,
+ help="Debugging enabled",
+ is_flag=True,
+ type=bool,
+ )
  @click.option(
  "--dry-run",
  "dry_run",
@@ -74,11 +81,19 @@ def cli():
  help="Provider - only run tasks for a single provider",
  type=str,
  )
+ @click.option(
+ "--source",
+ "source",
+ required=False,
+ help="Source - only run tasks for a single source",
+ type=str,
+ )
  def run(
  config_file: str,
  bucket: Optional[str],
  dry_run: Optional[bool],
  provider: Optional[str],
+ source: Optional[str],
  debug: Optional[bool],
  ):
  try:
@@ -90,7 +105,10 @@ def run(
  logger.exception(f"Failed due a configuration error: {e}")
  sys.exit(1)

- engine.load(dry_run=dry_run, provider=provider)
+ if debug:
+ logging.getLogger("root").setLevel(logging.DEBUG)
+
+ engine.load(dry_run=dry_run, provider=provider, source=source)

  logger.info("Done")

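Note: declaring `--debug` with `is_flag=True` lets it be passed bare on the command line instead of requiring an explicit boolean value. A minimal, standalone click example of the same pattern (not ingestify's actual CLI):

```python
import click

@click.command()
@click.option("--debug", "debug", is_flag=True, help="Debugging enabled")
def demo(debug: bool):
    # With is_flag=True, `demo --debug` sets debug=True and omitting the flag
    # gives False; without is_flag, click would expect a value such as `--debug true`.
    click.echo(f"debug={debug}")

if __name__ == "__main__":
    demo()
```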
@@ -1,22 +1,5 @@
- from functools import partial
- from typing import ClassVar, Any, Optional
-
- import pydantic
  from pydantic import BaseModel as PydanticBaseModel, ConfigDict


- # class BaseModel(PydanticBaseModel):
- # model_config = ConfigDict(arbitrary_types_allowed=True)
- #
- # _sa_instance_state: Optional[dict] = None
- from sqlalchemy.orm import MappedAsDataclass
-
-
- class BaseModel(
- MappedAsDataclass,
- # DeclarativeBase,
- dataclass_callable=partial(
- pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
- ),
- ):
- pass
+ class BaseModel(PydanticBaseModel):
+ model_config = ConfigDict(arbitrary_types_allowed=True, from_attributes=True)
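Note: with the SQLAlchemy `MappedAsDataclass` base removed above, `from_attributes=True` is what lets these pydantic models still be built straight from attribute-bearing objects such as ORM rows, via `model_validate`. A small illustration with made-up classes:

```python
from pydantic import BaseModel, ConfigDict

class DatasetRow:  # stand-in for e.g. a SQLAlchemy-mapped row object
    def __init__(self):
        self.name = "match-123"
        self.provider = "statsbomb"

class DatasetModel(BaseModel):  # illustrative, not ingestify's actual Dataset
    model_config = ConfigDict(from_attributes=True)
    name: str
    provider: str

# from_attributes lets model_validate read plain attributes instead of requiring a dict
dataset = DatasetModel.model_validate(DatasetRow())
print(dataset.name)  # "match-123"
```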
@@ -6,7 +6,8 @@ from typing import Optional
  @dataclass
  class DatasetCollectionMetadata:
  # This can be useful to figure out if a backfill is required
- first_modified: Optional[datetime]
+ # TODO - Note: not stored at Dataset level and requires joined query to retrieve
+ # first_modified: Optional[datetime]

  # Use the last modified to only retrieve datasets that are changed
  last_modified: Optional[datetime]
@@ -1,7 +1,7 @@
  from datetime import datetime
  from enum import Enum
  from typing import List, Optional
- from pydantic import Field
+ from pydantic import Field, field_validator

  from ingestify.utils import utcnow
  from .dataset_state import DatasetState
@@ -22,7 +22,17 @@ class Dataset(BaseModel):
  metadata: dict
  created_at: datetime
  updated_at: datetime
+
  revisions: List[Revision] = Field(default_factory=list)
+ # The last_modified_at is equal to the max modified_at of all files in all revisions
+ last_modified_at: Optional[datetime]
+
+ @field_validator("identifier", mode="before")
+ @classmethod
+ def parse_identifier(cls, value):
+ if not isinstance(value, Identifier):
+ return Identifier(value)
+ return value

  @property
  def is_complete(self):
@@ -35,6 +45,13 @@ class Dataset(BaseModel):
  self.revisions.append(revision)
  self.updated_at = utcnow()

+ if self.last_modified_at:
+ self.last_modified_at = max(
+ self.last_modified_at, revision.last_modified_at
+ )
+ else:
+ self.last_modified_at = revision.last_modified_at
+
  def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
  changed = False
  if self.name != name:
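Note: `Dataset` gains a `last_modified_at` field that `add_revision` keeps equal to the maximum `modified_at` across all files in all revisions, and a `mode="before"` validator that coerces raw identifier values into `Identifier` before field validation. A minimal standalone sketch of that coercion pattern, with a made-up `Identifier` stand-in:

```python
from pydantic import BaseModel, ConfigDict, field_validator

class Identifier:
    """Stand-in for ingestify's Identifier; the real one is richer."""
    def __init__(self, values: dict):
        self.values = dict(values)

class Model(BaseModel):  # illustrative only
    model_config = ConfigDict(arbitrary_types_allowed=True)
    identifier: Identifier

    @field_validator("identifier", mode="before")
    @classmethod
    def parse_identifier(cls, value):
        # mode="before" runs on the raw input, so plain dicts coming from
        # storage (or from_attributes objects) are wrapped into Identifier
        # before the isinstance check that arbitrary_types_allowed performs.
        if not isinstance(value, Identifier):
            return Identifier(value)
        return value

m = Model(identifier={"match_id": 123})
assert isinstance(m.identifier, Identifier)
```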
@@ -116,18 +116,18 @@ class LoadedFile(BaseModel):
  data_serialization_format: Optional[str]  # Example: 'json'
  storage_compression_method: Optional[str]  # Example: 'gzip'
  storage_path: Path
- _stream: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
+ stream_: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
  revision_id: Optional[int] = None  # This can be used when a Revision is squashed

  def load_stream(self):
- if callable(self._stream):
- self._stream = self._stream(self)
+ if callable(self.stream_):
+ self.stream_ = self.stream_(self)

  @property
  def stream(self):
- if callable(self._stream):
+ if callable(self.stream_):
  raise Exception("You should load the stream first using `load_stream`")
- return self._stream
+ return self.stream_


  __all__ = ["File", "DraftFile", "LoadedFile"]
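Note: the `_stream` → `stream_` rename is likely forced by the pydantic migration: pydantic treats attributes whose names start with an underscore as private attributes rather than model fields, so `_stream` would no longer be a declared field. A minimal illustration with a made-up model:

```python
from io import BytesIO
from pydantic import BaseModel, ConfigDict

class Demo(BaseModel):  # made-up model, not ingestify's LoadedFile
    model_config = ConfigDict(arbitrary_types_allowed=True)
    # A name like `_stream` would become a pydantic private attribute and be
    # dropped from the declared fields; a trailing underscore keeps it a field.
    stream_: BytesIO

print(list(Demo.model_fields))  # ['stream_']
Demo(stream_=BytesIO(b"payload"))
```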
@@ -1,6 +1,6 @@
  from datetime import datetime
  from enum import Enum
- from typing import Dict, List
+ from typing import Dict, List, Optional

  from typing_extensions import TypedDict

@@ -32,10 +32,14 @@ class Revision(BaseModel):
  created_at: datetime
  description: str
  modified_files: List[File]
- source: RevisionSource
+ source: Optional[RevisionSource]
  is_squashed: bool = False
  state: RevisionState = RevisionState.PENDING_VALIDATION

+ @property
+ def last_modified_at(self):
+ return max(file.modified_at for file in self.modified_files)
+
  @property
  def modified_files_map(self) -> Dict[str, File]:
  return {file.file_id: file for file in self.modified_files}
@@ -2,6 +2,7 @@ import itertools
  import json
  import logging
  import uuid
+ from enum import Enum
  from typing import Optional, Iterator

  from ingestify import retrieve_http
@@ -17,6 +18,7 @@ from ingestify.domain.models.resources.dataset_resource import (
  DatasetResource,
  )
  from ingestify.domain.models.task.task_summary import TaskSummary
+ from ingestify.exceptions import SaveError
  from ingestify.utils import TaskExecutor, chunker

  logger = logging.getLogger(__name__)
@@ -120,21 +122,27 @@ class UpdateDatasetTask(Task):
  with TaskSummary.update(
  self.task_id, dataset_identifier=dataset_identifier
  ) as task_summary:
- revision = self.store.update_dataset(
- dataset=self.dataset,
- name=self.dataset_resource.name,
- state=self.dataset_resource.state,
- metadata=self.dataset_resource.metadata,
- files={
- file_id: task_summary.record_load_file(
- lambda: load_file(file_resource, dataset=self.dataset),
- metadata={"file_id": file_id},
- )
- for file_id, file_resource in self.dataset_resource.files.items()
- },
- revision_source=revision_source,
- )
- task_summary.set_stats_from_revision(revision)
+
+ files = {
+ file_id: task_summary.record_load_file(
+ lambda: load_file(file_resource, dataset=self.dataset),
+ metadata={"file_id": file_id},
+ )
+ for file_id, file_resource in self.dataset_resource.files.items()
+ }
+
+ try:
+ revision = self.store.update_dataset(
+ dataset=self.dataset,
+ name=self.dataset_resource.name,
+ state=self.dataset_resource.state,
+ metadata=self.dataset_resource.metadata,
+ files=files,
+ revision_source=revision_source,
+ )
+ task_summary.set_stats_from_revision(revision)
+ except Exception as e:
+ raise SaveError("Could not update dataset") from e

  return task_summary

@@ -159,24 +167,28 @@ class CreateDatasetTask(Task):
  )

  with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
- revision = self.store.create_dataset(
- dataset_type=self.dataset_resource.dataset_type,
- provider=self.dataset_resource.provider,
- dataset_identifier=dataset_identifier,
- name=self.dataset_resource.name,
- state=self.dataset_resource.state,
- metadata=self.dataset_resource.metadata,
- files={
- file_id: task_summary.record_load_file(
- lambda: load_file(file_resource, dataset=None),
- metadata={"file_id": file_id},
- )
- for file_id, file_resource in self.dataset_resource.files.items()
- },
- revision_source=revision_source,
- )
+ files = {
+ file_id: task_summary.record_load_file(
+ lambda: load_file(file_resource, dataset=None),
+ metadata={"file_id": file_id},
+ )
+ for file_id, file_resource in self.dataset_resource.files.items()
+ }
+ try:
+ revision = self.store.create_dataset(
+ dataset_type=self.dataset_resource.dataset_type,
+ provider=self.dataset_resource.provider,
+ dataset_identifier=dataset_identifier,
+ name=self.dataset_resource.name,
+ state=self.dataset_resource.state,
+ metadata=self.dataset_resource.metadata,
+ files=files,
+ revision_source=revision_source,
+ )

- task_summary.set_stats_from_revision(revision)
+ task_summary.set_stats_from_revision(revision)
+ except Exception as e:
+ raise SaveError("Could not create dataset") from e

  return task_summary

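Note: both tasks now download the files first and wrap only the store call in a try/except that re-raises as `SaveError` with `from e`, so storage failures are distinguishable from download failures and keep the original traceback attached. A small self-contained sketch of the chaining pattern (the `SaveError` below is a stand-in for `ingestify.exceptions.SaveError`):

```python
class SaveError(Exception):
    """Stand-in; the real class lives in ingestify.exceptions."""

def save(dataset):
    raise RuntimeError("unique constraint violated")  # simulated storage failure

try:
    try:
        save({"id": 1})
    except Exception as e:
        # `raise ... from e` keeps the original error attached as __cause__,
        # so the failing store call is visible in the traceback chain.
        raise SaveError("Could not update dataset") from e
except SaveError as err:
    print(type(err.__cause__).__name__, err.__cause__)  # RuntimeError unique constraint violated
```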
@@ -209,6 +221,7 @@ class IngestionJob:
  with ingestion_job_summary.record_timing("get_dataset_collection"):
  dataset_collection_metadata = store.get_dataset_collection(
  dataset_type=self.ingestion_plan.dataset_type,
+ provider=self.ingestion_plan.source.provider,
  data_spec_versions=self.selector.data_spec_versions,
  selector=self.selector,
  metadata_only=True,
@@ -218,27 +231,38 @@ class IngestionJob:
  # There are two different, but similar flows here:
  # 1. The discover_datasets returns a list, and the entire list can be processed at once
  # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
- with ingestion_job_summary.record_timing("find_datasets"):
- # Timing might be incorrect as it is an iterator
- dataset_resources = self.ingestion_plan.source.find_datasets(
- dataset_type=self.ingestion_plan.dataset_type,
- data_spec_versions=self.selector.data_spec_versions,
- dataset_collection_metadata=dataset_collection_metadata,
- **self.selector.custom_attributes,
- )
+ try:
+ with ingestion_job_summary.record_timing("find_datasets"):
+ dataset_resources = self.ingestion_plan.source.find_datasets(
+ dataset_type=self.ingestion_plan.dataset_type,
+ data_spec_versions=self.selector.data_spec_versions,
+ dataset_collection_metadata=dataset_collection_metadata,
+ **self.selector.custom_attributes,
+ )

- finish_task_timer = ingestion_job_summary.start_timing("tasks")
+ # We need to include the to_batches as that will start the generator
+ batches = to_batches(dataset_resources)
+ except Exception as e:
+ logger.exception("Failed to find datasets")

- batches = to_batches(dataset_resources)
+ ingestion_job_summary.set_exception(e)
+ yield ingestion_job_summary
+ return
+
+ finish_task_timer = ingestion_job_summary.start_timing("tasks")

  while True:
  try:
  batch = next(batches)
  except StopIteration:
  break
- except Exception:
- # TODO: handle exception on IngestionJob level
- raise
+ except Exception as e:
+ logger.exception("Failed to fetch next batch")
+
+ finish_task_timer()
+ ingestion_job_summary.set_exception(e)
+ yield ingestion_job_summary
+ return

  dataset_identifiers = [
  Identifier.create_from_selector(
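Note: instead of letting a failing source abort the whole run, the job now logs the exception, marks the summary as failed via `set_exception`, yields it, and returns. A hypothetical consumer shape; the `run(store)` signature and the saving step are assumptions for illustration, while the yielded summaries, `set_exception()` and the RUNNING/FINISHED/FAILED states come from this diff:

```python
import logging

logger = logging.getLogger(__name__)

def drain_job(job, store):
    """Illustrative consumer: `job` is an IngestionJob, `store` a DatasetStore."""
    for summary in job.run(store):  # assumed method name/signature
        store.save_ingestion_job_summary(summary)
        # IngestionJobState is a str-Enum, so comparing against the raw value works
        if summary.state == "FAILED":
            logger.warning("Ingestion job %s failed", summary.ingestion_job_id)
```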
@@ -1,24 +1,31 @@
  import uuid
  from contextlib import contextmanager
  from datetime import datetime, timedelta
+ from enum import Enum
  from typing import Optional, List, TYPE_CHECKING
  from pydantic import Field

  from ingestify.domain import Selector, DataSpecVersionCollection
  from ingestify.domain.models.base import BaseModel
- from ingestify.domain.models.task.task_summary import TaskSummary, TaskStatus
+ from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
  from ingestify.domain.models.timing import Timing
- from ingestify.utils import utcnow
+ from ingestify.utils import utcnow, HasTiming

  if TYPE_CHECKING:
  from ingestify.domain.models.ingestion.ingestion_job import IngestionJob


+ class IngestionJobState(str, Enum):
+ RUNNING = "RUNNING"
+ FINISHED = "FINISHED"
+ FAILED = "FAILED"
+
+
  def format_duration(duration: timedelta):
  return f"{duration.total_seconds():.2f}sec"


- class IngestionJobSummary(BaseModel):
+ class IngestionJobSummary(BaseModel, HasTiming):
  ingestion_job_summary_id: str
  ingestion_job_id: str

@@ -30,8 +37,8 @@ class IngestionJobSummary(BaseModel):
  selector: Selector

  started_at: datetime = Field(default_factory=utcnow)
- finished_at: Optional[datetime] = None
- timings: List[Timing] = Field(default_factory=list)
+ ended_at: Optional[datetime] = None
+ state: IngestionJobState = IngestionJobState.RUNNING
  task_summaries: List[TaskSummary] = Field(default_factory=list)

  skipped_datasets: int = 0
@@ -52,20 +59,6 @@ class IngestionJobSummary(BaseModel):
  )
  return cls(**args)

- @contextmanager
- def record_timing(self, name: str):
- start = utcnow()
- yield
- self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
- def start_timing(self, name):
- start = utcnow()
-
- def finish():
- self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
- return finish
-
  def add_task_summaries(self, task_summaries: List[TaskSummary]):
  self.task_summaries.extend(task_summaries)

@@ -75,46 +68,61 @@ class IngestionJobSummary(BaseModel):
  def task_count(self):
  return len(self.task_summaries)

- def set_finished(self):
+ def _set_ended(self):
  self.failed_tasks = len(
- [task for task in self.task_summaries if task.status == TaskStatus.FAILED]
+ [task for task in self.task_summaries if task.state == TaskState.FAILED]
  )
  self.successful_tasks = len(
- [task for task in self.task_summaries if task.status == TaskStatus.FINISHED]
+ [task for task in self.task_summaries if task.state == TaskState.FINISHED]
  )
  self.ignored_successful_tasks = len(
  [
  task
  for task in self.task_summaries
- if task.status == TaskStatus.FINISHED_IGNORED
+ if task.state == TaskState.FINISHED_IGNORED
  ]
  )
- self.finished_at = utcnow()
+ self.ended_at = utcnow()
+
+ # Only keep failed tasks. Rest isn't interesting
+ self.task_summaries = [
+ task for task in self.task_summaries if task.state == TaskState.FAILED
+ ]
+
+ def set_finished(self):
+ self.state = IngestionJobState.FINISHED
+ self._set_ended()
+
+ def set_exception(self, e: Exception):
+ self.state = IngestionJobState.FAILED
+ self._set_ended()

  @property
  def duration(self) -> timedelta:
- return self.finished_at - self.started_at
+ return self.ended_at - self.started_at

  def output_report(self):
- print(f"\nIngestionJobSummary finished in {format_duration(self.duration)}")
- print("--------------------")
- print(f" - IngestionPlan:")
- print(f" Source: {self.source_name}")
- print(f" Provider: {self.provider}")
- print(f" DatasetType: {self.dataset_type}")
- print(f" - Selector: {self.selector}")
- print(f" - Timings: ")
+ print(
+ f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
+ )
+ print("********************************")
+ print(f"* - IngestionPlan:")
+ print(f"* Source: {self.source_name}")
+ print(f"* Provider: {self.provider}")
+ print(f"* DatasetType: {self.dataset_type}")
+ print(f"* - Selector: {self.selector}")
+ print(f"* - Timings: ")
  for timing in self.timings:
- print(f" - {timing.name}: {format_duration(timing.duration)}")
+ print(f"* - {timing.name}: {format_duration(timing.duration)}")
  print(
- f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+ f"* - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
  )

- print(f" - Failed tasks: {self.failed_tasks}")
- print(f" - Successful tasks: {self.successful_tasks}")
- print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
- print(f" - Skipped datasets: {self.skipped_datasets}")
- print("--------------------")
+ print(f"* - Failed tasks: {self.failed_tasks}")
+ print(f"* - Successful tasks: {self.successful_tasks}")
+ print(f"* - Successful ignored tasks: {self.ignored_successful_tasks}")
+ print(f"* - Skipped datasets: {self.skipped_datasets}")
+ print("********************************")

  def __enter__(self):
  return self
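Note: `record_timing`/`start_timing` disappear from `IngestionJobSummary`, and both summary classes now mix in `HasTiming` from `ingestify.utils`. Its implementation is not part of this diff, so the following is only a guess at its shape, inferred from the call sites in this diff (`record_timing(name)`, `record_timing(name, metadata)`, and `start_timing(name)` returning a finish callback); the real `ingestify.utils.HasTiming` may be declared differently, in particular how the `timings` field is provided:

```python
from contextlib import contextmanager
from datetime import datetime, timezone
from typing import List, Optional

from pydantic import BaseModel, Field

class Timing(BaseModel):  # stand-in for ingestify.domain.models.timing.Timing
    name: str
    started_at: datetime
    ended_at: datetime
    metadata: Optional[dict] = None

class HasTiming(BaseModel):  # guessed shape, not the actual ingestify.utils.HasTiming
    timings: List[Timing] = Field(default_factory=list)

    @contextmanager
    def record_timing(self, name: str, metadata: Optional[dict] = None):
        # Times the body of the `with` block and appends a Timing entry.
        start = datetime.now(timezone.utc)
        yield
        self.timings.append(
            Timing(name=name, started_at=start, ended_at=datetime.now(timezone.utc), metadata=metadata)
        )

    def start_timing(self, name: str):
        # Returns a callback that closes the timing when invoked.
        start = datetime.now(timezone.utc)

        def finish():
            self.timings.append(
                Timing(name=name, started_at=start, ended_at=datetime.now(timezone.utc))
            )

        return finish

class DemoSummary(HasTiming):  # the real classes mix HasTiming into ingestify's BaseModel
    pass

s = DemoSummary()
with s.record_timing("find_datasets"):
    pass  # timed work goes here
print(len(s.timings))  # 1
```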
@@ -10,13 +10,12 @@ from ingestify.domain.models.base import BaseModel
  from ingestify.domain.models.dataset.identifier import Identifier
  from ingestify.domain.models.timing import Timing
  from ingestify.exceptions import IngestifyError
- from ingestify.utils import utcnow
-
+ from ingestify.utils import utcnow, HasTiming

  logger = logging.getLogger(__name__)


- class TaskStatus(str, Enum):
+ class TaskState(str, Enum):
  RUNNING = "RUNNING"
  FINISHED = "FINISHED"
  FINISHED_IGNORED = "FINISHED_IGNORED"  # Finished, but didn't produce any new data
@@ -28,7 +27,7 @@ class Operation(str, Enum):
  UPDATE = "UPDATE"


- class TaskSummary(BaseModel):
+ class TaskSummary(BaseModel, HasTiming):
  task_id: str
  started_at: datetime
  operation: Operation
@@ -37,8 +36,7 @@ class TaskSummary(BaseModel):
  persisted_file_count: int = 0
  bytes_retrieved: int = 0
  last_modified: Optional[datetime] = None
- status: TaskStatus = TaskStatus.RUNNING
- timings: List[Timing] = Field(default_factory=list)
+ state: TaskState = TaskState.RUNNING

  @field_validator("dataset_identifier", mode="before")
  @classmethod
@@ -48,27 +46,8 @@ class TaskSummary(BaseModel):
  return value

  def record_load_file(self, fn, metadata: dict):
- start = utcnow()
- try:
- result = None
+ with self.record_timing(f"Load of {metadata.get('file_id', 'file')}", metadata):
  return fn()
- except Exception as e:
- result = {
- "type": type(e).__name__,
- "message": str(e),
- "traceback": traceback.format_exc(),
- }
- raise e
- finally:
- metadata = dict(result=result, **metadata)
- self.timings.append(
- Timing(
- name=f"Load of {metadata.get('file_id', 'file')}",
- started_at=start,
- ended_at=utcnow(),
- metadata=metadata,
- )
- )

  @classmethod
  @contextmanager
@@ -83,10 +62,10 @@ class TaskSummary(BaseModel):
  try:
  yield task_summary

- task_summary.set_status(TaskStatus.FINISHED)
+ task_summary.set_state(TaskState.FINISHED)
  except Exception as e:
  logger.exception(f"Failed to execute task.")
- task_summary.set_status(TaskStatus.FAILED)
+ task_summary.set_state(TaskState.FAILED)

  # When the error comes from our own code, make sure it will be raised to the highest level
  # raise
@@ -111,8 +90,8 @@ class TaskSummary(BaseModel):
  file.modified_at for file in revision.modified_files
  )
  else:
- self.status = TaskStatus.FINISHED_IGNORED
+ self.state = TaskState.FINISHED_IGNORED

- def set_status(self, status: TaskStatus):
- if self.status == TaskStatus.RUNNING:
- self.status = status
+ def set_state(self, state: TaskState):
+ if self.state == TaskState.RUNNING:
+ self.state = state