ingestify 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. ingestify/__init__.py +1 -1
  2. ingestify/application/dataset_store.py +44 -24
  3. ingestify/application/ingestion_engine.py +3 -3
  4. ingestify/application/loader.py +67 -237
  5. ingestify/domain/models/__init__.py +1 -6
  6. ingestify/domain/models/base.py +22 -0
  7. ingestify/domain/models/data_spec_version_collection.py +6 -0
  8. ingestify/domain/models/dataset/__init__.py +3 -5
  9. ingestify/domain/models/dataset/dataset.py +15 -32
  10. ingestify/domain/models/dataset/dataset_repository.py +1 -15
  11. ingestify/domain/models/dataset/dataset_state.py +11 -0
  12. ingestify/domain/models/dataset/events.py +6 -16
  13. ingestify/domain/models/dataset/file.py +21 -34
  14. ingestify/domain/models/dataset/file_collection.py +3 -1
  15. ingestify/domain/models/dataset/file_repository.py +1 -10
  16. ingestify/domain/models/dataset/revision.py +26 -3
  17. ingestify/domain/models/event/domain_event.py +8 -4
  18. ingestify/domain/models/ingestion/__init__.py +0 -0
  19. ingestify/domain/models/ingestion/ingestion_job.py +292 -0
  20. ingestify/domain/models/ingestion/ingestion_job_summary.py +106 -0
  21. ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
  22. ingestify/domain/models/resources/dataset_resource.py +29 -37
  23. ingestify/domain/models/sink.py +1 -8
  24. ingestify/domain/models/task/task.py +3 -1
  25. ingestify/domain/models/task/task_summary.py +118 -0
  26. ingestify/domain/models/timing.py +16 -0
  27. ingestify/infra/fetch/http.py +5 -0
  28. ingestify/infra/source/statsbomb_github.py +67 -54
  29. ingestify/infra/store/dataset/__init__.py +0 -2
  30. ingestify/infra/store/dataset/sqlalchemy/mapping.py +184 -4
  31. ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -22
  32. ingestify/main.py +42 -22
  33. ingestify/utils.py +15 -78
  34. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/METADATA +2 -1
  35. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/RECORD +38 -32
  36. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/WHEEL +1 -1
  37. ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
  38. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/entry_points.txt +0 -0
  39. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/top_level.txt +0 -0

ingestify/domain/models/ingestion/ingestion_job.py (new file)
@@ -0,0 +1,292 @@
+ import itertools
+ import json
+ import logging
+ import uuid
+ from typing import Optional
+
+ from ingestify import retrieve_http
+ from ingestify.application.dataset_store import DatasetStore
+ from ingestify.domain import Selector, Identifier, TaskSet, Dataset, DraftFile, Task
+ from ingestify.domain.models.dataset.revision import RevisionSource, SourceType
+ from ingestify.domain.models.ingestion.ingestion_job_summary import (
+     IngestionJobSummary,
+ )
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+ from ingestify.domain.models.resources.dataset_resource import (
+     FileResource,
+     DatasetResource,
+ )
+ from ingestify.domain.models.task.task_summary import TaskSummary
+ from ingestify.utils import TaskExecutor, chunker
+
+ logger = logging.getLogger(__name__)
+
+
+ DEFAULT_CHUNK_SIZE = 1000
+
+
+ def run_task(task):
+     logger.info(f"Running task {task}")
+     return task.run()
+
+
+ def to_batches(input_):
+     if isinstance(input_, list):
+         batches = [input_]
+     else:
+         # Assume it's an iterator. Peek what's inside, and put it back
+         try:
+             peek = next(input_)
+         except StopIteration:
+             # Nothing to batch
+             return []
+
+         input_ = itertools.chain([peek], input_)
+
+         if not isinstance(peek, list):
+             batches = chunker(input_, DEFAULT_CHUNK_SIZE)
+         else:
+             batches = input_
+     return batches
+
+
+ def load_file(
+     file_resource: FileResource, dataset: Optional[Dataset] = None
+ ) -> Optional[DraftFile]:
+     current_file = None
+     if dataset:
+         current_file = dataset.current_revision.modified_files_map.get(
+             file_resource.file_id
+         )
+
+     if file_resource.json_content is not None:
+         # Empty dictionary is allowed
+         file = DraftFile.from_input(
+             file_=json.dumps(file_resource.json_content, indent=4),
+             data_serialization_format="json",
+             data_feed_key=file_resource.data_feed_key,
+             data_spec_version=file_resource.data_spec_version,
+             modified_at=file_resource.last_modified,
+         )
+         if current_file and current_file.tag == file.tag:
+             # Nothing changed
+             return None
+         return file
+     elif file_resource.url:
+         http_options = {}
+         if file_resource.http_options:
+             for k, v in file_resource.http_options.items():
+                 http_options[f"http_{k}"] = v
+
+         return retrieve_http(
+             url=file_resource.url,
+             current_file=current_file,
+             file_data_feed_key=file_resource.data_feed_key,
+             file_data_spec_version=file_resource.data_spec_version,
+             file_data_serialization_format=file_resource.data_serialization_format
+             or "txt",
+             last_modified=file_resource.last_modified,
+             **http_options,
+             **file_resource.loader_kwargs,
+         )
+     else:
+         return file_resource.file_loader(
+             file_resource,
+             current_file,
+             # TODO: check how to fix this with typehints
+             **file_resource.loader_kwargs,
+         )
+
+
+ class UpdateDatasetTask(Task):
+     def __init__(
+         self,
+         dataset: Dataset,
+         dataset_resource: DatasetResource,
+         store: DatasetStore,
+     ):
+         self.dataset = dataset
+         self.dataset_resource = dataset_resource
+         self.store = store
+         self.task_id = str(uuid.uuid1())
+
+     def run(self):
+         dataset_identifier = Identifier(**self.dataset_resource.dataset_resource_id)
+
+         revision_source = RevisionSource(
+             source_id=self.task_id, source_type=SourceType.TASK
+         )
+
+         with TaskSummary.update(
+             self.task_id, dataset_identifier=dataset_identifier
+         ) as task_summary:
+             revision = self.store.update_dataset(
+                 dataset=self.dataset,
+                 name=self.dataset_resource.name,
+                 state=self.dataset_resource.state,
+                 metadata=self.dataset_resource.metadata,
+                 files={
+                     file_id: task_summary.record_load_file(
+                         lambda: load_file(file_resource, dataset=self.dataset),
+                         metadata={"file_id": file_id},
+                     )
+                     for file_id, file_resource in self.dataset_resource.files.items()
+                 },
+                 revision_source=revision_source,
+             )
+             task_summary.set_stats_from_revision(revision)
+
+         return task_summary
+
+     def __repr__(self):
+         return f"UpdateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
+
+
+ class CreateDatasetTask(Task):
+     def __init__(
+         self,
+         dataset_resource: DatasetResource,
+         store: DatasetStore,
+     ):
+         self.dataset_resource = dataset_resource
+         self.store = store
+         self.task_id = str(uuid.uuid1())
+
+     def run(self):
+         dataset_identifier = Identifier(**self.dataset_resource.dataset_resource_id)
+         revision_source = RevisionSource(
+             source_id=self.task_id, source_type=SourceType.TASK
+         )
+
+         with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
+             revision = self.store.create_dataset(
+                 dataset_type=self.dataset_resource.dataset_type,
+                 provider=self.dataset_resource.provider,
+                 dataset_identifier=dataset_identifier,
+                 name=self.dataset_resource.name,
+                 state=self.dataset_resource.state,
+                 metadata=self.dataset_resource.metadata,
+                 files={
+                     file_id: task_summary.record_load_file(
+                         lambda: load_file(file_resource, dataset=None),
+                         metadata={"file_id": file_id},
+                     )
+                     for file_id, file_resource in self.dataset_resource.files.items()
+                 },
+                 revision_source=revision_source,
+             )
+
+             task_summary.set_stats_from_revision(revision)
+
+         return task_summary
+
+     def __repr__(self):
+         return f"CreateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
+
+
+ class IngestionJob:
+     def __init__(
+         self,
+         ingestion_job_id: str,
+         ingestion_plan: IngestionPlan,
+         selector: Selector,
+     ):
+         self.ingestion_job_id = ingestion_job_id
+         self.ingestion_plan = ingestion_plan
+         self.selector = selector
+
+     def execute(
+         self, store: DatasetStore, task_executor: TaskExecutor
+     ) -> IngestionJobSummary:
+         with IngestionJobSummary.new(ingestion_job=self) as ingestion_job_summary:
+             with ingestion_job_summary.record_timing("get_dataset_collection"):
+                 dataset_collection_metadata = store.get_dataset_collection(
+                     dataset_type=self.ingestion_plan.dataset_type,
+                     data_spec_versions=self.selector.data_spec_versions,
+                     selector=self.selector,
+                     metadata_only=True,
+                 ).metadata
+
+             # There are two different, but similar flows here:
+             # 1. The discover_datasets returns a list, and the entire list can be processed at once
+             # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
+             with ingestion_job_summary.record_timing("find_datasets"):
+                 # Timing might be incorrect as it is an iterator
+                 datasets = self.ingestion_plan.source.find_datasets(
+                     dataset_type=self.ingestion_plan.dataset_type,
+                     data_spec_versions=self.selector.data_spec_versions,
+                     dataset_collection_metadata=dataset_collection_metadata,
+                     **self.selector.custom_attributes,
+                 )
+
+                 batches = to_batches(datasets)
+
+             with ingestion_job_summary.record_timing("tasks"):
+                 for batch in batches:
+                     dataset_identifiers = [
+                         Identifier.create_from_selector(
+                             self.selector, **dataset_resource.dataset_resource_id
+                         )
+                         # We have to pass the data_spec_versions here as a Source can add some
+                         # extra data to the identifier which is retrieved in a certain data format
+                         for dataset_resource in batch
+                     ]
+
+                     # Load all available datasets based on the discovered dataset identifiers
+                     dataset_collection = store.get_dataset_collection(
+                         dataset_type=self.ingestion_plan.dataset_type,
+                         # Assume all DatasetResources share the same provider
+                         provider=batch[0].provider,
+                         selector=dataset_identifiers,
+                     )
+
+                     skip_count = 0
+
+                     task_set = TaskSet()
+                     for dataset_resource in batch:
+                         dataset_identifier = Identifier.create_from_selector(
+                             self.selector, **dataset_resource.dataset_resource_id
+                         )
+
+                         if dataset := dataset_collection.get(dataset_identifier):
+                             if self.ingestion_plan.fetch_policy.should_refetch(
+                                 dataset, dataset_resource
+                             ):
+                                 task_set.add(
+                                     UpdateDatasetTask(
+                                         dataset=dataset,  # Current dataset from the database
+                                         dataset_resource=dataset_resource,  # Most recent dataset_resource
+                                         store=store,
+                                     )
+                                 )
+                             else:
+                                 skip_count += 1
+                         else:
+                             if self.ingestion_plan.fetch_policy.should_fetch(
+                                 dataset_resource
+                             ):
+                                 task_set.add(
+                                     CreateDatasetTask(
+                                         dataset_resource=dataset_resource,
+                                         store=store,
+                                     )
+                                 )
+                             else:
+                                 skip_count += 1
+
+                     if task_set:
+                         logger.info(
+                             f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                             f"using selector {self.selector} => {len(task_set)} tasks. {skip_count} skipped."
+                         )
+                         logger.info(f"Running {len(task_set)} tasks")
+                         ingestion_job_summary.add_task_summaries(
+                             task_executor.run(run_task, task_set)
+                         )
+                     else:
+                         logger.info(
+                             f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                             f"using selector {self.selector} => nothing to do"
+                         )
+
+         return ingestion_job_summary
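
The to_batches helper in the new ingestion_job.py normalizes whatever a source's find_datasets returns: a plain list becomes a single batch, an iterator of single items gets chunked, and an iterator that already yields lists is passed through unchanged. The standalone sketch below reproduces that peek-and-rechain pattern for illustration only; _chunk is a hypothetical stand-in for ingestify.utils.chunker, whose implementation is not shown in this diff.

import itertools
from typing import Iterable, Iterator, List, Union

DEFAULT_CHUNK_SIZE = 1000


def _chunk(iterable: Iterable, size: int) -> Iterator[List]:
    # Hypothetical stand-in for ingestify.utils.chunker: yield fixed-size lists.
    iterator = iter(iterable)
    while chunk := list(itertools.islice(iterator, size)):
        yield chunk


def to_batches(input_: Union[List, Iterator]) -> Iterable[List]:
    if isinstance(input_, list):
        return [input_]  # one batch containing everything

    # Peek at the first element to learn whether the iterator already yields batches.
    try:
        peek = next(input_)
    except StopIteration:
        return []  # nothing to batch

    # Put the peeked element back in front of the stream.
    input_ = itertools.chain([peek], input_)

    if isinstance(peek, list):
        return input_  # the source already yields batches
    return _chunk(input_, DEFAULT_CHUNK_SIZE)  # chunk single items into batches


if __name__ == "__main__":
    print(list(to_batches([1, 2, 3])))            # [[1, 2, 3]]
    print(list(to_batches(iter(range(5)))))       # [[0, 1, 2, 3, 4]]
    print(list(to_batches(iter([[1, 2], [3]]))))  # [[1, 2], [3]]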

ingestify/domain/models/ingestion/ingestion_job_summary.py (new file)
@@ -0,0 +1,106 @@
+ from contextlib import contextmanager
+ from datetime import datetime, timedelta
+ from typing import Optional, List, TYPE_CHECKING
+ from pydantic import Field
+
+ from ingestify.domain import Selector, DataSpecVersionCollection
+ from ingestify.domain.models.base import BaseModel
+ from ingestify.domain.models.task.task_summary import TaskSummary, TaskStatus
+ from ingestify.domain.models.timing import Timing
+ from ingestify.utils import utcnow
+
+ if TYPE_CHECKING:
+     from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
+
+
+ def format_duration(duration: timedelta):
+     return f"{duration.total_seconds():.2}sec"
+
+
+ class IngestionJobSummary(BaseModel):
+     ingestion_job_id: str
+
+     # From the IngestionPlan
+     source_name: str
+     dataset_type: str
+     data_spec_versions: DataSpecVersionCollection
+     selector: Selector
+
+     started_at: datetime = Field(default_factory=utcnow)
+     finished_at: Optional[datetime] = None
+     timings: List[Timing] = Field(default_factory=list)
+     task_summaries: List[TaskSummary] = Field(default_factory=list)
+
+     failed_tasks: int = 0
+     successful_tasks: int = 0
+     ignored_successful_tasks: int = 0
+
+     @classmethod
+     def new(cls, ingestion_job: "IngestionJob"):
+         args = dict(
+             ingestion_job_id=ingestion_job.ingestion_job_id,
+             source_name=ingestion_job.ingestion_plan.source.name,
+             dataset_type=ingestion_job.ingestion_plan.dataset_type,
+             data_spec_versions=ingestion_job.ingestion_plan.data_spec_versions,
+             selector=ingestion_job.selector,
+         )
+         return cls(**args)
+
+     @contextmanager
+     def record_timing(self, name: str):
+         start = utcnow()
+         yield
+         self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+     def add_task_summaries(self, task_summaries: List[TaskSummary]):
+         self.task_summaries.extend(task_summaries)
+
+     def set_finished(self):
+         self.failed_tasks = len(
+             [task for task in self.task_summaries if task.status == TaskStatus.FAILED]
+         )
+         self.successful_tasks = len(
+             [task for task in self.task_summaries if task.status == TaskStatus.FINISHED]
+         )
+         self.ignored_successful_tasks = len(
+             [
+                 task
+                 for task in self.task_summaries
+                 if task.status == TaskStatus.FINISHED_IGNORED
+             ]
+         )
+         self.finished_at = utcnow()
+
+     @property
+     def duration(self) -> timedelta:
+         return self.finished_at - self.started_at
+
+     def output_report(self):
+         print(f"\nIngestionJobSummary finished in {format_duration(self.duration)}")
+         print("--------------------")
+         print(f" - IngestionPlan:")
+         print(f" Source: {self.source_name}")
+         print(f" DatasetType: {self.dataset_type}")
+         print(f" - Selector: {self.selector}")
+         print(f" - Timings: ")
+         for timing in self.timings:
+             print(f" - {timing.name}: {format_duration(timing.duration)}")
+         print(
+             f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+         )
+
+         for status in [
+             TaskStatus.FAILED,
+             TaskStatus.FINISHED,
+             TaskStatus.FINISHED_IGNORED,
+         ]:
+             print(
+                 f" - {status.value.lower()}: {len([task for task in self.task_summaries if task.status == status])}"
+             )
+         print("--------------------")
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         pass
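
record_timing above is a simple wrap-and-append pattern: the context manager takes a start timestamp, yields to the caller, and appends one Timing entry when the block completes. The self-contained sketch below reproduces the pattern outside the package, with a simplified Timing stand-in instead of ingestify's pydantic model; like the original, it records nothing if the block raises.

from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from typing import List


def utcnow() -> datetime:
    return datetime.now(timezone.utc)


@dataclass
class Timing:
    # Simplified stand-in for ingestify.domain.models.timing.Timing
    name: str
    started_at: datetime
    ended_at: datetime

    @property
    def duration(self) -> timedelta:
        return self.ended_at - self.started_at


@dataclass
class TimedSummary:
    timings: List[Timing] = field(default_factory=list)

    @contextmanager
    def record_timing(self, name: str):
        # Measure the wall-clock time of the enclosed block and store it.
        start = utcnow()
        yield
        self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))


if __name__ == "__main__":
    summary = TimedSummary()
    with summary.record_timing("find_datasets"):
        sum(range(1_000_000))  # stand-in for real work
    print([(t.name, t.duration) for t in summary.timings])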

ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py}
@@ -1,15 +1,15 @@
- from dataclasses import dataclass
  from typing import List

  from ingestify.domain.models import Source, Selector
+ from ingestify.domain.models.base import BaseModel
  from ingestify.domain.models.data_spec_version_collection import (
      DataSpecVersionCollection,
  )
  from ingestify.domain.models.fetch_policy import FetchPolicy


- @dataclass
- class ExtractJob:
+ class IngestionPlan(BaseModel):
+
      source: Source
      selectors: List[Selector]
      fetch_policy: FetchPolicy
@@ -17,7 +17,7 @@ class ExtractJob:
      data_spec_versions: DataSpecVersionCollection

      def __repr__(self):
-         return f'<ExtractJob source="{self.source.name}" dataset_type="{self.dataset_type}">'
+         return f'<IngestionPlan source="{self.source.name}" dataset_type="{self.dataset_type}">'

      def __str__(self):
          return repr(self)

ingestify/domain/models/resources/dataset_resource.py
@@ -1,35 +1,40 @@
- from dataclasses import dataclass, field
+ from dataclasses import dataclass
  from datetime import datetime
- from typing import Optional, Callable, TYPE_CHECKING
+ from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING  # noqa
+ from pydantic import Field

+ from ingestify.domain.models.base import BaseModel
+ from ingestify.domain.models.dataset.dataset_state import DatasetState
  from ingestify.exceptions import DuplicateFile

- if TYPE_CHECKING:
-     from ingestify.domain import DraftFile, File
-     from ingestify.domain.models.dataset.dataset import DatasetState
+ from ingestify.domain.models import File, DraftFile


- @dataclass(frozen=True)
- class FileResource:
+ class FileLoaderProtocol(Protocol):
+     def __call__(
+         self,
+         file_resource: "FileResource",
+         file: Optional["File"] = None,
+         **kwargs: Any,
+     ) -> Optional["DraftFile"]:
+         ...
+
+
+ class FileResource(BaseModel):
      dataset_resource: "DatasetResource"
      file_id: str
      last_modified: datetime
      data_feed_key: str
      data_spec_version: str
-
-     # DataSerializationFormat is "json" in case of json_content, otherwise file_loader will return it
-     # data_serialization_format: str
-
      json_content: Optional[dict] = None
-
      url: Optional[str] = None
      http_options: Optional[dict] = None
+     # DataSerializationFormat is "json" in case of json_content, otherwise file_loader will return it
      data_serialization_format: Optional[str] = None
-
      file_loader: Optional[
          Callable[["FileResource", Optional["File"]], Optional["DraftFile"]]
      ] = None
-     loader_kwargs: dict = field(default_factory=dict)
+     loader_kwargs: dict = Field(default_factory=dict)

      def __post_init__(self):
          if self.json_content is None and not self.url and not self.file_loader:
@@ -38,27 +43,14 @@ class FileResource:
              )


- class DatasetResource:
-     def __init__(
-         self,
-         dataset_resource_id: dict,
-         /,
-         dataset_type: str,
-         provider: str,
-         name: str,
-         metadata: Optional[dict] = None,
-         state: Optional["DatasetState"] = None,
-     ):
-         from ingestify.domain.models.dataset.dataset import DatasetState
-
-         self.dataset_type = dataset_type
-         self.provider = provider
-         self.dataset_resource_id = dataset_resource_id
-         self.name = name
-         self.metadata = metadata or {}
-         self.state = state or DatasetState.COMPLETE
-
-         self.files = {}
+ class DatasetResource(BaseModel):
+     dataset_resource_id: dict
+     dataset_type: str
+     provider: str
+     name: str
+     metadata: dict = Field(default_factory=dict)
+     state: DatasetState = Field(default_factory=lambda: DatasetState.COMPLETE)
+     files: dict[str, FileResource] = Field(default_factory=dict)

      def add_file(
          self,
@@ -72,8 +64,8 @@ class DatasetResource:
          data_serialization_format: Optional[str] = None,
          file_loader: Optional[
              Callable[
-                 ["FileResource", Optional["File"]],
-                 Optional["DraftFile"],
+                 [FileResource, Optional[File]],
+                 Optional[DraftFile],
              ]
          ] = None,
          loader_kwargs: Optional[dict] = None,
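
The new FileLoaderProtocol spells out the callable that FileResource.file_loader expects: it receives the FileResource, optionally the currently stored File, plus any loader_kwargs, and returns a DraftFile, or None when there is nothing new. A minimal structural-typing sketch, with placeholder classes standing in for the real ingestify types:

from typing import Any, Optional, Protocol


class File:
    ...  # placeholder for ingestify's stored File


class DraftFile:
    ...  # placeholder for ingestify's DraftFile


class FileResource:
    ...  # placeholder for ingestify's FileResource


class FileLoaderProtocol(Protocol):
    def __call__(
        self,
        file_resource: FileResource,
        file: Optional[File] = None,
        **kwargs: Any,
    ) -> Optional[DraftFile]:
        ...


def my_loader(
    file_resource: FileResource,
    file: Optional[File] = None,
    **kwargs: Any,
) -> Optional[DraftFile]:
    # A loader may return None to signal "no new content for this file".
    return None


loader: FileLoaderProtocol = my_loader  # satisfies the protocol structurally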

ingestify/domain/models/sink.py
@@ -1,16 +1,9 @@
  from abc import ABC, abstractmethod

- from ingestify.utils import ComponentFactory, ComponentRegistry
-
  from .dataset import Dataset

- sink_registry = ComponentRegistry()
-

- class Sink(ABC, metaclass=sink_registry.metaclass):
+ class Sink(ABC):
      @abstractmethod
      def upsert(self, dataset: Dataset, data, params: dict):
          pass
-
-
- sink_factory = ComponentFactory.build_factory(Sink, sink_registry)
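
With the registry metaclass and factory removed, Sink is now a plain ABC, so an implementation only has to subclass it and provide upsert. A hypothetical example for illustration (PrintSink and the placeholder Dataset are not part of the package):

from abc import ABC, abstractmethod


class Dataset:
    ...  # placeholder for ingestify's Dataset


class Sink(ABC):
    @abstractmethod
    def upsert(self, dataset: Dataset, data, params: dict):
        pass


class PrintSink(Sink):
    # Hypothetical sink that just prints what it receives.
    def upsert(self, dataset: Dataset, data, params: dict):
        print(f"upsert {dataset} with params {params}")


PrintSink().upsert(Dataset(), data=[], params={})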

ingestify/domain/models/task/task.py
@@ -1,7 +1,9 @@
  from abc import ABC, abstractmethod

+ from .task_summary import TaskSummary
+

  class Task(ABC):
      @abstractmethod
-     def run(self):
+     def run(self) -> TaskSummary:
          pass

ingestify/domain/models/task/task_summary.py (new file)
@@ -0,0 +1,118 @@
+ import logging
+ import traceback
+ from contextlib import contextmanager
+ from datetime import datetime
+ from enum import Enum
+ from typing import Optional, List
+ from pydantic import Field, field_validator
+
+ from ingestify.domain.models.base import BaseModel
+ from ingestify.domain.models.dataset.identifier import Identifier
+ from ingestify.domain.models.timing import Timing
+ from ingestify.exceptions import IngestifyError
+ from ingestify.utils import utcnow
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class TaskStatus(str, Enum):
+     RUNNING = "RUNNING"
+     FINISHED = "FINISHED"
+     FINISHED_IGNORED = "FINISHED_IGNORED"  # Finished, but didn't produce any new data
+     FAILED = "FAILED"
+
+
+ class Operation(str, Enum):
+     CREATE = "CREATE"
+     UPDATE = "UPDATE"
+
+
+ class TaskSummary(BaseModel):
+     task_id: str
+     started_at: datetime
+     operation: Operation
+     dataset_identifier: Identifier
+     ended_at: Optional[datetime] = None
+     persisted_file_count: int = 0
+     bytes_retrieved: int = 0
+     last_modified: Optional[datetime] = None
+     status: TaskStatus = TaskStatus.RUNNING
+     timings: List[Timing] = Field(default_factory=list)
+
+     @field_validator("dataset_identifier", mode="before")
+     @classmethod
+     def ensure_list(cls, value) -> Identifier:
+         if not isinstance(value, Identifier):
+             return Identifier(**value)
+         return value
+
+     def record_load_file(self, fn, metadata: dict):
+         start = utcnow()
+         try:
+             result = None
+             return fn()
+         except Exception as e:
+             result = {
+                 "type": type(e).__name__,
+                 "message": str(e),
+                 "traceback": traceback.format_exc(),
+             }
+             raise e
+         finally:
+             metadata = dict(result=result, **metadata)
+             self.timings.append(
+                 Timing(
+                     name=f"Load of {metadata.get('file_id', 'file')}",
+                     started_at=start,
+                     ended_at=utcnow(),
+                     metadata=metadata,
+                 )
+             )
+
+     @classmethod
+     @contextmanager
+     def new(cls, task_id: str, operation: Operation, dataset_identifier: Identifier):
+         start = utcnow()
+         task_summary = cls(
+             task_id=task_id,
+             started_at=start,
+             operation=operation,
+             dataset_identifier=dataset_identifier,
+         )
+         try:
+             yield task_summary
+
+             task_summary.set_status(TaskStatus.FINISHED)
+         except Exception as e:
+             logger.exception(f"Failed to execute task.")
+             task_summary.set_status(TaskStatus.FAILED)
+
+             # When the error comes from our own code, make sure it will be raised to the highest level
+             # raise
+             if isinstance(e, IngestifyError):
+                 raise
+         finally:
+             task_summary.ended_at = utcnow()
+
+     @classmethod
+     def update(cls, task_id: str, dataset_identifier: Identifier):
+         return cls.new(task_id, Operation.UPDATE, dataset_identifier)
+
+     @classmethod
+     def create(cls, task_id: str, dataset_identifier: Identifier):
+         return cls.new(task_id, Operation.CREATE, dataset_identifier)
+
+     def set_stats_from_revision(self, revision: Optional["Revision"]):
+         if revision:
+             self.persisted_file_count = len(revision.modified_files)
+             self.bytes_retrieved = sum(file.size for file in revision.modified_files)
+             self.last_modified = max(
+                 file.modified_at for file in revision.modified_files
+             )
+         else:
+             self.status = TaskStatus.FINISHED_IGNORED
+
+     def set_status(self, status: TaskStatus):
+         if self.status == TaskStatus.RUNNING:
+             self.status = status
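
TaskSummary.new combines @classmethod with @contextmanager so a task run can be wrapped in a with block: the summary is yielded, marked FINISHED or FAILED depending on whether the block raised, and stamped with ended_at in the finally clause. The simplified, self-contained sketch below shows only that pattern; unlike the real TaskSummary.new it swallows every exception instead of re-raising IngestifyError, and MiniTaskSummary is an illustrative stand-in, not the package's class.

import logging
from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime, timezone
from enum import Enum
from typing import Optional

logger = logging.getLogger(__name__)


def utcnow() -> datetime:
    return datetime.now(timezone.utc)


class TaskStatus(str, Enum):
    RUNNING = "RUNNING"
    FINISHED = "FINISHED"
    FAILED = "FAILED"


@dataclass
class MiniTaskSummary:
    # Simplified stand-in for ingestify's TaskSummary
    task_id: str
    started_at: datetime
    ended_at: Optional[datetime] = None
    status: TaskStatus = TaskStatus.RUNNING

    @classmethod
    @contextmanager
    def start(cls, task_id: str):
        # Factory + context manager: yield the summary, then record the outcome.
        summary = cls(task_id=task_id, started_at=utcnow())
        try:
            yield summary
            summary.status = TaskStatus.FINISHED
        except Exception:
            logger.exception("Task failed")
            summary.status = TaskStatus.FAILED
        finally:
            summary.ended_at = utcnow()


with MiniTaskSummary.start("task-1") as task_summary:
    pass  # the task body goes here

print(task_summary.status, task_summary.ended_at is not None)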

ingestify/domain/models/timing.py (new file)
@@ -0,0 +1,16 @@
+ from datetime import datetime
+ from typing import Optional, Any
+ from pydantic import BaseModel, ConfigDict
+
+
+ class Timing(BaseModel):
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+     name: str
+     started_at: datetime
+     ended_at: datetime
+     metadata: Optional[dict[str, Any]] = None
+
+     @property
+     def duration(self):
+         return self.ended_at - self.started_at