ingestify 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +44 -24
- ingestify/application/ingestion_engine.py +3 -3
- ingestify/application/loader.py +67 -237
- ingestify/domain/models/__init__.py +1 -6
- ingestify/domain/models/base.py +22 -0
- ingestify/domain/models/data_spec_version_collection.py +6 -0
- ingestify/domain/models/dataset/__init__.py +3 -5
- ingestify/domain/models/dataset/dataset.py +15 -32
- ingestify/domain/models/dataset/dataset_repository.py +1 -15
- ingestify/domain/models/dataset/dataset_state.py +11 -0
- ingestify/domain/models/dataset/events.py +6 -16
- ingestify/domain/models/dataset/file.py +21 -34
- ingestify/domain/models/dataset/file_collection.py +3 -1
- ingestify/domain/models/dataset/file_repository.py +1 -10
- ingestify/domain/models/dataset/revision.py +26 -3
- ingestify/domain/models/event/domain_event.py +8 -4
- ingestify/domain/models/ingestion/__init__.py +0 -0
- ingestify/domain/models/ingestion/ingestion_job.py +292 -0
- ingestify/domain/models/ingestion/ingestion_job_summary.py +106 -0
- ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
- ingestify/domain/models/resources/dataset_resource.py +29 -37
- ingestify/domain/models/sink.py +1 -8
- ingestify/domain/models/task/task.py +3 -1
- ingestify/domain/models/task/task_summary.py +118 -0
- ingestify/domain/models/timing.py +16 -0
- ingestify/infra/fetch/http.py +5 -0
- ingestify/infra/source/statsbomb_github.py +67 -54
- ingestify/infra/store/dataset/__init__.py +0 -2
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +184 -4
- ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -22
- ingestify/main.py +42 -22
- ingestify/utils.py +15 -78
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/METADATA +2 -1
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/RECORD +38 -32
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/WHEEL +1 -1
- ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/top_level.txt +0 -0
ingestify/domain/models/ingestion/ingestion_job.py
@@ -0,0 +1,292 @@
+import itertools
+import json
+import logging
+import uuid
+from typing import Optional
+
+from ingestify import retrieve_http
+from ingestify.application.dataset_store import DatasetStore
+from ingestify.domain import Selector, Identifier, TaskSet, Dataset, DraftFile, Task
+from ingestify.domain.models.dataset.revision import RevisionSource, SourceType
+from ingestify.domain.models.ingestion.ingestion_job_summary import (
+    IngestionJobSummary,
+)
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+from ingestify.domain.models.resources.dataset_resource import (
+    FileResource,
+    DatasetResource,
+)
+from ingestify.domain.models.task.task_summary import TaskSummary
+from ingestify.utils import TaskExecutor, chunker
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_CHUNK_SIZE = 1000
+
+
+def run_task(task):
+    logger.info(f"Running task {task}")
+    return task.run()
+
+
+def to_batches(input_):
+    if isinstance(input_, list):
+        batches = [input_]
+    else:
+        # Assume it's an iterator. Peek what's inside, and put it back
+        try:
+            peek = next(input_)
+        except StopIteration:
+            # Nothing to batch
+            return []
+
+        input_ = itertools.chain([peek], input_)
+
+        if not isinstance(peek, list):
+            batches = chunker(input_, DEFAULT_CHUNK_SIZE)
+        else:
+            batches = input_
+    return batches
+
+
+def load_file(
+    file_resource: FileResource, dataset: Optional[Dataset] = None
+) -> Optional[DraftFile]:
+    current_file = None
+    if dataset:
+        current_file = dataset.current_revision.modified_files_map.get(
+            file_resource.file_id
+        )
+
+    if file_resource.json_content is not None:
+        # Empty dictionary is allowed
+        file = DraftFile.from_input(
+            file_=json.dumps(file_resource.json_content, indent=4),
+            data_serialization_format="json",
+            data_feed_key=file_resource.data_feed_key,
+            data_spec_version=file_resource.data_spec_version,
+            modified_at=file_resource.last_modified,
+        )
+        if current_file and current_file.tag == file.tag:
+            # Nothing changed
+            return None
+        return file
+    elif file_resource.url:
+        http_options = {}
+        if file_resource.http_options:
+            for k, v in file_resource.http_options.items():
+                http_options[f"http_{k}"] = v
+
+        return retrieve_http(
+            url=file_resource.url,
+            current_file=current_file,
+            file_data_feed_key=file_resource.data_feed_key,
+            file_data_spec_version=file_resource.data_spec_version,
+            file_data_serialization_format=file_resource.data_serialization_format
+            or "txt",
+            last_modified=file_resource.last_modified,
+            **http_options,
+            **file_resource.loader_kwargs,
+        )
+    else:
+        return file_resource.file_loader(
+            file_resource,
+            current_file,
+            # TODO: check how to fix this with typehints
+            **file_resource.loader_kwargs,
+        )
+
+
+class UpdateDatasetTask(Task):
+    def __init__(
+        self,
+        dataset: Dataset,
+        dataset_resource: DatasetResource,
+        store: DatasetStore,
+    ):
+        self.dataset = dataset
+        self.dataset_resource = dataset_resource
+        self.store = store
+        self.task_id = str(uuid.uuid1())
+
+    def run(self):
+        dataset_identifier = Identifier(**self.dataset_resource.dataset_resource_id)
+
+        revision_source = RevisionSource(
+            source_id=self.task_id, source_type=SourceType.TASK
+        )
+
+        with TaskSummary.update(
+            self.task_id, dataset_identifier=dataset_identifier
+        ) as task_summary:
+            revision = self.store.update_dataset(
+                dataset=self.dataset,
+                name=self.dataset_resource.name,
+                state=self.dataset_resource.state,
+                metadata=self.dataset_resource.metadata,
+                files={
+                    file_id: task_summary.record_load_file(
+                        lambda: load_file(file_resource, dataset=self.dataset),
+                        metadata={"file_id": file_id},
+                    )
+                    for file_id, file_resource in self.dataset_resource.files.items()
+                },
+                revision_source=revision_source,
+            )
+            task_summary.set_stats_from_revision(revision)
+
+        return task_summary
+
+    def __repr__(self):
+        return f"UpdateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
+
+
+class CreateDatasetTask(Task):
+    def __init__(
+        self,
+        dataset_resource: DatasetResource,
+        store: DatasetStore,
+    ):
+        self.dataset_resource = dataset_resource
+        self.store = store
+        self.task_id = str(uuid.uuid1())
+
+    def run(self):
+        dataset_identifier = Identifier(**self.dataset_resource.dataset_resource_id)
+        revision_source = RevisionSource(
+            source_id=self.task_id, source_type=SourceType.TASK
+        )
+
+        with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
+            revision = self.store.create_dataset(
+                dataset_type=self.dataset_resource.dataset_type,
+                provider=self.dataset_resource.provider,
+                dataset_identifier=dataset_identifier,
+                name=self.dataset_resource.name,
+                state=self.dataset_resource.state,
+                metadata=self.dataset_resource.metadata,
+                files={
+                    file_id: task_summary.record_load_file(
+                        lambda: load_file(file_resource, dataset=None),
+                        metadata={"file_id": file_id},
+                    )
+                    for file_id, file_resource in self.dataset_resource.files.items()
+                },
+                revision_source=revision_source,
+            )
+
+            task_summary.set_stats_from_revision(revision)
+
+        return task_summary
+
+    def __repr__(self):
+        return f"CreateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
+
+
+class IngestionJob:
+    def __init__(
+        self,
+        ingestion_job_id: str,
+        ingestion_plan: IngestionPlan,
+        selector: Selector,
+    ):
+        self.ingestion_job_id = ingestion_job_id
+        self.ingestion_plan = ingestion_plan
+        self.selector = selector
+
+    def execute(
+        self, store: DatasetStore, task_executor: TaskExecutor
+    ) -> IngestionJobSummary:
+        with IngestionJobSummary.new(ingestion_job=self) as ingestion_job_summary:
+            with ingestion_job_summary.record_timing("get_dataset_collection"):
+                dataset_collection_metadata = store.get_dataset_collection(
+                    dataset_type=self.ingestion_plan.dataset_type,
+                    data_spec_versions=self.selector.data_spec_versions,
+                    selector=self.selector,
+                    metadata_only=True,
+                ).metadata
+
+            # There are two different, but similar flows here:
+            # 1. The discover_datasets returns a list, and the entire list can be processed at once
+            # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
+            with ingestion_job_summary.record_timing("find_datasets"):
+                # Timing might be incorrect as it is an iterator
+                datasets = self.ingestion_plan.source.find_datasets(
+                    dataset_type=self.ingestion_plan.dataset_type,
+                    data_spec_versions=self.selector.data_spec_versions,
+                    dataset_collection_metadata=dataset_collection_metadata,
+                    **self.selector.custom_attributes,
+                )
+
+            batches = to_batches(datasets)
+
+            with ingestion_job_summary.record_timing("tasks"):
+                for batch in batches:
+                    dataset_identifiers = [
+                        Identifier.create_from_selector(
+                            self.selector, **dataset_resource.dataset_resource_id
+                        )
+                        # We have to pass the data_spec_versions here as a Source can add some
+                        # extra data to the identifier which is retrieved in a certain data format
+                        for dataset_resource in batch
+                    ]
+
+                    # Load all available datasets based on the discovered dataset identifiers
+                    dataset_collection = store.get_dataset_collection(
+                        dataset_type=self.ingestion_plan.dataset_type,
+                        # Assume all DatasetResources share the same provider
+                        provider=batch[0].provider,
+                        selector=dataset_identifiers,
+                    )
+
+                    skip_count = 0
+
+                    task_set = TaskSet()
+                    for dataset_resource in batch:
+                        dataset_identifier = Identifier.create_from_selector(
+                            self.selector, **dataset_resource.dataset_resource_id
+                        )
+
+                        if dataset := dataset_collection.get(dataset_identifier):
+                            if self.ingestion_plan.fetch_policy.should_refetch(
+                                dataset, dataset_resource
+                            ):
+                                task_set.add(
+                                    UpdateDatasetTask(
+                                        dataset=dataset,  # Current dataset from the database
+                                        dataset_resource=dataset_resource,  # Most recent dataset_resource
+                                        store=store,
+                                    )
+                                )
+                            else:
+                                skip_count += 1
+                        else:
+                            if self.ingestion_plan.fetch_policy.should_fetch(
+                                dataset_resource
+                            ):
+                                task_set.add(
+                                    CreateDatasetTask(
+                                        dataset_resource=dataset_resource,
+                                        store=store,
+                                    )
+                                )
+                            else:
+                                skip_count += 1
+
+                    if task_set:
+                        logger.info(
+                            f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                            f"using selector {self.selector} => {len(task_set)} tasks. {skip_count} skipped."
+                        )
+                        logger.info(f"Running {len(task_set)} tasks")
+                        ingestion_job_summary.add_task_summaries(
+                            task_executor.run(run_task, task_set)
+                        )
+                    else:
+                        logger.info(
+                            f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                            f"using selector {self.selector} => nothing to do"
+                        )
+
+        return ingestion_job_summary
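Note: the batching contract handled by `to_batches` above distinguishes three input shapes: a plain list (treated as a single batch), an iterator of items (chunked into batches of DEFAULT_CHUNK_SIZE), and an iterator that already yields lists (each list used as a batch). A minimal standalone sketch of that behavior, using a local `chunker` stand-in rather than the real `ingestify.utils.chunker`:

import itertools

DEFAULT_CHUNK_SIZE = 1000


def chunker(iterable, size):
    # Local stand-in for ingestify.utils.chunker: yields lists of up to `size` items.
    it = iter(iterable)
    while chunk := list(itertools.islice(it, size)):
        yield chunk


def to_batches(input_):
    # Same branching as the new ingestion_job.to_batches above.
    if isinstance(input_, list):
        return [input_]
    try:
        peek = next(input_)
    except StopIteration:
        return []
    input_ = itertools.chain([peek], input_)
    if not isinstance(peek, list):
        return chunker(input_, DEFAULT_CHUNK_SIZE)
    return input_


print(list(to_batches([1, 2, 3])))                      # [[1, 2, 3]]
print([len(b) for b in to_batches(iter(range(2500)))])  # [1000, 1000, 500]
print(list(to_batches(iter([[1, 2], [3]]))))            # [[1, 2], [3]]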
ingestify/domain/models/ingestion/ingestion_job_summary.py
@@ -0,0 +1,106 @@
+from contextlib import contextmanager
+from datetime import datetime, timedelta
+from typing import Optional, List, TYPE_CHECKING
+from pydantic import Field
+
+from ingestify.domain import Selector, DataSpecVersionCollection
+from ingestify.domain.models.base import BaseModel
+from ingestify.domain.models.task.task_summary import TaskSummary, TaskStatus
+from ingestify.domain.models.timing import Timing
+from ingestify.utils import utcnow
+
+if TYPE_CHECKING:
+    from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
+
+
+def format_duration(duration: timedelta):
+    return f"{duration.total_seconds():.2}sec"
+
+
+class IngestionJobSummary(BaseModel):
+    ingestion_job_id: str
+
+    # From the IngestionPlan
+    source_name: str
+    dataset_type: str
+    data_spec_versions: DataSpecVersionCollection
+    selector: Selector
+
+    started_at: datetime = Field(default_factory=utcnow)
+    finished_at: Optional[datetime] = None
+    timings: List[Timing] = Field(default_factory=list)
+    task_summaries: List[TaskSummary] = Field(default_factory=list)
+
+    failed_tasks: int = 0
+    successful_tasks: int = 0
+    ignored_successful_tasks: int = 0
+
+    @classmethod
+    def new(cls, ingestion_job: "IngestionJob"):
+        args = dict(
+            ingestion_job_id=ingestion_job.ingestion_job_id,
+            source_name=ingestion_job.ingestion_plan.source.name,
+            dataset_type=ingestion_job.ingestion_plan.dataset_type,
+            data_spec_versions=ingestion_job.ingestion_plan.data_spec_versions,
+            selector=ingestion_job.selector,
+        )
+        return cls(**args)
+
+    @contextmanager
+    def record_timing(self, name: str):
+        start = utcnow()
+        yield
+        self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+    def add_task_summaries(self, task_summaries: List[TaskSummary]):
+        self.task_summaries.extend(task_summaries)
+
+    def set_finished(self):
+        self.failed_tasks = len(
+            [task for task in self.task_summaries if task.status == TaskStatus.FAILED]
+        )
+        self.successful_tasks = len(
+            [task for task in self.task_summaries if task.status == TaskStatus.FINISHED]
+        )
+        self.ignored_successful_tasks = len(
+            [
+                task
+                for task in self.task_summaries
+                if task.status == TaskStatus.FINISHED_IGNORED
+            ]
+        )
+        self.finished_at = utcnow()
+
+    @property
+    def duration(self) -> timedelta:
+        return self.finished_at - self.started_at
+
+    def output_report(self):
+        print(f"\nIngestionJobSummary finished in {format_duration(self.duration)}")
+        print("--------------------")
+        print(f" - IngestionPlan:")
+        print(f"   Source: {self.source_name}")
+        print(f"   DatasetType: {self.dataset_type}")
+        print(f" - Selector: {self.selector}")
+        print(f" - Timings: ")
+        for timing in self.timings:
+            print(f"   - {timing.name}: {format_duration(timing.duration)}")
+        print(
+            f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+        )
+
+        for status in [
+            TaskStatus.FAILED,
+            TaskStatus.FINISHED,
+            TaskStatus.FINISHED_IGNORED,
+        ]:
+            print(
+                f"   - {status.value.lower()}: {len([task for task in self.task_summaries if task.status == status])}"
+            )
+        print("--------------------")
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
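Note: the phase timings in the summary come from the `record_timing` context manager added above. A minimal standalone sketch of that pattern (assuming pydantic v2 is installed; `utcnow`, `Timing`, and `PhaseTimer` here are local stand-ins, not the actual ingestify classes):

from contextlib import contextmanager
from datetime import datetime, timezone
from typing import List
from pydantic import BaseModel, Field


def utcnow() -> datetime:
    # Stand-in for ingestify.utils.utcnow (assumed to return an aware UTC datetime).
    return datetime.now(timezone.utc)


class Timing(BaseModel):
    # Trimmed-down analogue of the new ingestify.domain.models.timing.Timing.
    name: str
    started_at: datetime
    ended_at: datetime

    @property
    def duration(self):
        return self.ended_at - self.started_at


class PhaseTimer(BaseModel):
    # Mirrors how IngestionJobSummary.record_timing accumulates Timing entries.
    timings: List[Timing] = Field(default_factory=list)

    @contextmanager
    def record_timing(self, name: str):
        start = utcnow()
        yield
        self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))


timer = PhaseTimer()
with timer.record_timing("find_datasets"):
    sum(range(1_000_000))  # placeholder for the timed phase
print([(t.name, t.duration) for t in timer.timings])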
ingestify/domain/models/{extract_job.py → ingestion_plan.py}
@@ -1,15 +1,15 @@
-from dataclasses import dataclass
 from typing import List

 from ingestify.domain.models import Source, Selector
+from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.data_spec_version_collection import (
     DataSpecVersionCollection,
 )
 from ingestify.domain.models.fetch_policy import FetchPolicy


-@dataclass
-class ExtractJob:
+class IngestionPlan(BaseModel):
+
     source: Source
     selectors: List[Selector]
     fetch_policy: FetchPolicy
@@ -17,7 +17,7 @@ class ExtractJob:
     data_spec_versions: DataSpecVersionCollection

     def __repr__(self):
-        return f'<
+        return f'<IngestionPlan source="{self.source.name}" dataset_type="{self.dataset_type}">'

     def __str__(self):
         return repr(self)
ingestify/domain/models/resources/dataset_resource.py
@@ -1,35 +1,40 @@
-from dataclasses import dataclass
+from dataclasses import dataclass
 from datetime import datetime
-from typing import Optional, Callable, TYPE_CHECKING
+from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING # noqa
+from pydantic import Field

+from ingestify.domain.models.base import BaseModel
+from ingestify.domain.models.dataset.dataset_state import DatasetState
 from ingestify.exceptions import DuplicateFile

-
-from ingestify.domain import DraftFile, File
-from ingestify.domain.models.dataset.dataset import DatasetState
+from ingestify.domain.models import File, DraftFile


-@dataclass
-class FileResource:
+class FileLoaderProtocol(Protocol):
+    def __call__(
+        self,
+        file_resource: "FileResource",
+        file: Optional["File"] = None,
+        **kwargs: Any,
+    ) -> Optional["DraftFile"]:
+        ...
+
+
+class FileResource(BaseModel):
     dataset_resource: "DatasetResource"
     file_id: str
     last_modified: datetime
     data_feed_key: str
     data_spec_version: str
-
-    # DataSerializationFormat is "json" in case of json_content, otherwise file_loader will return it
-    # data_serialization_format: str
-
     json_content: Optional[dict] = None
-
     url: Optional[str] = None
     http_options: Optional[dict] = None
+    # DataSerializationFormat is "json" in case of json_content, otherwise file_loader will return it
     data_serialization_format: Optional[str] = None
-
     file_loader: Optional[
         Callable[["FileResource", Optional["File"]], Optional["DraftFile"]]
     ] = None
-    loader_kwargs: dict =
+    loader_kwargs: dict = Field(default_factory=dict)

     def __post_init__(self):
         if self.json_content is None and not self.url and not self.file_loader:
@@ -38,27 +43,14 @@ class FileResource:
             )


-class DatasetResource:
-
-
-
-
-
-
-
-        metadata: Optional[dict] = None,
-        state: Optional["DatasetState"] = None,
-    ):
-        from ingestify.domain.models.dataset.dataset import DatasetState
-
-        self.dataset_type = dataset_type
-        self.provider = provider
-        self.dataset_resource_id = dataset_resource_id
-        self.name = name
-        self.metadata = metadata or {}
-        self.state = state or DatasetState.COMPLETE
-
-        self.files = {}
+class DatasetResource(BaseModel):
+    dataset_resource_id: dict
+    dataset_type: str
+    provider: str
+    name: str
+    metadata: dict = Field(default_factory=dict)
+    state: DatasetState = Field(default_factory=lambda: DatasetState.COMPLETE)
+    files: dict[str, FileResource] = Field(default_factory=dict)

     def add_file(
         self,
@@ -72,8 +64,8 @@ class DatasetResource:
         data_serialization_format: Optional[str] = None,
         file_loader: Optional[
             Callable[
-                [
-                    Optional[
+                [FileResource, Optional[File]],
+                Optional[DraftFile],
             ]
         ] = None,
         loader_kwargs: Optional[dict] = None,
ingestify/domain/models/sink.py
CHANGED
@@ -1,16 +1,9 @@
 from abc import ABC, abstractmethod

-from ingestify.utils import ComponentFactory, ComponentRegistry
-
 from .dataset import Dataset

-sink_registry = ComponentRegistry()
-

-class Sink(ABC
+class Sink(ABC):
     @abstractmethod
     def upsert(self, dataset: Dataset, data, params: dict):
         pass
-
-
-sink_factory = ComponentFactory.build_factory(Sink, sink_registry)
ingestify/domain/models/task/task_summary.py
@@ -0,0 +1,118 @@
+import logging
+import traceback
+from contextlib import contextmanager
+from datetime import datetime
+from enum import Enum
+from typing import Optional, List
+from pydantic import Field, field_validator
+
+from ingestify.domain.models.base import BaseModel
+from ingestify.domain.models.dataset.identifier import Identifier
+from ingestify.domain.models.timing import Timing
+from ingestify.exceptions import IngestifyError
+from ingestify.utils import utcnow
+
+
+logger = logging.getLogger(__name__)
+
+
+class TaskStatus(str, Enum):
+    RUNNING = "RUNNING"
+    FINISHED = "FINISHED"
+    FINISHED_IGNORED = "FINISHED_IGNORED"  # Finished, but didn't produce any new data
+    FAILED = "FAILED"
+
+
+class Operation(str, Enum):
+    CREATE = "CREATE"
+    UPDATE = "UPDATE"
+
+
+class TaskSummary(BaseModel):
+    task_id: str
+    started_at: datetime
+    operation: Operation
+    dataset_identifier: Identifier
+    ended_at: Optional[datetime] = None
+    persisted_file_count: int = 0
+    bytes_retrieved: int = 0
+    last_modified: Optional[datetime] = None
+    status: TaskStatus = TaskStatus.RUNNING
+    timings: List[Timing] = Field(default_factory=list)
+
+    @field_validator("dataset_identifier", mode="before")
+    @classmethod
+    def ensure_list(cls, value) -> Identifier:
+        if not isinstance(value, Identifier):
+            return Identifier(**value)
+        return value
+
+    def record_load_file(self, fn, metadata: dict):
+        start = utcnow()
+        try:
+            result = None
+            return fn()
+        except Exception as e:
+            result = {
+                "type": type(e).__name__,
+                "message": str(e),
+                "traceback": traceback.format_exc(),
+            }
+            raise e
+        finally:
+            metadata = dict(result=result, **metadata)
+            self.timings.append(
+                Timing(
+                    name=f"Load of {metadata.get('file_id', 'file')}",
+                    started_at=start,
+                    ended_at=utcnow(),
+                    metadata=metadata,
+                )
+            )
+
+    @classmethod
+    @contextmanager
+    def new(cls, task_id: str, operation: Operation, dataset_identifier: Identifier):
+        start = utcnow()
+        task_summary = cls(
+            task_id=task_id,
+            started_at=start,
+            operation=operation,
+            dataset_identifier=dataset_identifier,
+        )
+        try:
+            yield task_summary
+
+            task_summary.set_status(TaskStatus.FINISHED)
+        except Exception as e:
+            logger.exception(f"Failed to execute task.")
+            task_summary.set_status(TaskStatus.FAILED)
+
+            # When the error comes from our own code, make sure it will be raised to the highest level
+            # raise
+            if isinstance(e, IngestifyError):
+                raise
+        finally:
+            task_summary.ended_at = utcnow()
+
+    @classmethod
+    def update(cls, task_id: str, dataset_identifier: Identifier):
+        return cls.new(task_id, Operation.UPDATE, dataset_identifier)
+
+    @classmethod
+    def create(cls, task_id: str, dataset_identifier: Identifier):
+        return cls.new(task_id, Operation.CREATE, dataset_identifier)
+
+    def set_stats_from_revision(self, revision: Optional["Revision"]):
+        if revision:
+            self.persisted_file_count = len(revision.modified_files)
+            self.bytes_retrieved = sum(file.size for file in revision.modified_files)
+            self.last_modified = max(
+                file.modified_at for file in revision.modified_files
+            )
+        else:
+            self.status = TaskStatus.FINISHED_IGNORED
+
+    def set_status(self, status: TaskStatus):
+        if self.status == TaskStatus.RUNNING:
+            self.status = status
ingestify/domain/models/timing.py
@@ -0,0 +1,16 @@
+from datetime import datetime
+from typing import Optional, Any
+from pydantic import BaseModel, ConfigDict
+
+
+class Timing(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    name: str
+    started_at: datetime
+    ended_at: datetime
+    metadata: Optional[dict[str, Any]] = None
+
+    @property
+    def duration(self):
+        return self.ended_at - self.started_at