ingestify 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +47 -36
- ingestify/application/ingestion_engine.py +3 -3
- ingestify/application/loader.py +71 -241
- ingestify/domain/models/__init__.py +1 -6
- ingestify/domain/models/base.py +22 -0
- ingestify/domain/models/data_spec_version_collection.py +6 -0
- ingestify/domain/models/dataset/__init__.py +3 -5
- ingestify/domain/models/dataset/dataset.py +15 -32
- ingestify/domain/models/dataset/dataset_repository.py +1 -15
- ingestify/domain/models/dataset/dataset_state.py +11 -0
- ingestify/domain/models/dataset/events.py +6 -16
- ingestify/domain/models/dataset/file.py +21 -34
- ingestify/domain/models/dataset/file_collection.py +3 -1
- ingestify/domain/models/dataset/file_repository.py +29 -28
- ingestify/domain/models/dataset/revision.py +26 -3
- ingestify/domain/models/event/domain_event.py +8 -4
- ingestify/domain/models/ingestion/__init__.py +0 -0
- ingestify/domain/models/ingestion/ingestion_job.py +325 -0
- ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
- ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
- ingestify/domain/models/resources/dataset_resource.py +29 -37
- ingestify/domain/models/sink.py +1 -8
- ingestify/domain/models/task/task.py +3 -1
- ingestify/domain/models/task/task_summary.py +118 -0
- ingestify/domain/models/timing.py +16 -0
- ingestify/domain/services/identifier_key_transformer.py +111 -0
- ingestify/infra/fetch/http.py +5 -0
- ingestify/infra/source/statsbomb_github.py +67 -54
- ingestify/infra/store/dataset/__init__.py +0 -2
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +187 -4
- ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
- ingestify/infra/store/file/local_file_repository.py +3 -5
- ingestify/infra/store/file/s3_file_repository.py +4 -9
- ingestify/main.py +64 -25
- ingestify/utils.py +15 -78
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA +2 -1
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD +41 -34
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL +1 -1
- ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0

ingestify/domain/models/ingestion/ingestion_job.py
ADDED

@@ -0,0 +1,325 @@
+import itertools
+import json
+import logging
+import uuid
+from typing import Optional, Iterator
+
+from ingestify import retrieve_http
+from ingestify.application.dataset_store import DatasetStore
+from ingestify.domain import Selector, Identifier, TaskSet, Dataset, DraftFile, Task
+from ingestify.domain.models.dataset.revision import RevisionSource, SourceType
+from ingestify.domain.models.ingestion.ingestion_job_summary import (
+    IngestionJobSummary,
+)
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+from ingestify.domain.models.resources.dataset_resource import (
+    FileResource,
+    DatasetResource,
+)
+from ingestify.domain.models.task.task_summary import TaskSummary
+from ingestify.utils import TaskExecutor, chunker
+
+logger = logging.getLogger(__name__)
+
+
+DEFAULT_CHUNK_SIZE = 1_000
+
+
+def run_task(task):
+    logger.info(f"Running task {task}")
+    return task.run()
+
+
+def to_batches(input_):
+    if isinstance(input_, list):
+        batches = iter(input_)
+    else:
+        # Assume it's an iterator. Peek what's inside, and put it back
+        try:
+            peek = next(input_)
+        except StopIteration:
+            # Nothing to batch
+            return iter([])
+
+        input_ = itertools.chain([peek], input_)
+
+        if not isinstance(peek, list):
+            batches = chunker(input_, DEFAULT_CHUNK_SIZE)
+        else:
+            batches = input_
+    return batches
+
+
+def load_file(
+    file_resource: FileResource, dataset: Optional[Dataset] = None
+) -> Optional[DraftFile]:
+    current_file = None
+    if dataset:
+        current_file = dataset.current_revision.modified_files_map.get(
+            file_resource.file_id
+        )
+
+    if file_resource.json_content is not None:
+        # Empty dictionary is allowed
+        file = DraftFile.from_input(
+            file_=json.dumps(file_resource.json_content, indent=4),
+            data_serialization_format="json",
+            data_feed_key=file_resource.data_feed_key,
+            data_spec_version=file_resource.data_spec_version,
+            modified_at=file_resource.last_modified,
+        )
+        if current_file and current_file.tag == file.tag:
+            # Nothing changed
+            return None
+        return file
+    elif file_resource.url:
+        http_options = {}
+        if file_resource.http_options:
+            for k, v in file_resource.http_options.items():
+                http_options[f"http_{k}"] = v
+
+        return retrieve_http(
+            url=file_resource.url,
+            current_file=current_file,
+            file_data_feed_key=file_resource.data_feed_key,
+            file_data_spec_version=file_resource.data_spec_version,
+            file_data_serialization_format=file_resource.data_serialization_format
+            or "txt",
+            last_modified=file_resource.last_modified,
+            **http_options,
+            **file_resource.loader_kwargs,
+        )
+    else:
+        return file_resource.file_loader(
+            file_resource,
+            current_file,
+            # TODO: check how to fix this with typehints
+            **file_resource.loader_kwargs,
+        )
+
+
+class UpdateDatasetTask(Task):
+    def __init__(
+        self,
+        dataset: Dataset,
+        dataset_resource: DatasetResource,
+        store: DatasetStore,
+    ):
+        self.dataset = dataset
+        self.dataset_resource = dataset_resource
+        self.store = store
+        self.task_id = str(uuid.uuid1())
+
+    def run(self):
+        dataset_identifier = Identifier(**self.dataset_resource.dataset_resource_id)
+
+        revision_source = RevisionSource(
+            source_id=self.task_id, source_type=SourceType.TASK
+        )
+
+        with TaskSummary.update(
+            self.task_id, dataset_identifier=dataset_identifier
+        ) as task_summary:
+            revision = self.store.update_dataset(
+                dataset=self.dataset,
+                name=self.dataset_resource.name,
+                state=self.dataset_resource.state,
+                metadata=self.dataset_resource.metadata,
+                files={
+                    file_id: task_summary.record_load_file(
+                        lambda: load_file(file_resource, dataset=self.dataset),
+                        metadata={"file_id": file_id},
+                    )
+                    for file_id, file_resource in self.dataset_resource.files.items()
+                },
+                revision_source=revision_source,
+            )
+            task_summary.set_stats_from_revision(revision)
+
+        return task_summary
+
+    def __repr__(self):
+        return f"UpdateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
+
+
+class CreateDatasetTask(Task):
+    def __init__(
+        self,
+        dataset_resource: DatasetResource,
+        store: DatasetStore,
+    ):
+        self.dataset_resource = dataset_resource
+        self.store = store
+        self.task_id = str(uuid.uuid1())
+
+    def run(self):
+        dataset_identifier = Identifier(**self.dataset_resource.dataset_resource_id)
+        revision_source = RevisionSource(
+            source_id=self.task_id, source_type=SourceType.TASK
+        )
+
+        with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
+            revision = self.store.create_dataset(
+                dataset_type=self.dataset_resource.dataset_type,
+                provider=self.dataset_resource.provider,
+                dataset_identifier=dataset_identifier,
+                name=self.dataset_resource.name,
+                state=self.dataset_resource.state,
+                metadata=self.dataset_resource.metadata,
+                files={
+                    file_id: task_summary.record_load_file(
+                        lambda: load_file(file_resource, dataset=None),
+                        metadata={"file_id": file_id},
+                    )
+                    for file_id, file_resource in self.dataset_resource.files.items()
+                },
+                revision_source=revision_source,
+            )
+
+            task_summary.set_stats_from_revision(revision)
+
+        return task_summary
+
+    def __repr__(self):
+        return f"CreateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
+
+
+MAX_TASKS_PER_CHUNK = 10_000
+
+
+class IngestionJob:
+    def __init__(
+        self,
+        ingestion_job_id: str,
+        ingestion_plan: IngestionPlan,
+        selector: Selector,
+    ):
+        self.ingestion_job_id = ingestion_job_id
+        self.ingestion_plan = ingestion_plan
+        self.selector = selector
+
+    def execute(
+        self, store: DatasetStore, task_executor: TaskExecutor
+    ) -> Iterator[IngestionJobSummary]:
+        is_first_chunk = True
+        ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
+        # Process all items in batches. Yield a IngestionJobSummary per batch
+
+        logger.info("Finding metadata")
+        with ingestion_job_summary.record_timing("get_dataset_collection"):
+            dataset_collection_metadata = store.get_dataset_collection(
+                dataset_type=self.ingestion_plan.dataset_type,
+                data_spec_versions=self.selector.data_spec_versions,
+                selector=self.selector,
+                metadata_only=True,
+            ).metadata
+        logger.info(f"Done: {dataset_collection_metadata}")
+
+        # There are two different, but similar flows here:
+        # 1. The discover_datasets returns a list, and the entire list can be processed at once
+        # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
+        with ingestion_job_summary.record_timing("find_datasets"):
+            # Timing might be incorrect as it is an iterator
+            dataset_resources = self.ingestion_plan.source.find_datasets(
+                dataset_type=self.ingestion_plan.dataset_type,
+                data_spec_versions=self.selector.data_spec_versions,
+                dataset_collection_metadata=dataset_collection_metadata,
+                **self.selector.custom_attributes,
+            )
+
+        finish_task_timer = ingestion_job_summary.start_timing("tasks")
+
+        batches = to_batches(dataset_resources)
+
+        while True:
+            try:
+                batch = next(batches)
+            except StopIteration:
+                break
+            except Exception:
+                # TODO: handle exception on IngestionJob level
+                raise
+
+            dataset_identifiers = [
+                Identifier.create_from_selector(
+                    self.selector, **dataset_resource.dataset_resource_id
+                )
+                # We have to pass the data_spec_versions here as a Source can add some
+                # extra data to the identifier which is retrieved in a certain data format
+                for dataset_resource in batch
+            ]
+
+            # Load all available datasets based on the discovered dataset identifiers
+            dataset_collection = store.get_dataset_collection(
+                dataset_type=self.ingestion_plan.dataset_type,
+                # Assume all DatasetResources share the same provider
+                provider=batch[0].provider,
+                selector=dataset_identifiers,
+            )
+
+            skipped_datasets = 0
+
+            task_set = TaskSet()
+            for dataset_resource in batch:
+                dataset_identifier = Identifier.create_from_selector(
+                    self.selector, **dataset_resource.dataset_resource_id
+                )
+
+                if dataset := dataset_collection.get(dataset_identifier):
+                    if self.ingestion_plan.fetch_policy.should_refetch(
+                        dataset, dataset_resource
+                    ):
+                        task_set.add(
+                            UpdateDatasetTask(
+                                dataset=dataset,  # Current dataset from the database
+                                dataset_resource=dataset_resource,  # Most recent dataset_resource
+                                store=store,
+                            )
+                        )
+                    else:
+                        skipped_datasets += 1
+                else:
+                    if self.ingestion_plan.fetch_policy.should_fetch(dataset_resource):
+                        task_set.add(
+                            CreateDatasetTask(
+                                dataset_resource=dataset_resource,
+                                store=store,
+                            )
+                        )
+                    else:
+                        skipped_datasets += 1
+
+            if task_set:
+                logger.info(
+                    f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                    f"using selector {self.selector} => {len(task_set)} tasks. {skipped_datasets} skipped."
+                )
+                logger.info(f"Running {len(task_set)} tasks")
+                ingestion_job_summary.add_task_summaries(
+                    task_executor.run(run_task, task_set)
+                )
+            else:
+                logger.info(
+                    f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                    f"using selector {self.selector} => nothing to do"
+                )
+
+            ingestion_job_summary.increase_skipped_datasets(skipped_datasets)
+
+            if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
+                finish_task_timer()
+                ingestion_job_summary.set_finished()
+                yield ingestion_job_summary
+
+                # Start a new one
+                is_first_chunk = False
+                ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
+
+                # We will resume tasks, start timer right away
+                finish_task_timer = ingestion_job_summary.start_timing("tasks")
+
+        if ingestion_job_summary.task_count() > 0 or is_first_chunk:
+            # When there is interesting information to store, or there was no data at all, store it
+            finish_task_timer()
+            ingestion_job_summary.set_finished()
+            yield ingestion_job_summary
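
The new `IngestionJob.execute` streams `DatasetResource`s from the source in batches and yields an `IngestionJobSummary` per chunk of work (cut off once `MAX_TASKS_PER_CHUNK` tasks have accumulated). The batching is done by the small `to_batches` helper above. Below is a minimal, self-contained sketch of that behaviour; the `chunker` shown here is an assumed stand-in, since `ingestify.utils.chunker` is imported but its implementation is not part of this diff.

```python
import itertools
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")

DEFAULT_CHUNK_SIZE = 1_000


def chunker(iterable: Iterable[T], size: int) -> Iterator[List[T]]:
    # Assumed stand-in for ingestify.utils.chunker: group a flat iterator
    # into lists of at most `size` items.
    iterator = iter(iterable)
    while chunk := list(itertools.islice(iterator, size)):
        yield chunk


def to_batches(input_):
    # Mirrors the helper added in ingestion_job.py: a plain iterator of items
    # is chunked, an iterator that already yields lists is passed through
    # unchanged, and an exhausted iterator becomes an empty batch stream.
    if isinstance(input_, list):
        return iter(input_)
    try:
        peek = next(input_)
    except StopIteration:
        return iter([])
    input_ = itertools.chain([peek], input_)
    if not isinstance(peek, list):
        return chunker(input_, DEFAULT_CHUNK_SIZE)
    return input_


# A source yielding individual items is grouped into chunks of 1,000 ...
flat = iter(range(2_500))
print([len(batch) for batch in to_batches(flat)])        # [1000, 1000, 500]

# ... while a source that already yields batches is left as-is.
prebatched = iter([[1, 2], [3, 4, 5]])
print([len(batch) for batch in to_batches(prebatched)])  # [2, 3]
```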

ingestify/domain/models/ingestion/ingestion_job_summary.py
ADDED

@@ -0,0 +1,123 @@
+import uuid
+from contextlib import contextmanager
+from datetime import datetime, timedelta
+from typing import Optional, List, TYPE_CHECKING
+from pydantic import Field
+
+from ingestify.domain import Selector, DataSpecVersionCollection
+from ingestify.domain.models.base import BaseModel
+from ingestify.domain.models.task.task_summary import TaskSummary, TaskStatus
+from ingestify.domain.models.timing import Timing
+from ingestify.utils import utcnow
+
+if TYPE_CHECKING:
+    from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
+
+
+def format_duration(duration: timedelta):
+    return f"{duration.total_seconds():.2f}sec"
+
+
+class IngestionJobSummary(BaseModel):
+    ingestion_job_summary_id: str
+    ingestion_job_id: str
+
+    # From the IngestionPlan
+    provider: str
+    source_name: str
+    dataset_type: str
+    data_spec_versions: DataSpecVersionCollection
+    selector: Selector
+
+    started_at: datetime = Field(default_factory=utcnow)
+    finished_at: Optional[datetime] = None
+    timings: List[Timing] = Field(default_factory=list)
+    task_summaries: List[TaskSummary] = Field(default_factory=list)
+
+    skipped_datasets: int = 0
+    failed_tasks: int = 0
+    successful_tasks: int = 0
+    ignored_successful_tasks: int = 0
+
+    @classmethod
+    def new(cls, ingestion_job: "IngestionJob"):
+        args = dict(
+            ingestion_job_summary_id=str(uuid.uuid1()),
+            ingestion_job_id=ingestion_job.ingestion_job_id,
+            provider=ingestion_job.ingestion_plan.source.provider,
+            source_name=ingestion_job.ingestion_plan.source.name,
+            dataset_type=ingestion_job.ingestion_plan.dataset_type,
+            data_spec_versions=ingestion_job.ingestion_plan.data_spec_versions,
+            selector=ingestion_job.selector,
+        )
+        return cls(**args)
+
+    @contextmanager
+    def record_timing(self, name: str):
+        start = utcnow()
+        yield
+        self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+    def start_timing(self, name):
+        start = utcnow()
+
+        def finish():
+            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+        return finish
+
+    def add_task_summaries(self, task_summaries: List[TaskSummary]):
+        self.task_summaries.extend(task_summaries)
+
+    def increase_skipped_datasets(self, skipped_datasets: int):
+        self.skipped_datasets += skipped_datasets
+
+    def task_count(self):
+        return len(self.task_summaries)
+
+    def set_finished(self):
+        self.failed_tasks = len(
+            [task for task in self.task_summaries if task.status == TaskStatus.FAILED]
+        )
+        self.successful_tasks = len(
+            [task for task in self.task_summaries if task.status == TaskStatus.FINISHED]
+        )
+        self.ignored_successful_tasks = len(
+            [
+                task
+                for task in self.task_summaries
+                if task.status == TaskStatus.FINISHED_IGNORED
+            ]
+        )
+        self.finished_at = utcnow()
+
+    @property
+    def duration(self) -> timedelta:
+        return self.finished_at - self.started_at
+
+    def output_report(self):
+        print(f"\nIngestionJobSummary finished in {format_duration(self.duration)}")
+        print("--------------------")
+        print(f" - IngestionPlan:")
+        print(f" Source: {self.source_name}")
+        print(f" Provider: {self.provider}")
+        print(f" DatasetType: {self.dataset_type}")
+        print(f" - Selector: {self.selector}")
+        print(f" - Timings: ")
+        for timing in self.timings:
+            print(f" - {timing.name}: {format_duration(timing.duration)}")
+        print(
+            f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+        )
+
+        print(f" - Failed tasks: {self.failed_tasks}")
+        print(f" - Successful tasks: {self.successful_tasks}")
+        print(f" - Successful ignored tasks: {self.successful_tasks}")
+        print(f" - Skipped datasets: {self.skipped_datasets}")
+        print("--------------------")
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
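
`IngestionJobSummary` records wall-clock timings in two styles: `record_timing` wraps one contiguous block as a context manager, while `start_timing` returns a `finish()` callback so the "tasks" span can be closed later, for example after a loop that is interrupted to yield an intermediate summary. A self-contained sketch is shown below; the `Timing` and `utcnow` stand-ins are assumptions, since their real definitions (in `ingestify/domain/models/timing.py` and `ingestify/utils.py`) are not shown in full in this diff.

```python
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Callable, List


def utcnow() -> datetime:
    # Assumed stand-in for ingestify.utils.utcnow.
    return datetime.now(timezone.utc)


@dataclass
class Timing:
    # Assumed stand-in for the Timing model; ingestion_job_summary.py
    # constructs it with name, started_at and ended_at.
    name: str
    started_at: datetime
    ended_at: datetime

    @property
    def duration(self):
        return self.ended_at - self.started_at


@dataclass
class TimingRecorder:
    # Mirrors the two timing styles on IngestionJobSummary.
    timings: List[Timing] = field(default_factory=list)

    @contextmanager
    def record_timing(self, name: str):
        # Context-manager style: times one contiguous block.
        start = utcnow()
        yield
        self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

    def start_timing(self, name: str) -> Callable[[], None]:
        # Callback style: the span is closed whenever finish() is called.
        start = utcnow()

        def finish():
            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

        return finish


recorder = TimingRecorder()
with recorder.record_timing("find_datasets"):
    sum(range(1_000_000))

finish = recorder.start_timing("tasks")
sum(range(1_000_000))
finish()

for timing in recorder.timings:
    print(f"{timing.name}: {timing.duration.total_seconds():.4f}s")
```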

ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py}
RENAMED

@@ -1,15 +1,15 @@
-from dataclasses import dataclass
 from typing import List
 
 from ingestify.domain.models import Source, Selector
+from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.data_spec_version_collection import (
     DataSpecVersionCollection,
 )
 from ingestify.domain.models.fetch_policy import FetchPolicy
 
 
-
-
+class IngestionPlan(BaseModel):
+
     source: Source
     selectors: List[Selector]
     fetch_policy: FetchPolicy
@@ -17,7 +17,7 @@ class ExtractJob:
     data_spec_versions: DataSpecVersionCollection
 
     def __repr__(self):
-        return f'<
+        return f'<IngestionPlan source="{self.source.name}" dataset_type="{self.dataset_type}">'
 
     def __str__(self):
         return repr(self)

ingestify/domain/models/resources/dataset_resource.py
CHANGED

@@ -1,35 +1,40 @@
-from dataclasses import dataclass
+from dataclasses import dataclass
 from datetime import datetime
-from typing import Optional, Callable, TYPE_CHECKING
+from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING  # noqa
+from pydantic import Field
 
+from ingestify.domain.models.base import BaseModel
+from ingestify.domain.models.dataset.dataset_state import DatasetState
 from ingestify.exceptions import DuplicateFile
 
-
-from ingestify.domain import DraftFile, File
-from ingestify.domain.models.dataset.dataset import DatasetState
+from ingestify.domain.models import File, DraftFile
 
 
-
-
+class FileLoaderProtocol(Protocol):
+    def __call__(
+        self,
+        file_resource: "FileResource",
+        file: Optional["File"] = None,
+        **kwargs: Any,
+    ) -> Optional["DraftFile"]:
+        ...
+
+
+class FileResource(BaseModel):
     dataset_resource: "DatasetResource"
     file_id: str
     last_modified: datetime
     data_feed_key: str
     data_spec_version: str
-
-    # DataSerializationFormat is "json" in case of json_content, otherwise file_loader will return it
-    # data_serialization_format: str
-
     json_content: Optional[dict] = None
-
     url: Optional[str] = None
     http_options: Optional[dict] = None
+    # DataSerializationFormat is "json" in case of json_content, otherwise file_loader will return it
     data_serialization_format: Optional[str] = None
-
    file_loader: Optional[
         Callable[["FileResource", Optional["File"]], Optional["DraftFile"]]
     ] = None
-    loader_kwargs: dict =
+    loader_kwargs: dict = Field(default_factory=dict)
 
     def __post_init__(self):
         if self.json_content is None and not self.url and not self.file_loader:
@@ -38,27 +43,14 @@ class FileResource:
             )
 
 
-class DatasetResource:
-
-
-
-
-
-
-
-        metadata: Optional[dict] = None,
-        state: Optional["DatasetState"] = None,
-    ):
-        from ingestify.domain.models.dataset.dataset import DatasetState
-
-        self.dataset_type = dataset_type
-        self.provider = provider
-        self.dataset_resource_id = dataset_resource_id
-        self.name = name
-        self.metadata = metadata or {}
-        self.state = state or DatasetState.COMPLETE
-
-        self.files = {}
+class DatasetResource(BaseModel):
+    dataset_resource_id: dict
+    dataset_type: str
+    provider: str
+    name: str
+    metadata: dict = Field(default_factory=dict)
+    state: DatasetState = Field(default_factory=lambda: DatasetState.COMPLETE)
+    files: dict[str, FileResource] = Field(default_factory=dict)
 
     def add_file(
         self,
@@ -72,8 +64,8 @@ class DatasetResource:
         data_serialization_format: Optional[str] = None,
         file_loader: Optional[
             Callable[
-                [
-                Optional[
+                [FileResource, Optional[File]],
+                Optional[DraftFile],
             ]
         ] = None,
         loader_kwargs: Optional[dict] = None,
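
`dataset_resource.py` replaces the hand-written `DatasetResource`/`FileResource` classes with pydantic models on the new shared `BaseModel`, and adds `FileLoaderProtocol` to document the callable expected for `file_loader`. The sketch below shows a loader that satisfies that shape; only the protocol signature is taken from the diff, while the `DraftFile`/`File`/`FileResource` stand-ins and the `load_from_disk` helper are hypothetical so the example runs on its own.

```python
from typing import Any, Optional, Protocol


class DraftFile:
    # Minimal stand-in for ingestify's DraftFile (hypothetical).
    def __init__(self, content: bytes):
        self.content = content


class File:
    # Minimal stand-in for ingestify's File (hypothetical).
    pass


class FileResource:
    # Minimal stand-in for ingestify's FileResource (hypothetical).
    def __init__(self, file_id: str):
        self.file_id = file_id


class FileLoaderProtocol(Protocol):
    # Mirrors the protocol added in dataset_resource.py: a loader receives the
    # FileResource, optionally the currently stored File, plus loader_kwargs,
    # and returns a DraftFile (or None when nothing changed).
    def __call__(
        self,
        file_resource: "FileResource",
        file: Optional["File"] = None,
        **kwargs: Any,
    ) -> Optional["DraftFile"]:
        ...


def load_from_disk(
    file_resource: FileResource,
    file: Optional[File] = None,
    *,
    base_path: str = ".",
    **kwargs: Any,
) -> Optional[DraftFile]:
    # Hypothetical loader: reads the payload for a resource from local disk.
    with open(f"{base_path}/{file_resource.file_id}", "rb") as fp:
        return DraftFile(fp.read())


# A static type checker accepts this assignment because load_from_disk is
# structurally compatible with FileLoaderProtocol.
loader: FileLoaderProtocol = load_from_disk
```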
ingestify/domain/models/sink.py
CHANGED
@@ -1,16 +1,9 @@
 from abc import ABC, abstractmethod
 
-from ingestify.utils import ComponentFactory, ComponentRegistry
-
 from .dataset import Dataset
 
-sink_registry = ComponentRegistry()
-
 
-class Sink(ABC
+class Sink(ABC):
     @abstractmethod
     def upsert(self, dataset: Dataset, data, params: dict):
         pass
-
-
-sink_factory = ComponentFactory.build_factory(Sink, sink_registry)
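
`sink.py` drops the `ComponentRegistry`/`ComponentFactory` wiring, leaving `Sink` as a plain ABC with a single abstract `upsert(dataset, data, params)`. A minimal sketch of a concrete sink is shown below, written against a local mirror of that ABC so it runs stand-alone; the `Dataset` stand-in and `PrintSink` are hypothetical, and how sinks are registered or configured is not shown in this diff.

```python
from abc import ABC, abstractmethod
from typing import Any


class Dataset:
    # Assumed stand-in for ingestify's Dataset, just enough for the example.
    def __init__(self, name: str):
        self.name = name


class Sink(ABC):
    # Local mirror of the slimmed-down ABC in ingestify/domain/models/sink.py.
    @abstractmethod
    def upsert(self, dataset: Dataset, data: Any, params: dict):
        pass


class PrintSink(Sink):
    # Hypothetical sink: prints what it would write instead of persisting it.
    def upsert(self, dataset: Dataset, data: Any, params: dict):
        print(f"upsert {len(data)} rows into {dataset.name!r} with params={params}")


PrintSink().upsert(Dataset("matches"), data=[{"id": 1}], params={"if_exists": "replace"})
```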