ingestify 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +47 -36
- ingestify/application/ingestion_engine.py +3 -3
- ingestify/application/loader.py +71 -241
- ingestify/domain/models/__init__.py +1 -6
- ingestify/domain/models/base.py +22 -0
- ingestify/domain/models/data_spec_version_collection.py +6 -0
- ingestify/domain/models/dataset/__init__.py +3 -5
- ingestify/domain/models/dataset/dataset.py +15 -32
- ingestify/domain/models/dataset/dataset_repository.py +1 -15
- ingestify/domain/models/dataset/dataset_state.py +11 -0
- ingestify/domain/models/dataset/events.py +6 -16
- ingestify/domain/models/dataset/file.py +21 -34
- ingestify/domain/models/dataset/file_collection.py +3 -1
- ingestify/domain/models/dataset/file_repository.py +29 -28
- ingestify/domain/models/dataset/revision.py +26 -3
- ingestify/domain/models/event/domain_event.py +8 -4
- ingestify/domain/models/ingestion/__init__.py +0 -0
- ingestify/domain/models/ingestion/ingestion_job.py +325 -0
- ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
- ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
- ingestify/domain/models/resources/dataset_resource.py +29 -37
- ingestify/domain/models/sink.py +1 -8
- ingestify/domain/models/task/task.py +3 -1
- ingestify/domain/models/task/task_summary.py +118 -0
- ingestify/domain/models/timing.py +16 -0
- ingestify/domain/services/identifier_key_transformer.py +111 -0
- ingestify/infra/fetch/http.py +5 -0
- ingestify/infra/source/statsbomb_github.py +67 -54
- ingestify/infra/store/dataset/__init__.py +0 -2
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +187 -4
- ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
- ingestify/infra/store/file/local_file_repository.py +3 -5
- ingestify/infra/store/file/s3_file_repository.py +4 -9
- ingestify/main.py +64 -25
- ingestify/utils.py +15 -78
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA +2 -1
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD +41 -34
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL +1 -1
- ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0
ingestify/__init__.py
CHANGED

ingestify/application/dataset_store.py
CHANGED
@@ -5,13 +5,14 @@ import mimetypes
import os
import shutil
from dataclasses import asdict
-from io import BytesIO
+from io import BytesIO

-from typing import Dict, List, Optional, Union, Callable, BinaryIO
+from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable

from ingestify.domain.models.dataset.dataset import DatasetState
from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
from ingestify.domain.models.dataset.file_collection import FileCollection
+from ingestify.domain.models.dataset.revision import RevisionSource
from ingestify.domain.models.event import EventBus
from ingestify.domain.models import (
    Dataset,

@@ -27,7 +28,7 @@ from ingestify.domain.models import (
    Revision,
    DatasetCreated,
)
-from ingestify.utils import utcnow
+from ingestify.utils import utcnow


logger = logging.getLogger(__name__)

@@ -56,11 +57,16 @@ class DatasetStore:
        if self.event_bus:
            self.event_bus.dispatch(event)

+    def save_ingestion_job_summary(self, ingestion_job_summary):
+        self.dataset_repository.session.add(ingestion_job_summary)
+        self.dataset_repository.session.commit()
+
    def get_dataset_collection(
        self,
        dataset_type: Optional[str] = None,
        provider: Optional[str] = None,
        dataset_id: Optional[str] = None,
+        metadata_only: Optional[bool] = False,
        **selector,
    ) -> DatasetCollection:
        if "selector" in selector:

@@ -81,6 +87,7 @@ class DatasetStore:
            dataset_type=dataset_type,
            dataset_id=dataset_id,
            provider=provider,
+            metadata_only=metadata_only,
            selector=selector,
        )
        return dataset_collection

@@ -107,7 +114,9 @@ class DatasetStore:

        return stream, storage_size, suffix

-    def _prepare_read_stream(
+    def _prepare_read_stream(
+        self,
+    ) -> tuple[Callable[[BinaryIO], Awaitable[BytesIO]], str]:
        if self.storage_compression_method == "gzip":

            def reader(fh: BinaryIO) -> BytesIO:

@@ -168,7 +177,11 @@ class DatasetStore:
        return modified_files_

    def add_revision(
-        self,
+        self,
+        dataset: Dataset,
+        files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
+        description: str = "Update",
    ):
        """
        Create new revision first, so FileRepository can use

@@ -182,46 +195,53 @@ class DatasetStore:
        # It can happen an API tells us data is changed, but it was not changed. In this case
        # we decide to ignore it.
        # Make sure there are files changed before creating a new revision
-
-
-
-
-
-
-            )
+            revision = Revision(
+                revision_id=revision_id,
+                created_at=created_at,
+                description=description,
+                modified_files=persisted_files_,
+                source=revision_source,
            )

+            dataset.add_revision(revision)
+
            self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
            self.dispatch(RevisionAdded(dataset=dataset))
            logger.info(
                f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
            )
-            return True
        else:
            logger.info(
                f"Ignoring a new revision without changed files -> {dataset.identifier}"
            )
-
+            revision = None
+
+        return revision

    def update_dataset(
        self,
        dataset: Dataset,
-
+        name: str,
+        state: DatasetState,
+        metadata: dict,
        files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
    ):
        """The add_revision will also save the dataset."""
        metadata_changed = False
-        if dataset.
+        if dataset.update_metadata(name, metadata, state):
            self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
            metadata_changed = True

-        self.add_revision(dataset, files)
+        revision = self.add_revision(dataset, files, revision_source)

        if metadata_changed:
            # Dispatch after revision added. Otherwise, the downstream handlers are not able to see
            # the new revision
            self.dispatch(MetadataUpdated(dataset=dataset))

+        return revision
+
    def destroy_dataset(self, dataset: Dataset):
        # TODO: remove files. Now we leave some orphaned files around
        self.dataset_repository.destroy(dataset)

@@ -235,6 +255,7 @@ class DatasetStore:
        state: DatasetState,
        metadata: dict,
        files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
        description: str = "Create",
    ):
        now = utcnow()

@@ -251,9 +272,10 @@ class DatasetStore:
            created_at=now,
            updated_at=now,
        )
-        self.add_revision(dataset, files, description)
+        revision = self.add_revision(dataset, files, revision_source, description)

        self.dispatch(DatasetCreated(dataset=dataset))
+        return revision

    def load_files(
        self,

@@ -271,20 +293,9 @@ class DatasetStore:
                continue

            def get_stream(file_):
-                revision_id = file_.revision_id
-                if revision_id is None:
-                    revision_id = current_revision.revision_id
-
                return reader(
                    self.file_repository.load_content(
-                        bucket=self.bucket,
-                        dataset=dataset,
-                        # When file.revision_id is set we must use it.
-                        revision_id=revision_id,
-                        filename=file_.file_id
-                        + "."
-                        + file_.data_serialization_format
-                        + suffix,
+                        bucket=self.bucket, storage_path=file_.storage_path
                    )
                )

@@ -302,8 +313,8 @@ class DatasetStore:

        try:
            return statsbomb.load(
-                event_data=files.get_file("events").stream,
-                lineup_data=files.get_file("lineups").stream,
+                event_data=(files.get_file("events")).stream,
+                lineup_data=(files.get_file("lineups")).stream,
                **kwargs,
            )
        except Exception as e:

@@ -333,7 +344,7 @@ class DatasetStore:
    # filename=filename,
    # )

-    def map(
-
-    ):
-
+    # def map(
+    #     self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
+    # ):
+    #     return map_in_pool(fn, dataset_collection, processes)
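The add_revision change above also alters its contract: instead of returning True, it now returns the newly created Revision, or None when the upstream API reported a change but no file contents actually differed (the case described in the comments). A minimal, self-contained sketch of that contract, using stub types rather than the real ingestify classes:

    from dataclasses import dataclass
    from typing import Dict, Optional

    @dataclass
    class StubRevision:
        revision_id: int
        description: str

    def add_revision_stub(
        changed_files: Dict[str, bytes],
        revision_source: str,
        description: str = "Update",
    ) -> Optional[StubRevision]:
        # Mirrors the behaviour in the diff: no changed files -> no revision, return None.
        if not changed_files:
            return None
        return StubRevision(revision_id=1, description=description)

    assert add_revision_stub({}, revision_source="api-poll") is None
    assert add_revision_stub({"events": b"{}"}, revision_source="api-poll").description == "Update"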
ingestify/application/ingestion_engine.py
CHANGED

@@ -5,7 +5,7 @@ from typing import Optional, List

from .loader import Loader
from .dataset_store import DatasetStore
-from 
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan

logger = logging.getLogger(__name__)

@@ -18,8 +18,8 @@ class IngestionEngine:
        self.store = store
        self.loader = Loader(self.store)

-    def 
-        self.loader.
+    def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+        self.loader.add_ingestion_plan(ingestion_plan)

    def load(self, dry_run: bool = False, provider: Optional[str] = None):
        self.loader.collect_and_run(dry_run=dry_run, provider=provider)
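In 0.3.0 the engine API is expressed in terms of IngestionPlan rather than ExtractJob, and the engine simply forwards plans to its Loader. A rough, runnable sketch of that delegation with stand-in classes (the real IngestionPlan, Loader and DatasetStore have richer constructors than shown here):

    from typing import List, Optional

    class StubLoader:
        def __init__(self) -> None:
            self.ingestion_plans: List[str] = []

        def add_ingestion_plan(self, plan: str) -> None:
            self.ingestion_plans.append(plan)

        def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None) -> None:
            for plan in self.ingestion_plans:
                print(f"would run {plan!r} (dry_run={dry_run}, provider={provider})")

    class StubIngestionEngine:
        def __init__(self) -> None:
            self.loader = StubLoader()

        def add_ingestion_plan(self, plan: str) -> None:
            # Same delegation as in the diff: the engine only forwards to its loader.
            self.loader.add_ingestion_plan(plan)

        def load(self, dry_run: bool = False, provider: Optional[str] = None) -> None:
            self.loader.collect_and_run(dry_run=dry_run, provider=provider)

    engine = StubIngestionEngine()
    engine.add_ingestion_plan("statsbomb_github/match")
    engine.load(dry_run=True)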
ingestify/application/loader.py
CHANGED
@@ -1,19 +1,15 @@
-import itertools
-import json
import logging
import platform
-
+import uuid
+from multiprocessing import set_start_method
from typing import List, Optional

-from ingestify.domain.models import 
-from ingestify.utils import 
+from ingestify.domain.models import Selector
+from ingestify.utils import TaskExecutor

from .dataset_store import DatasetStore
-from 
-from ..domain import 
-from ..domain.models.data_spec_version_collection import DataSpecVersionCollection
-from ..domain.models.extract_job import ExtractJob
-from ..domain.models.resources.dataset_resource import FileResource
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+from ..domain.models.ingestion.ingestion_job import IngestionJob
from ..exceptions import ConfigurationError

if platform.system() == "Darwin":

@@ -25,176 +21,54 @@ else:
logger = logging.getLogger(__name__)


-DEFAULT_CHUNK_SIZE = 1000
-
-
-def to_batches(input_):
-    if isinstance(input_, list):
-        batches = [input_]
-    else:
-        # Assume it's an iterator. Peek what's inside, and put it back
-        try:
-            peek = next(input_)
-        except StopIteration:
-            # Nothing to batch
-            return []
-
-        input_ = itertools.chain([peek], input_)
-
-        if not isinstance(peek, list):
-            batches = chunker(input_, DEFAULT_CHUNK_SIZE)
-        else:
-            batches = input_
-    return batches
-
-
-def load_file(
-    file_resource: FileResource, dataset: Optional[Dataset] = None
-) -> Optional[DraftFile]:
-    current_file = None
-    if dataset:
-        current_file = dataset.current_revision.modified_files_map.get(
-            file_resource.file_id
-        )
-
-    if file_resource.json_content is not None:
-        # Empty dictionary is allowed
-        file = DraftFile.from_input(
-            file_=json.dumps(file_resource.json_content, indent=4),
-            data_serialization_format="json",
-            data_feed_key=file_resource.data_feed_key,
-            data_spec_version=file_resource.data_spec_version,
-            modified_at=file_resource.last_modified,
-        )
-        if current_file and current_file.tag == file.tag:
-            # Nothing changed
-            return None
-        return file
-    elif file_resource.url:
-        http_options = {}
-        if file_resource.http_options:
-            for k, v in file_resource.http_options.items():
-                http_options[f"http_{k}"] = v
-
-        return retrieve_http(
-            url=file_resource.url,
-            current_file=current_file,
-            file_data_feed_key=file_resource.data_feed_key,
-            file_data_spec_version=file_resource.data_spec_version,
-            file_data_serialization_format=file_resource.data_serialization_format
-            or "txt",
-            **http_options,
-            **file_resource.loader_kwargs,
-        )
-    else:
-        return file_resource.file_loader(
-            file_resource,
-            current_file,
-            # TODO: check how to fix this with typehints
-            **file_resource.loader_kwargs,
-        )
-
-
-class UpdateDatasetTask(Task):
-    def __init__(
-        self,
-        dataset: Dataset,
-        dataset_resource: DatasetResource,
-        store: DatasetStore,
-    ):
-        self.dataset = dataset
-        self.dataset_resource = dataset_resource
-        self.store = store
-
-    def run(self):
-        self.store.update_dataset(
-            dataset=self.dataset,
-            dataset_resource=self.dataset_resource,
-            files={
-                file_id: load_file(file_resource, dataset=self.dataset)
-                for file_id, file_resource in self.dataset_resource.files.items()
-            },
-        )
-
-    def __repr__(self):
-        return f"UpdateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
-
-
-class CreateDatasetTask(Task):
-    def __init__(
-        self,
-        dataset_resource: DatasetResource,
-        store: DatasetStore,
-    ):
-        self.dataset_resource = dataset_resource
-        self.store = store
-
-    def run(self):
-        self.store.create_dataset(
-            dataset_type=self.dataset_resource.dataset_type,
-            provider=self.dataset_resource.provider,
-            dataset_identifier=Identifier(**self.dataset_resource.dataset_resource_id),
-            name=self.dataset_resource.name,
-            state=self.dataset_resource.state,
-            metadata=self.dataset_resource.metadata,
-            files={
-                file_id: load_file(file_resource)
-                for file_id, file_resource in self.dataset_resource.files.items()
-            },
-        )
-
-    def __repr__(self):
-        return f"CreateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
-
-
class Loader:
    def __init__(self, store: DatasetStore):
        self.store = store
-        self.
+        self.ingestion_plans: List[IngestionPlan] = []

-    def 
-        self.
+    def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+        self.ingestion_plans.append(ingestion_plan)

    def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
-        total_dataset_count = 0
-
        # First collect all selectors, before discovering datasets
        selectors = {}
-        for 
+        for ingestion_plan in self.ingestion_plans:
+            logger.info(f"Determining selectors for {ingestion_plan}")
+
            if provider is not None:
-                if 
+                if ingestion_plan.source.provider != provider:
                    logger.info(
-                        f"Skipping {
+                        f"Skipping {ingestion_plan} because provider doesn't match '{provider}'"
                    )
                    continue

            static_selectors = [
                selector
-                for selector in 
+                for selector in ingestion_plan.selectors
                if not selector.is_dynamic
            ]
            dynamic_selectors = [
-                selector for selector in 
+                selector for selector in ingestion_plan.selectors if selector.is_dynamic
            ]

            no_selectors = len(static_selectors) == 1 and not bool(static_selectors[0])
            if dynamic_selectors or no_selectors:
-                if hasattr(
+                if hasattr(ingestion_plan.source, "discover_selectors"):
                    logger.debug(
-                        f"Discovering selectors from {
+                        f"Discovering selectors from {ingestion_plan.source.__class__.__name__}"
                    )

                    # TODO: consider making this lazy and fetch once per Source instead of
-                    # once per 
-                    all_selectors = 
-
+                    # once per IngestionPlan
+                    all_selectors = ingestion_plan.source.discover_selectors(
+                        ingestion_plan.dataset_type
                    )
                    if no_selectors:
                        # When there were no selectors specified, just use all of them
                        extra_static_selectors = [
                            Selector.build(
                                job_selector,
-                                data_spec_versions=
+                                data_spec_versions=ingestion_plan.data_spec_versions,
                            )
                            for job_selector in all_selectors
                        ]

@@ -205,7 +79,7 @@ class Loader:
                        dynamic_job_selectors = [
                            Selector.build(
                                job_selector,
-                                data_spec_versions=
+                                data_spec_versions=ingestion_plan.data_spec_versions,
                            )
                            for job_selector in all_selectors
                            if dynamic_selector.is_match(job_selector)

@@ -216,7 +90,7 @@ class Loader:
                static_selectors.extend(extra_static_selectors)

                logger.info(
-                    f"Discovered {len(extra_static_selectors)} selectors from {
+                    f"Discovered {len(extra_static_selectors)} selectors from {ingestion_plan.source.__class__.__name__}"
                )
            else:
                if not no_selectors:

@@ -224,112 +98,68 @@ class Loader:
                    # later on
                    raise ConfigurationError(
                        f"Dynamic selectors cannot be used for "
-                        f"{
+                        f"{ingestion_plan.source.__class__.__name__} because it doesn't support"
                        f" selector discovery"
                    )

            # Merge selectors when source, dataset_type and actual selector is the same. This makes
            # sure there will be only 1 dataset for this combination
            for selector in static_selectors:
-                key = (
+                key = (
+                    ingestion_plan.source.name,
+                    ingestion_plan.dataset_type,
+                    selector.key,
+                )
                if existing_selector := selectors.get(key):
                    existing_selector[1].data_spec_versions.merge(
                        selector.data_spec_versions
                    )
                else:
-                    selectors[key] = (
-
-
-
-
-
-
-
-
+                    selectors[key] = (ingestion_plan, selector)
+
+        """
+        Data is denormalized:
+
+        It actually looks like:
+        - IngestionPlan #1
+            - Selector 1.1
+            - Selector 1.2
+            - Selector 1.3
+        - IngestionPlan #2
+            - Selector 2.1
+            - Selector 2.2
+
+        We process this as:
+        - IngestionPlan #1, Selector 1.1
+        - IngestionPlan #1, Selector 1.2
+        - IngestionPlan #1, Selector 1.3
+        - IngestionPlan #2, Selector 2.1
+        - IngestionPlan #2, Selector 2.2
+
+        IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
+        """
+        for ingestion_plan, selector in selectors.values():
+            logger.info(
+                f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
            )

-
-
-
+            ingestion_job = IngestionJob(
+                ingestion_job_id=str(uuid.uuid1()),
+                ingestion_plan=ingestion_plan,
                selector=selector,
-                metadata_only=True,
-            ).metadata
-
-            # There are two different, but similar flows here:
-            # 1. The discover_datasets returns a list, and the entire list can be processed at once
-            # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
-            datasets = extract_job.source.find_datasets(
-                dataset_type=extract_job.dataset_type,
-                data_spec_versions=selector.data_spec_versions,
-                dataset_collection_metadata=dataset_collection_metadata,
-                **selector.custom_attributes,
            )

-
-
-
-
-
-
-
-            #
-            #
-
-
-
-            # Load all available datasets based on the discovered dataset identifiers
-            dataset_collection = self.store.get_dataset_collection(
-                dataset_type=extract_job.dataset_type,
-                # Assume all DatasetResources share the same provider
-                provider=batch[0].provider,
-                selector=dataset_identifiers,
-            )
-
-            skip_count = 0
-            total_dataset_count += len(dataset_identifiers)
-
-            task_set = TaskSet()
-            for dataset_resource in batch:
-                dataset_identifier = Identifier.create_from_selector(
-                    selector, **dataset_resource.dataset_resource_id
-                )
-
-                if dataset := dataset_collection.get(dataset_identifier):
-                    if extract_job.fetch_policy.should_refetch(
-                        dataset, dataset_resource
-                    ):
-                        task_set.add(
-                            UpdateDatasetTask(
-                                dataset=dataset,  # Current dataset from the database
-                                dataset_resource=dataset_resource,  # Most recent dataset_resource
-                                store=self.store,
-                            )
-                        )
-                    else:
-                        skip_count += 1
-                else:
-                    if extract_job.fetch_policy.should_fetch(dataset_resource):
-                        task_set.add(
-                            CreateDatasetTask(
-                                dataset_resource=dataset_resource,
-                                store=self.store,
-                            )
-                        )
-                    else:
-                        skip_count += 1
-
-            if task_set:
-                logger.info(
-                    f"Discovered {len(dataset_identifiers)} datasets from {extract_job.source.__class__.__name__} "
-                    f"using selector {selector} => {len(task_set)} tasks. {skip_count} skipped."
-                )
-                logger.info(f"Running {len(task_set)} tasks")
-                with TaskExecutor(dry_run=dry_run) as task_executor:
-                    task_executor.run(run_task, task_set)
-            else:
-                logger.info(
-                    f"Discovered {len(dataset_identifiers)} datasets from {extract_job.source.__class__.__name__} "
-                    f"using selector {selector} => nothing to do"
-                )
+            with TaskExecutor(dry_run=dry_run) as task_executor:
+                for ingestion_job_summary in ingestion_job.execute(
+                    self.store, task_executor=task_executor
+                ):
+                    # TODO: handle task_summaries
+                    # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
+                    # next run to determine where to resume.
+                    # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
+                    # extra information to determine how/where to resume
+                    ingestion_job_summary.output_report()
+                    logger.info(f"Storing IngestionJobSummary")
+                    self.store.save_ingestion_job_summary(ingestion_job_summary)

        logger.info("Done")
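The docstring added to collect_and_run describes how plans and selectors are denormalized into (IngestionPlan, Selector) pairs, keyed on (source name, dataset type, selector key) so that duplicates are merged via data_spec_versions before one IngestionJob is executed per pair. A self-contained sketch of just that keying and merge step, with stand-in classes instead of the real ingestify types:

    from dataclasses import dataclass, field
    from typing import List, Set

    @dataclass
    class StubSpecVersions:
        versions: Set[str] = field(default_factory=set)

        def merge(self, other: "StubSpecVersions") -> None:
            self.versions |= other.versions

    @dataclass
    class StubSelector:
        key: str
        data_spec_versions: StubSpecVersions

    @dataclass
    class StubPlan:
        source_name: str
        dataset_type: str
        selectors: List[StubSelector]

    plans = [
        StubPlan("statsbomb_github", "match", [StubSelector("competition_id=11", StubSpecVersions({"v4"}))]),
        StubPlan("statsbomb_github", "match", [StubSelector("competition_id=11", StubSpecVersions({"v6"}))]),
    ]

    # Same shape as collect_and_run: one entry per (source, dataset_type, selector key).
    selectors: dict = {}
    for plan in plans:
        for selector in plan.selectors:
            key = (plan.source_name, plan.dataset_type, selector.key)
            if existing := selectors.get(key):
                existing[1].data_spec_versions.merge(selector.data_spec_versions)
            else:
                selectors[key] = (plan, selector)

    assert len(selectors) == 1
    assert next(iter(selectors.values()))[1].data_spec_versions.versions == {"v4", "v6"}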
ingestify/domain/models/__init__.py
CHANGED

@@ -11,10 +11,8 @@ from .dataset import (
    LoadedFile,
    Selector,
    Revision,
-    dataset_repository_factory,
-    file_repository_factory,
)
-from .sink import Sink
+from .sink import Sink
from .source import Source
from .task import Task, TaskSet
from .data_spec_version_collection import DataSpecVersionCollection

@@ -35,11 +33,8 @@ __all__ = [
    "FileRepository",
    "FileCollection",
    "DatasetRepository",
-    "dataset_repository_factory",
-    "file_repository_factory",
    "TaskSet",
    "Task",
    "Sink",
-    "sink_factory",
    "DataSpecVersionCollection",
]
ingestify/domain/models/base.py
ADDED

@@ -0,0 +1,22 @@
+from functools import partial
+from typing import ClassVar, Any, Optional
+
+import pydantic
+from pydantic import BaseModel as PydanticBaseModel, ConfigDict
+
+
+# class BaseModel(PydanticBaseModel):
+#     model_config = ConfigDict(arbitrary_types_allowed=True)
+#
+#     _sa_instance_state: Optional[dict] = None
+from sqlalchemy.orm import MappedAsDataclass
+
+
+class BaseModel(
+    MappedAsDataclass,
+    # DeclarativeBase,
+    dataclass_callable=partial(
+        pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
+    ),
+):
+    pass
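The new base module combines SQLAlchemy's MappedAsDataclass with a pydantic dataclass factory: dataclass_callable is a partial of pydantic.dataclasses.dataclass, so subclasses of BaseModel become validating dataclasses rather than plain ones. The snippet below exercises only that callable in isolation; the Timing class is illustrative and not taken from the diff:

    from datetime import datetime
    from functools import partial

    import pydantic
    from pydantic import ConfigDict

    # The same callable the diff passes to MappedAsDataclass via dataclass_callable.
    validating_dataclass = partial(
        pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
    )

    @validating_dataclass
    class Timing:
        name: str
        started_at: datetime

    t = Timing(name="fetch", started_at="2024-01-01T00:00:00")
    assert isinstance(t.started_at, datetime)  # pydantic coerced the ISO string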