ingestify 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +44 -24
- ingestify/application/ingestion_engine.py +3 -3
- ingestify/application/loader.py +67 -237
- ingestify/domain/models/__init__.py +1 -6
- ingestify/domain/models/base.py +22 -0
- ingestify/domain/models/data_spec_version_collection.py +6 -0
- ingestify/domain/models/dataset/__init__.py +3 -5
- ingestify/domain/models/dataset/dataset.py +15 -32
- ingestify/domain/models/dataset/dataset_repository.py +1 -15
- ingestify/domain/models/dataset/dataset_state.py +11 -0
- ingestify/domain/models/dataset/events.py +6 -16
- ingestify/domain/models/dataset/file.py +21 -34
- ingestify/domain/models/dataset/file_collection.py +3 -1
- ingestify/domain/models/dataset/file_repository.py +1 -10
- ingestify/domain/models/dataset/revision.py +26 -3
- ingestify/domain/models/event/domain_event.py +8 -4
- ingestify/domain/models/ingestion/__init__.py +0 -0
- ingestify/domain/models/ingestion/ingestion_job.py +292 -0
- ingestify/domain/models/ingestion/ingestion_job_summary.py +106 -0
- ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
- ingestify/domain/models/resources/dataset_resource.py +29 -37
- ingestify/domain/models/sink.py +1 -8
- ingestify/domain/models/task/task.py +3 -1
- ingestify/domain/models/task/task_summary.py +118 -0
- ingestify/domain/models/timing.py +16 -0
- ingestify/infra/fetch/http.py +5 -0
- ingestify/infra/source/statsbomb_github.py +67 -54
- ingestify/infra/store/dataset/__init__.py +0 -2
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +184 -4
- ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -22
- ingestify/main.py +42 -22
- ingestify/utils.py +15 -78
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/METADATA +2 -1
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/RECORD +38 -32
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/WHEEL +1 -1
- ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/top_level.txt +0 -0
ingestify/__init__.py
CHANGED
ingestify/application/dataset_store.py
CHANGED

@@ -5,13 +5,14 @@ import mimetypes
 import os
 import shutil
 from dataclasses import asdict
-from io import BytesIO
+from io import BytesIO

-from typing import Dict, List, Optional, Union, Callable, BinaryIO
+from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable

 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
 from ingestify.domain.models.dataset.file_collection import FileCollection
+from ingestify.domain.models.dataset.revision import RevisionSource
 from ingestify.domain.models.event import EventBus
 from ingestify.domain.models import (
     Dataset,
@@ -27,7 +28,7 @@ from ingestify.domain.models import (
     Revision,
     DatasetCreated,
 )
-from ingestify.utils import utcnow
+from ingestify.utils import utcnow


 logger = logging.getLogger(__name__)
@@ -56,6 +57,10 @@ class DatasetStore:
         if self.event_bus:
             self.event_bus.dispatch(event)

+    def save_ingestion_job_summary(self, ingestion_job_summary):
+        self.dataset_repository.session.add(ingestion_job_summary)
+        self.dataset_repository.session.commit()
+
     def get_dataset_collection(
         self,
         dataset_type: Optional[str] = None,
@@ -107,7 +112,9 @@ class DatasetStore:

         return stream, storage_size, suffix

-    def _prepare_read_stream(
+    def _prepare_read_stream(
+        self,
+    ) -> tuple[Callable[[BinaryIO], Awaitable[BytesIO]], str]:
         if self.storage_compression_method == "gzip":

             def reader(fh: BinaryIO) -> BytesIO:
@@ -168,7 +175,11 @@ class DatasetStore:
         return modified_files_

     def add_revision(
-        self,
+        self,
+        dataset: Dataset,
+        files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
+        description: str = "Update",
     ):
         """
         Create new revision first, so FileRepository can use
@@ -182,46 +193,53 @@ class DatasetStore:
         # It can happen an API tells us data is changed, but it was not changed. In this case
         # we decide to ignore it.
         # Make sure there are files changed before creating a new revision
-
-
-
-
-
-
-            )
+            revision = Revision(
+                revision_id=revision_id,
+                created_at=created_at,
+                description=description,
+                modified_files=persisted_files_,
+                source=revision_source,
             )

+            dataset.add_revision(revision)
+
             self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
             self.dispatch(RevisionAdded(dataset=dataset))
             logger.info(
                 f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
             )
-            return True
         else:
             logger.info(
                 f"Ignoring a new revision without changed files -> {dataset.identifier}"
             )
-
+            revision = None
+
+        return revision

     def update_dataset(
         self,
         dataset: Dataset,
-
+        name: str,
+        state: DatasetState,
+        metadata: dict,
         files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
     ):
         """The add_revision will also save the dataset."""
         metadata_changed = False
-        if dataset.
+        if dataset.update_metadata(name, metadata, state):
             self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
             metadata_changed = True

-        self.add_revision(dataset, files)
+        revision = self.add_revision(dataset, files, revision_source)

         if metadata_changed:
             # Dispatch after revision added. Otherwise, the downstream handlers are not able to see
             # the new revision
             self.dispatch(MetadataUpdated(dataset=dataset))

+        return revision
+
     def destroy_dataset(self, dataset: Dataset):
         # TODO: remove files. Now we leave some orphaned files around
         self.dataset_repository.destroy(dataset)
@@ -235,6 +253,7 @@ class DatasetStore:
         state: DatasetState,
         metadata: dict,
         files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
         description: str = "Create",
     ):
         now = utcnow()
@@ -251,9 +270,10 @@ class DatasetStore:
             created_at=now,
             updated_at=now,
         )
-        self.add_revision(dataset, files, description)
+        revision = self.add_revision(dataset, files, revision_source, description)

         self.dispatch(DatasetCreated(dataset=dataset))
+        return revision

     def load_files(
         self,
@@ -302,8 +322,8 @@ class DatasetStore:

         try:
             return statsbomb.load(
-                event_data=files.get_file("events").stream,
-                lineup_data=files.get_file("lineups").stream,
+                event_data=(files.get_file("events")).stream,
+                lineup_data=(files.get_file("lineups")).stream,
                 **kwargs,
             )
         except Exception as e:
@@ -333,7 +353,7 @@ class DatasetStore:
         #     filename=filename,
         # )

-    def map(
-        self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
-    ):
-        return map_in_pool(fn, dataset_collection, processes)
+    # def map(
+    #     self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
+    # ):
+    #     return map_in_pool(fn, dataset_collection, processes)

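The net effect of the dataset_store.py changes above: add_revision, update_dataset and create_dataset now require a RevisionSource and return the created Revision (or None when no files actually changed) instead of a bare True. A minimal caller sketch, assuming an already configured store and dataset; the RevisionSource constructor arguments are not shown in this diff, so the field used here is a placeholder:

    from ingestify.domain.models.dataset.revision import RevisionSource

    revision_source = RevisionSource(source_id="manual-backfill")  # field name is an assumption

    revision = store.update_dataset(
        dataset=dataset,
        name=dataset.name,
        state=dataset.state,
        metadata=dataset.metadata,
        files=draft_files,
        revision_source=revision_source,
    )
    if revision is None:
        # The store ignored the update because none of the files changed.
        print("no new revision created")
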
ingestify/application/ingestion_engine.py
CHANGED

@@ -5,7 +5,7 @@ from typing import Optional, List

 from .loader import Loader
 from .dataset_store import DatasetStore
-from
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan

 logger = logging.getLogger(__name__)

@@ -18,8 +18,8 @@ class IngestionEngine:
         self.store = store
         self.loader = Loader(self.store)

-    def
-        self.loader.
+    def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+        self.loader.add_ingestion_plan(ingestion_plan)

     def load(self, dry_run: bool = False, provider: Optional[str] = None):
         self.loader.collect_and_run(dry_run=dry_run, provider=provider)
ingestify/application/loader.py
CHANGED
@@ -1,19 +1,15 @@
-import itertools
-import json
 import logging
 import platform
-
+import uuid
+from multiprocessing import set_start_method
 from typing import List, Optional

-from ingestify.domain.models import
-from ingestify.utils import
+from ingestify.domain.models import Selector
+from ingestify.utils import TaskExecutor

 from .dataset_store import DatasetStore
-from
-from ..domain import
-from ..domain.models.data_spec_version_collection import DataSpecVersionCollection
-from ..domain.models.extract_job import ExtractJob
-from ..domain.models.resources.dataset_resource import FileResource
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+from ..domain.models.ingestion.ingestion_job import IngestionJob
 from ..exceptions import ConfigurationError

 if platform.system() == "Darwin":
@@ -25,176 +21,52 @@ else:
 logger = logging.getLogger(__name__)


-DEFAULT_CHUNK_SIZE = 1000
-
-
-def to_batches(input_):
-    if isinstance(input_, list):
-        batches = [input_]
-    else:
-        # Assume it's an iterator. Peek what's inside, and put it back
-        try:
-            peek = next(input_)
-        except StopIteration:
-            # Nothing to batch
-            return []
-
-        input_ = itertools.chain([peek], input_)
-
-        if not isinstance(peek, list):
-            batches = chunker(input_, DEFAULT_CHUNK_SIZE)
-        else:
-            batches = input_
-    return batches
-
-
-def load_file(
-    file_resource: FileResource, dataset: Optional[Dataset] = None
-) -> Optional[DraftFile]:
-    current_file = None
-    if dataset:
-        current_file = dataset.current_revision.modified_files_map.get(
-            file_resource.file_id
-        )
-
-    if file_resource.json_content is not None:
-        # Empty dictionary is allowed
-        file = DraftFile.from_input(
-            file_=json.dumps(file_resource.json_content, indent=4),
-            data_serialization_format="json",
-            data_feed_key=file_resource.data_feed_key,
-            data_spec_version=file_resource.data_spec_version,
-            modified_at=file_resource.last_modified,
-        )
-        if current_file and current_file.tag == file.tag:
-            # Nothing changed
-            return None
-        return file
-    elif file_resource.url:
-        http_options = {}
-        if file_resource.http_options:
-            for k, v in file_resource.http_options.items():
-                http_options[f"http_{k}"] = v
-
-        return retrieve_http(
-            url=file_resource.url,
-            current_file=current_file,
-            file_data_feed_key=file_resource.data_feed_key,
-            file_data_spec_version=file_resource.data_spec_version,
-            file_data_serialization_format=file_resource.data_serialization_format
-            or "txt",
-            **http_options,
-            **file_resource.loader_kwargs,
-        )
-    else:
-        return file_resource.file_loader(
-            file_resource,
-            current_file,
-            # TODO: check how to fix this with typehints
-            **file_resource.loader_kwargs,
-        )
-
-
-class UpdateDatasetTask(Task):
-    def __init__(
-        self,
-        dataset: Dataset,
-        dataset_resource: DatasetResource,
-        store: DatasetStore,
-    ):
-        self.dataset = dataset
-        self.dataset_resource = dataset_resource
-        self.store = store
-
-    def run(self):
-        self.store.update_dataset(
-            dataset=self.dataset,
-            dataset_resource=self.dataset_resource,
-            files={
-                file_id: load_file(file_resource, dataset=self.dataset)
-                for file_id, file_resource in self.dataset_resource.files.items()
-            },
-        )
-
-    def __repr__(self):
-        return f"UpdateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
-
-
-class CreateDatasetTask(Task):
-    def __init__(
-        self,
-        dataset_resource: DatasetResource,
-        store: DatasetStore,
-    ):
-        self.dataset_resource = dataset_resource
-        self.store = store
-
-    def run(self):
-        self.store.create_dataset(
-            dataset_type=self.dataset_resource.dataset_type,
-            provider=self.dataset_resource.provider,
-            dataset_identifier=Identifier(**self.dataset_resource.dataset_resource_id),
-            name=self.dataset_resource.name,
-            state=self.dataset_resource.state,
-            metadata=self.dataset_resource.metadata,
-            files={
-                file_id: load_file(file_resource)
-                for file_id, file_resource in self.dataset_resource.files.items()
-            },
-        )
-
-    def __repr__(self):
-        return f"CreateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
-
-
 class Loader:
     def __init__(self, store: DatasetStore):
         self.store = store
-        self.
+        self.ingestion_plans: List[IngestionPlan] = []

-    def
-        self.
+    def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+        self.ingestion_plans.append(ingestion_plan)

     def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
-        total_dataset_count = 0
-
         # First collect all selectors, before discovering datasets
         selectors = {}
-        for
+        for ingestion_plan in self.ingestion_plans:
             if provider is not None:
-                if
+                if ingestion_plan.source.provider != provider:
                     logger.info(
-                        f"Skipping {
+                        f"Skipping {ingestion_plan} because provider doesn't match '{provider}'"
                     )
                     continue

             static_selectors = [
                 selector
-                for selector in
+                for selector in ingestion_plan.selectors
                 if not selector.is_dynamic
             ]
             dynamic_selectors = [
-                selector for selector in
+                selector for selector in ingestion_plan.selectors if selector.is_dynamic
             ]

             no_selectors = len(static_selectors) == 1 and not bool(static_selectors[0])
             if dynamic_selectors or no_selectors:
-                if hasattr(
+                if hasattr(ingestion_plan.source, "discover_selectors"):
                     logger.debug(
-                        f"Discovering selectors from {
+                        f"Discovering selectors from {ingestion_plan.source.__class__.__name__}"
                     )

                     # TODO: consider making this lazy and fetch once per Source instead of
-                    # once per
-                    all_selectors =
-
+                    # once per IngestionPlan
+                    all_selectors = ingestion_plan.source.discover_selectors(
+                        ingestion_plan.dataset_type
                     )
                     if no_selectors:
                         # When there were no selectors specified, just use all of them
                         extra_static_selectors = [
                             Selector.build(
                                 job_selector,
-                                data_spec_versions=
+                                data_spec_versions=ingestion_plan.data_spec_versions,
                             )
                             for job_selector in all_selectors
                         ]
@@ -205,7 +77,7 @@ class Loader:
                         dynamic_job_selectors = [
                             Selector.build(
                                 job_selector,
-                                data_spec_versions=
+                                data_spec_versions=ingestion_plan.data_spec_versions,
                             )
                             for job_selector in all_selectors
                             if dynamic_selector.is_match(job_selector)
@@ -216,7 +88,7 @@ class Loader:
                     static_selectors.extend(extra_static_selectors)

                     logger.info(
-                        f"Discovered {len(extra_static_selectors)} selectors from {
+                        f"Discovered {len(extra_static_selectors)} selectors from {ingestion_plan.source.__class__.__name__}"
                     )
                 else:
                     if not no_selectors:
@@ -224,112 +96,70 @@ class Loader:
                         # later on
                         raise ConfigurationError(
                             f"Dynamic selectors cannot be used for "
-                            f"{
+                            f"{ingestion_plan.source.__class__.__name__} because it doesn't support"
                             f" selector discovery"
                         )

             # Merge selectors when source, dataset_type and actual selector is the same. This makes
             # sure there will be only 1 dataset for this combination
             for selector in static_selectors:
-                key = (
+                key = (
+                    ingestion_plan.source.name,
+                    ingestion_plan.dataset_type,
+                    selector.key,
+                )
                 if existing_selector := selectors.get(key):
                     existing_selector[1].data_spec_versions.merge(
                         selector.data_spec_versions
                     )
                 else:
-                    selectors[key] = (
-
-
-
-
-
-
+                    selectors[key] = (ingestion_plan, selector)
+
+        """
+        Data is denormalized:
+
+        It actually looks like:
+        - IngestionPlan #1
+          - Selector 1.1
+          - Selector 1.2
+          - Selector 1.3
+        - IngestionPlan #2
+          - Selector 2.1
+          - Selector 2.2
+
+        We process this as:
+        - IngestionPlan #1, Selector 1.1
+        - IngestionPlan #1, Selector 1.2
+        - IngestionPlan #1, Selector 1.3
+        - IngestionPlan #2, Selector 2.1
+        - IngestionPlan #2, Selector 2.2
+
+        IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
+        """
+        for ingestion_plan, selector in selectors.values():
             logger.debug(
-                f"Discovering datasets from {
+                f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
             )

-
-
-
+            ingestion_job = IngestionJob(
+                ingestion_job_id=str(uuid.uuid1()),
+                ingestion_plan=ingestion_plan,
                 selector=selector,
-                metadata_only=True,
-            ).metadata
-
-            # There are two different, but similar flows here:
-            # 1. The discover_datasets returns a list, and the entire list can be processed at once
-            # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
-            datasets = extract_job.source.find_datasets(
-                dataset_type=extract_job.dataset_type,
-                data_spec_versions=selector.data_spec_versions,
-                dataset_collection_metadata=dataset_collection_metadata,
-                **selector.custom_attributes,
             )

-
-
-
-                dataset_identifiers = [
-                    Identifier.create_from_selector(
-                        selector, **dataset_resource.dataset_resource_id
-                    )
-                    # We have to pass the data_spec_versions here as a Source can add some
-                    # extra data to the identifier which is retrieved in a certain data format
-                    for dataset_resource in batch
-                ]
-
-                # Load all available datasets based on the discovered dataset identifiers
-                dataset_collection = self.store.get_dataset_collection(
-                    dataset_type=extract_job.dataset_type,
-                    # Assume all DatasetResources share the same provider
-                    provider=batch[0].provider,
-                    selector=dataset_identifiers,
+            with TaskExecutor(dry_run=dry_run) as task_executor:
+                ingestion_job_summary = ingestion_job.execute(
+                    self.store, task_executor=task_executor
                 )

-
-
-
-
-
-
-                        selector, **dataset_resource.dataset_resource_id
-                    )
-
-                if dataset := dataset_collection.get(dataset_identifier):
-                    if extract_job.fetch_policy.should_refetch(
-                        dataset, dataset_resource
-                    ):
-                        task_set.add(
-                            UpdateDatasetTask(
-                                dataset=dataset,  # Current dataset from the database
-                                dataset_resource=dataset_resource,  # Most recent dataset_resource
-                                store=self.store,
-                            )
-                        )
-                    else:
-                        skip_count += 1
-                else:
-                    if extract_job.fetch_policy.should_fetch(dataset_resource):
-                        task_set.add(
-                            CreateDatasetTask(
-                                dataset_resource=dataset_resource,
-                                store=self.store,
-                            )
-                        )
-                    else:
-                        skip_count += 1
+            # TODO: handle task_summaries
+            # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
+            # next run to determine where to resume.
+            # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
+            # extra information to determine how/where to resume
+            ingestion_job_summary.set_finished()

-
-
-                    f"Discovered {len(dataset_identifiers)} datasets from {extract_job.source.__class__.__name__} "
-                    f"using selector {selector} => {len(task_set)} tasks. {skip_count} skipped."
-                )
-                logger.info(f"Running {len(task_set)} tasks")
-                with TaskExecutor(dry_run=dry_run) as task_executor:
-                    task_executor.run(run_task, task_set)
-            else:
-                logger.info(
-                    f"Discovered {len(dataset_identifiers)} datasets from {extract_job.source.__class__.__name__} "
-                    f"using selector {selector} => nothing to do"
-                )
+            ingestion_job_summary.output_report()
+            self.store.save_ingestion_job_summary(ingestion_job_summary)

         logger.info("Done")

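Condensed, the new collect_and_run flow boils down to one IngestionJob per (IngestionPlan, Selector) pair, executed inside a TaskExecutor and summarized afterwards. A simplified sketch of that loop, mirroring the hunk above (selector discovery, provider filtering and logging omitted):

    import uuid

    from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
    from ingestify.utils import TaskExecutor

    def run_all(store, selectors, dry_run=False):
        # selectors: {key: (ingestion_plan, selector)} as built earlier in collect_and_run
        for ingestion_plan, selector in selectors.values():
            ingestion_job = IngestionJob(
                ingestion_job_id=str(uuid.uuid1()),
                ingestion_plan=ingestion_plan,
                selector=selector,
            )
            with TaskExecutor(dry_run=dry_run) as task_executor:
                summary = ingestion_job.execute(store, task_executor=task_executor)
            summary.set_finished()
            summary.output_report()
            store.save_ingestion_job_summary(summary)
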
ingestify/domain/models/__init__.py
CHANGED

@@ -11,10 +11,8 @@ from .dataset import (
     LoadedFile,
     Selector,
     Revision,
-    dataset_repository_factory,
-    file_repository_factory,
 )
-from .sink import Sink
+from .sink import Sink
 from .source import Source
 from .task import Task, TaskSet
 from .data_spec_version_collection import DataSpecVersionCollection
@@ -35,11 +33,8 @@ __all__ = [
     "FileRepository",
     "FileCollection",
     "DatasetRepository",
-    "dataset_repository_factory",
-    "file_repository_factory",
     "TaskSet",
     "Task",
     "Sink",
-    "sink_factory",
     "DataSpecVersionCollection",
 ]

ingestify/domain/models/base.py
ADDED

@@ -0,0 +1,22 @@
+from functools import partial
+from typing import ClassVar, Any, Optional
+
+import pydantic
+from pydantic import BaseModel as PydanticBaseModel, ConfigDict
+
+
+# class BaseModel(PydanticBaseModel):
+#     model_config = ConfigDict(arbitrary_types_allowed=True)
+#
+#     _sa_instance_state: Optional[dict] = None
+from sqlalchemy.orm import MappedAsDataclass
+
+
+class BaseModel(
+    MappedAsDataclass,
+    # DeclarativeBase,
+    dataclass_callable=partial(
+        pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
+    ),
+):
+    pass

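The new base.py builds BaseModel on SQLAlchemy's MappedAsDataclass but swaps the stdlib dataclass machinery for pydantic.dataclasses.dataclass, so subclasses get a generated constructor with pydantic type validation while staying compatible with the imperative mappings added in sqlalchemy/mapping.py. A hedged sketch of how a subclass might behave; ExampleTiming is hypothetical and not part of the package:

    from datetime import datetime
    from ingestify.domain.models.base import BaseModel

    class ExampleTiming(BaseModel):
        name: str
        started_at: datetime
        ended_at: datetime

    # pydantic validates/coerces field types at construction time.
    timing = ExampleTiming(name="fetch", started_at=datetime.now(), ended_at=datetime.now())
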
ingestify/domain/models/data_spec_version_collection.py
CHANGED

@@ -16,6 +16,12 @@ class DataSpecVersionCollection(dict):

         return cls(items_)

+    def to_dict(self):
+        return {
+            data_feed_key: list(data_spec_versions)
+            for data_feed_key, data_spec_versions in self.items()
+        }
+
     def copy(self):
         return DataSpecVersionCollection(copy.deepcopy(self))

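The new to_dict flattens each set of spec versions into a plain list, which makes the collection easy to serialize (for example when persisting an IngestionJobSummary). A small illustration, assuming the collection maps a data_feed_key to a set of version strings as the comprehension above implies:

    from ingestify.domain.models.data_spec_version_collection import (
        DataSpecVersionCollection,
    )

    # DataSpecVersionCollection is a dict subclass: data_feed_key -> set of versions.
    versions = DataSpecVersionCollection({"events": {"v4"}, "lineups": {"v2", "v4"}})
    print(versions.to_dict())  # e.g. {"events": ["v4"], "lineups": ["v2", "v4"]}
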
ingestify/domain/models/dataset/__init__.py
CHANGED

@@ -1,8 +1,8 @@
+from .file import DraftFile, File, LoadedFile
 from .collection import DatasetCollection
 from .dataset import Dataset
-from .dataset_repository import DatasetRepository
-from .
-from .file_repository import FileRepository, file_repository_factory
+from .dataset_repository import DatasetRepository
+from .file_repository import FileRepository
 from .file_collection import FileCollection
 from .identifier import Identifier
 from .selector import Selector
@@ -16,12 +16,10 @@ __all__ = [
     "Identifier",
     "DatasetCollection",
     "DatasetCreated",
-    "dataset_repository_factory",
     "File",
     "DraftFile",
     "LoadedFile",
     "DatasetRepository",
     "FileRepository",
-    "file_repository_factory",
     "FileCollection",
 ]