ingestify 0.1.3__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.1.3 → ingestify-0.3.0}/PKG-INFO +1 -1
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/__init__.py +1 -1
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/application/dataset_store.py +47 -36
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/application/ingestion_engine.py +3 -3
- ingestify-0.3.0/ingestify/application/loader.py +165 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/__init__.py +1 -6
- ingestify-0.3.0/ingestify/domain/models/base.py +22 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/data_spec_version_collection.py +6 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/__init__.py +3 -5
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/dataset.py +15 -32
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/dataset_repository.py +1 -15
- ingestify-0.3.0/ingestify/domain/models/dataset/dataset_state.py +11 -0
- ingestify-0.3.0/ingestify/domain/models/dataset/events.py +21 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/file.py +21 -34
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/file_collection.py +3 -1
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/file_repository.py +29 -28
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/revision.py +26 -3
- ingestify-0.3.0/ingestify/domain/models/event/domain_event.py +14 -0
- ingestify-0.3.0/ingestify/domain/models/ingestion/ingestion_job.py +325 -0
- ingestify-0.3.0/ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
- ingestify-0.1.3/ingestify/domain/models/extract_job.py → ingestify-0.3.0/ingestify/domain/models/ingestion/ingestion_plan.py +4 -4
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/resources/dataset_resource.py +29 -37
- ingestify-0.3.0/ingestify/domain/models/sink.py +9 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/task/task.py +3 -1
- ingestify-0.3.0/ingestify/domain/models/task/task_summary.py +118 -0
- ingestify-0.3.0/ingestify/domain/models/timing.py +16 -0
- ingestify-0.3.0/ingestify/domain/services/identifier_key_transformer.py +111 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/fetch/http.py +5 -0
- ingestify-0.3.0/ingestify/infra/source/statsbomb_github.py +105 -0
- ingestify-0.3.0/ingestify/infra/store/dataset/sqlalchemy/mapping.py +336 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/file/local_file_repository.py +3 -5
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/file/s3_file_repository.py +4 -9
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/main.py +64 -25
- ingestify-0.3.0/ingestify/static/templates/statsbomb_github/README.md +0 -0
- ingestify-0.3.0/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/utils.py +15 -78
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/PKG-INFO +1 -1
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/SOURCES.txt +9 -2
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/requires.txt +1 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/setup.py +1 -0
- ingestify-0.1.3/ingestify/application/loader.py +0 -335
- ingestify-0.1.3/ingestify/domain/models/dataset/events.py +0 -31
- ingestify-0.1.3/ingestify/domain/models/event/domain_event.py +0 -10
- ingestify-0.1.3/ingestify/domain/models/sink.py +0 -16
- ingestify-0.1.3/ingestify/infra/source/statsbomb_github.py +0 -92
- ingestify-0.1.3/ingestify/infra/store/dataset/__init__.py +0 -2
- ingestify-0.1.3/ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
- ingestify-0.1.3/ingestify/infra/store/dataset/sqlalchemy/mapping.py +0 -153
- {ingestify-0.1.3 → ingestify-0.3.0}/README.md +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/application/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/cmdline.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.1.3/ingestify/domain/services → ingestify-0.3.0/ingestify/domain/models/ingestion}/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.1.3/ingestify/domain/services/transformers → ingestify-0.3.0/ingestify/domain/services}/__init__.py +0 -0
- {ingestify-0.1.3/ingestify/infra/fetch → ingestify-0.3.0/ingestify/domain/services/transformers}/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/exceptions.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.1.3/ingestify/infra/sink → ingestify-0.3.0/ingestify/infra/fetch}/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.1.3/ingestify/infra/source → ingestify-0.3.0/ingestify/infra/sink}/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/sink/postgresql.py +0 -0
- /ingestify-0.1.3/ingestify/static/templates/statsbomb_github/README.md → /ingestify-0.3.0/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/__init__.py +0 -0
- /ingestify-0.1.3/ingestify/static/templates/wyscout/README.md → /ingestify-0.3.0/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/server.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/source_base.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.1.3 → ingestify-0.3.0}/setup.cfg +0 -0
--- ingestify-0.1.3/ingestify/application/dataset_store.py
+++ ingestify-0.3.0/ingestify/application/dataset_store.py
@@ -5,13 +5,14 @@ import mimetypes
 import os
 import shutil
 from dataclasses import asdict
-from io import BytesIO
+from io import BytesIO

-from typing import Dict, List, Optional, Union, Callable, BinaryIO
+from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable

 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
 from ingestify.domain.models.dataset.file_collection import FileCollection
+from ingestify.domain.models.dataset.revision import RevisionSource
 from ingestify.domain.models.event import EventBus
 from ingestify.domain.models import (
     Dataset,
@@ -27,7 +28,7 @@ from ingestify.domain.models import (
     Revision,
     DatasetCreated,
 )
-from ingestify.utils import utcnow
+from ingestify.utils import utcnow


 logger = logging.getLogger(__name__)
@@ -56,11 +57,16 @@ class DatasetStore:
         if self.event_bus:
             self.event_bus.dispatch(event)

+    def save_ingestion_job_summary(self, ingestion_job_summary):
+        self.dataset_repository.session.add(ingestion_job_summary)
+        self.dataset_repository.session.commit()
+
     def get_dataset_collection(
         self,
         dataset_type: Optional[str] = None,
         provider: Optional[str] = None,
         dataset_id: Optional[str] = None,
+        metadata_only: Optional[bool] = False,
         **selector,
     ) -> DatasetCollection:
         if "selector" in selector:
@@ -81,6 +87,7 @@ class DatasetStore:
             dataset_type=dataset_type,
             dataset_id=dataset_id,
             provider=provider,
+            metadata_only=metadata_only,
             selector=selector,
         )
         return dataset_collection
@@ -107,7 +114,9 @@ class DatasetStore:

         return stream, storage_size, suffix

-    def _prepare_read_stream(
+    def _prepare_read_stream(
+        self,
+    ) -> tuple[Callable[[BinaryIO], Awaitable[BytesIO]], str]:
         if self.storage_compression_method == "gzip":

             def reader(fh: BinaryIO) -> BytesIO:
@@ -168,7 +177,11 @@ class DatasetStore:
         return modified_files_

     def add_revision(
-        self,
+        self,
+        dataset: Dataset,
+        files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
+        description: str = "Update",
     ):
         """
         Create new revision first, so FileRepository can use
@@ -182,46 +195,53 @@ class DatasetStore:
         # It can happen an API tells us data is changed, but it was not changed. In this case
         # we decide to ignore it.
         # Make sure there are files changed before creating a new revision
-
-
-
-
-
-
-            )
+            revision = Revision(
+                revision_id=revision_id,
+                created_at=created_at,
+                description=description,
+                modified_files=persisted_files_,
+                source=revision_source,
             )

+            dataset.add_revision(revision)
+
             self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
             self.dispatch(RevisionAdded(dataset=dataset))
             logger.info(
                 f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
             )
-            return True
         else:
             logger.info(
                 f"Ignoring a new revision without changed files -> {dataset.identifier}"
             )
-
+            revision = None
+
+        return revision

     def update_dataset(
         self,
         dataset: Dataset,
-
+        name: str,
+        state: DatasetState,
+        metadata: dict,
         files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
     ):
         """The add_revision will also save the dataset."""
         metadata_changed = False
-        if dataset.
+        if dataset.update_metadata(name, metadata, state):
             self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
             metadata_changed = True

-        self.add_revision(dataset, files)
+        revision = self.add_revision(dataset, files, revision_source)

         if metadata_changed:
             # Dispatch after revision added. Otherwise, the downstream handlers are not able to see
             # the new revision
             self.dispatch(MetadataUpdated(dataset=dataset))

+        return revision
+
     def destroy_dataset(self, dataset: Dataset):
         # TODO: remove files. Now we leave some orphaned files around
         self.dataset_repository.destroy(dataset)
@@ -235,6 +255,7 @@ class DatasetStore:
         state: DatasetState,
         metadata: dict,
         files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
         description: str = "Create",
     ):
         now = utcnow()
@@ -251,9 +272,10 @@ class DatasetStore:
             created_at=now,
             updated_at=now,
         )
-        self.add_revision(dataset, files, description)
+        revision = self.add_revision(dataset, files, revision_source, description)

         self.dispatch(DatasetCreated(dataset=dataset))
+        return revision

     def load_files(
         self,
@@ -271,20 +293,9 @@ class DatasetStore:
                 continue

             def get_stream(file_):
-                revision_id = file_.revision_id
-                if revision_id is None:
-                    revision_id = current_revision.revision_id
-
                 return reader(
                     self.file_repository.load_content(
-                        bucket=self.bucket,
-                        dataset=dataset,
-                        # When file.revision_id is set we must use it.
-                        revision_id=revision_id,
-                        filename=file_.file_id
-                        + "."
-                        + file_.data_serialization_format
-                        + suffix,
+                        bucket=self.bucket, storage_path=file_.storage_path
                     )
                 )

@@ -302,8 +313,8 @@ class DatasetStore:

         try:
             return statsbomb.load(
-                event_data=files.get_file("events").stream,
-                lineup_data=files.get_file("lineups").stream,
+                event_data=(files.get_file("events")).stream,
+                lineup_data=(files.get_file("lineups")).stream,
                 **kwargs,
             )
         except Exception as e:
@@ -333,7 +344,7 @@ class DatasetStore:
         #     filename=filename,
         # )

-    def map(
-
-    ):
-
+    # def map(
+    #     self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
+    # ):
+    #     return map_in_pool(fn, dataset_collection, processes)
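Net effect on the store API: `add_revision`, `update_dataset` and `add_dataset` now take a `RevisionSource` and return the created `Revision` (or `None` when no files actually changed) instead of a boolean. A minimal call-shape sketch, assuming `store`, `dataset`, `files` and `revision_source` already exist; only the signature and the None-on-no-change behaviour come from the hunks above:

```python
# Illustrative sketch, not taken verbatim from ingestify; the objects on the
# right-hand side are assumed to be constructed elsewhere.
revision = store.add_revision(dataset, files, revision_source)  # -> Revision | None
if revision is None:
    print("source reported a change, but no files differed; nothing was persisted")
else:
    print(f"persisted revision {revision.revision_id} with {len(revision.modified_files)} file(s)")
```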
--- ingestify-0.1.3/ingestify/application/ingestion_engine.py
+++ ingestify-0.3.0/ingestify/application/ingestion_engine.py
@@ -5,7 +5,7 @@ from typing import Optional, List

 from .loader import Loader
 from .dataset_store import DatasetStore
-from
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan

 logger = logging.getLogger(__name__)

@@ -18,8 +18,8 @@ class IngestionEngine:
         self.store = store
         self.loader = Loader(self.store)

-    def
-        self.loader.
+    def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+        self.loader.add_ingestion_plan(ingestion_plan)

     def load(self, dry_run: bool = False, provider: Optional[str] = None):
         self.loader.collect_and_run(dry_run=dry_run, provider=provider)
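The engine's public surface is now plan-based: the former extract-job method becomes `add_ingestion_plan`. A hedged usage sketch; the constructor argument and the way an `IngestionPlan` is built are assumptions, only `add_ingestion_plan` and `load` match the diff above:

```python
# Sketch only: IngestionEngine/IngestionPlan construction is not shown in this hunk.
engine = IngestionEngine(store=store)       # assumed signature; store is a DatasetStore
engine.add_ingestion_plan(ingestion_plan)   # ingestion_plan: IngestionPlan, built elsewhere
engine.load(dry_run=True, provider="statsbomb")  # provider value is hypothetical
```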
--- /dev/null
+++ ingestify-0.3.0/ingestify/application/loader.py
@@ -0,0 +1,165 @@
+import logging
+import platform
+import uuid
+from multiprocessing import set_start_method
+from typing import List, Optional
+
+from ingestify.domain.models import Selector
+from ingestify.utils import TaskExecutor
+
+from .dataset_store import DatasetStore
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+from ..domain.models.ingestion.ingestion_job import IngestionJob
+from ..exceptions import ConfigurationError
+
+if platform.system() == "Darwin":
+    set_start_method("fork", force=True)
+else:
+    set_start_method("spawn", force=True)
+
+
+logger = logging.getLogger(__name__)
+
+
+class Loader:
+    def __init__(self, store: DatasetStore):
+        self.store = store
+        self.ingestion_plans: List[IngestionPlan] = []
+
+    def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+        self.ingestion_plans.append(ingestion_plan)
+
+    def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
+        # First collect all selectors, before discovering datasets
+        selectors = {}
+        for ingestion_plan in self.ingestion_plans:
+            logger.info(f"Determining selectors for {ingestion_plan}")
+
+            if provider is not None:
+                if ingestion_plan.source.provider != provider:
+                    logger.info(
+                        f"Skipping {ingestion_plan} because provider doesn't match '{provider}'"
+                    )
+                    continue
+
+            static_selectors = [
+                selector
+                for selector in ingestion_plan.selectors
+                if not selector.is_dynamic
+            ]
+            dynamic_selectors = [
+                selector for selector in ingestion_plan.selectors if selector.is_dynamic
+            ]
+
+            no_selectors = len(static_selectors) == 1 and not bool(static_selectors[0])
+            if dynamic_selectors or no_selectors:
+                if hasattr(ingestion_plan.source, "discover_selectors"):
+                    logger.debug(
+                        f"Discovering selectors from {ingestion_plan.source.__class__.__name__}"
+                    )
+
+                    # TODO: consider making this lazy and fetch once per Source instead of
+                    # once per IngestionPlan
+                    all_selectors = ingestion_plan.source.discover_selectors(
+                        ingestion_plan.dataset_type
+                    )
+                    if no_selectors:
+                        # When there were no selectors specified, just use all of them
+                        extra_static_selectors = [
+                            Selector.build(
+                                job_selector,
+                                data_spec_versions=ingestion_plan.data_spec_versions,
+                            )
+                            for job_selector in all_selectors
+                        ]
+                        static_selectors = []
+                    else:
+                        extra_static_selectors = []
+                        for dynamic_selector in dynamic_selectors:
+                            dynamic_job_selectors = [
+                                Selector.build(
+                                    job_selector,
+                                    data_spec_versions=ingestion_plan.data_spec_versions,
+                                )
+                                for job_selector in all_selectors
+                                if dynamic_selector.is_match(job_selector)
+                            ]
+                            extra_static_selectors.extend(dynamic_job_selectors)
+                            logger.info(f"Added {len(dynamic_job_selectors)} selectors")
+
+                    static_selectors.extend(extra_static_selectors)
+
+                    logger.info(
+                        f"Discovered {len(extra_static_selectors)} selectors from {ingestion_plan.source.__class__.__name__}"
+                    )
+                else:
+                    if not no_selectors:
+                        # When there are no selectors and no discover_selectors, just pass it through. It might break
+                        # later on
+                        raise ConfigurationError(
+                            f"Dynamic selectors cannot be used for "
+                            f"{ingestion_plan.source.__class__.__name__} because it doesn't support"
+                            f" selector discovery"
+                        )
+
+            # Merge selectors when source, dataset_type and actual selector is the same. This makes
+            # sure there will be only 1 dataset for this combination
+            for selector in static_selectors:
+                key = (
+                    ingestion_plan.source.name,
+                    ingestion_plan.dataset_type,
+                    selector.key,
+                )
+                if existing_selector := selectors.get(key):
+                    existing_selector[1].data_spec_versions.merge(
+                        selector.data_spec_versions
+                    )
+                else:
+                    selectors[key] = (ingestion_plan, selector)
+
+        """
+        Data is denormalized:
+
+        It actually looks like:
+        - IngestionPlan #1
+            - Selector 1.1
+            - Selector 1.2
+            - Selector 1.3
+        - IngestionPlan #2
+            - Selector 2.1
+            - Selector 2.2
+
+        We process this as:
+        - IngestionPlan #1, Selector 1.1
+        - IngestionPlan #1, Selector 1.2
+        - IngestionPlan #1, Selector 1.3
+        - IngestionPlan #2, Selector 2.1
+        - IngestionPlan #2, Selector 2.2
+
+        IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
+        """
+        for ingestion_plan, selector in selectors.values():
+            logger.info(
+                f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
+            )
+
+            ingestion_job = IngestionJob(
+                ingestion_job_id=str(uuid.uuid1()),
+                ingestion_plan=ingestion_plan,
+                selector=selector,
+            )
+
+            with TaskExecutor(dry_run=dry_run) as task_executor:
+                for ingestion_job_summary in ingestion_job.execute(
+                    self.store, task_executor=task_executor
+                ):
+                    # TODO: handle task_summaries
+                    # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
+                    # next run to determine where to resume.
+                    # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
+                    # extra information to determine how/where to resume
+                    ingestion_job_summary.output_report()
+                    logger.info(f"Storing IngestionJobSummary")
+                    self.store.save_ingestion_job_summary(ingestion_job_summary)
+
+        logger.info("Done")
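The merge step in `collect_and_run` keys selectors by (source name, dataset type, selector key) and unions their data spec versions so a given combination yields a single dataset. A simplified stand-in using plain dicts and invented values (not ingestify's API) to show the reduction:

```python
# Plain-dict stand-in for the selector merge above; providers, keys and versions are invented.
plans = [
    ("statsbomb", "match", (("competition_id", 11), ("season_id", 90)), {"events": {"v4"}}),
    ("statsbomb", "match", (("competition_id", 11), ("season_id", 90)), {"events": {"v8"}}),
    ("wyscout", "match", (("competition_id", 7),), {"events": {"v3"}}),
]

merged = {}
for source_name, dataset_type, selector_key, versions in plans:
    key = (source_name, dataset_type, selector_key)
    if key in merged:
        # Same source/dataset_type/selector: union the requested spec versions,
        # mirroring data_spec_versions.merge() in the loader.
        for feed, specs in versions.items():
            merged[key].setdefault(feed, set()).update(specs)
    else:
        merged[key] = {feed: set(specs) for feed, specs in versions.items()}

# The two statsbomb plans collapse into one entry requesting both v4 and v8 events.
assert len(merged) == 2
```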
--- ingestify-0.1.3/ingestify/domain/models/__init__.py
+++ ingestify-0.3.0/ingestify/domain/models/__init__.py
@@ -11,10 +11,8 @@ from .dataset import (
     LoadedFile,
     Selector,
     Revision,
-    dataset_repository_factory,
-    file_repository_factory,
 )
-from .sink import Sink
+from .sink import Sink
 from .source import Source
 from .task import Task, TaskSet
 from .data_spec_version_collection import DataSpecVersionCollection
@@ -35,11 +33,8 @@ __all__ = [
     "FileRepository",
     "FileCollection",
     "DatasetRepository",
-    "dataset_repository_factory",
-    "file_repository_factory",
     "TaskSet",
     "Task",
     "Sink",
-    "sink_factory",
     "DataSpecVersionCollection",
 ]
--- /dev/null
+++ ingestify-0.3.0/ingestify/domain/models/base.py
@@ -0,0 +1,22 @@
+from functools import partial
+from typing import ClassVar, Any, Optional
+
+import pydantic
+from pydantic import BaseModel as PydanticBaseModel, ConfigDict
+
+
+# class BaseModel(PydanticBaseModel):
+#     model_config = ConfigDict(arbitrary_types_allowed=True)
+#
+#     _sa_instance_state: Optional[dict] = None
+from sqlalchemy.orm import MappedAsDataclass
+
+
+class BaseModel(
+    MappedAsDataclass,
+    # DeclarativeBase,
+    dataclass_callable=partial(
+        pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
+    ),
+):
+    pass
--- ingestify-0.1.3/ingestify/domain/models/data_spec_version_collection.py
+++ ingestify-0.3.0/ingestify/domain/models/data_spec_version_collection.py
@@ -16,6 +16,12 @@ class DataSpecVersionCollection(dict):

         return cls(items_)

+    def to_dict(self):
+        return {
+            data_feed_key: list(data_spec_versions)
+            for data_feed_key, data_spec_versions in self.items()
+        }
+
     def copy(self):
         return DataSpecVersionCollection(copy.deepcopy(self))

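A small illustration of the new `to_dict()`; the feed keys and versions are invented, and building the collection straight from a dict relies only on `DataSpecVersionCollection` subclassing `dict`:

```python
# Invented values; to_dict() turns each set of versions into a plain list
# (set order is arbitrary), which makes the collection easy to serialize.
versions = DataSpecVersionCollection({"events": {"v4", "v8"}, "lineups": {"v4"}})
print(versions.to_dict())  # e.g. {"events": ["v4", "v8"], "lineups": ["v4"]}
```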
--- ingestify-0.1.3/ingestify/domain/models/dataset/__init__.py
+++ ingestify-0.3.0/ingestify/domain/models/dataset/__init__.py
@@ -1,8 +1,8 @@
+from .file import DraftFile, File, LoadedFile
 from .collection import DatasetCollection
 from .dataset import Dataset
-from .dataset_repository import DatasetRepository
-from .
-from .file_repository import FileRepository, file_repository_factory
+from .dataset_repository import DatasetRepository
+from .file_repository import FileRepository
 from .file_collection import FileCollection
 from .identifier import Identifier
 from .selector import Selector
@@ -16,12 +16,10 @@ __all__ = [
     "Identifier",
     "DatasetCollection",
     "DatasetCreated",
-    "dataset_repository_factory",
     "File",
     "DraftFile",
     "LoadedFile",
     "DatasetRepository",
     "FileRepository",
-    "file_repository_factory",
     "FileCollection",
 ]
--- ingestify-0.1.3/ingestify/domain/models/dataset/dataset.py
+++ ingestify-0.3.0/ingestify/domain/models/dataset/dataset.py
@@ -1,70 +1,52 @@
-from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
 from typing import List, Optional
+from pydantic import Field

 from ingestify.utils import utcnow
-
+from .dataset_state import DatasetState
 from .file import DraftFile
 from .identifier import Identifier
-from .revision import Revision
-
-
-class DatasetState(Enum):
-    SCHEDULED = "SCHEDULED"
-    PARTIAL = "PARTIAL"
-    COMPLETE = "COMPLETE"
-
-    @property
-    def is_complete(self):
-        return self == DatasetState.COMPLETE
+from .revision import Revision, RevisionSource, SourceType
+from ..base import BaseModel

-    def __str__(self):
-        return self.value

-
-@dataclass
-class Dataset:
+class Dataset(BaseModel):
     bucket: str  # This must be set by the DatasetRepository
-
     dataset_id: str
     name: str
     state: DatasetState
-
     dataset_type: str
     provider: str
-
     identifier: Identifier
     metadata: dict
-
     created_at: datetime
     updated_at: datetime
-
-    revisions: List[Revision] = field(default_factory=list)
+    revisions: List[Revision] = Field(default_factory=list)

     @property
     def is_complete(self):
         return self.state.is_complete

-    def next_revision_id(self):
+    def next_revision_id(self) -> int:
         return len(self.revisions)

     def add_revision(self, revision: Revision):
         self.revisions.append(revision)
         self.updated_at = utcnow()

-    def
+    def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
         changed = False
-        if self.name !=
-            self.name =
+        if self.name != name:
+            self.name = name
             changed = True

-        if self.metadata !=
-            self.metadata =
+        if self.metadata != metadata:
+            self.metadata = metadata
             changed = True

-        if self.state !=
-            self.state =
+        if self.state != state:
+            self.state = state
             changed = True

         if changed:
@@ -101,4 +83,5 @@ class Dataset:
             description="Squashed revision",
             is_squashed=True,
             modified_files=list(files.values()),
+            source=RevisionSource(source_type=SourceType.SQUASHED, source_id=""),
         )
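The new `update_metadata` is the change-detection hook that `DatasetStore.update_dataset` relies on: it returns `True` only when name, metadata or state actually changed, which decides whether the store re-saves the dataset and dispatches `MetadataUpdated`. A sketch with invented values (assuming `DatasetState.COMPLETE` is still defined in the extracted `dataset_state.py`):

```python
# Invented values; only the update_metadata signature and its bool return come from the diff.
changed = dataset.update_metadata(
    name="Example FC vs Sample United",
    metadata={"status": "finished"},
    state=DatasetState.COMPLETE,
)
if changed:
    print("metadata changed; the store will re-save the dataset and dispatch MetadataUpdated")
```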
--- ingestify-0.1.3/ingestify/domain/models/dataset/dataset_repository.py
+++ ingestify-0.3.0/ingestify/domain/models/dataset/dataset_repository.py
@@ -1,16 +1,12 @@
 from abc import ABC, abstractmethod
 from typing import Optional, List, Union

-from ingestify.utils import ComponentFactory, ComponentRegistry
-
 from .collection import DatasetCollection
 from .dataset import Dataset
 from .selector import Selector

-dataset_repository_registry = ComponentRegistry()
-

-class DatasetRepository(ABC
+class DatasetRepository(ABC):
     @abstractmethod
     def get_dataset_collection(
         self,
@@ -34,13 +30,3 @@ class DatasetRepository(ABC, metaclass=dataset_repository_registry.metaclass):
     @abstractmethod
     def next_identity(self):
         pass
-
-    @classmethod
-    @abstractmethod
-    def supports(cls, url: str) -> bool:
-        pass
-
-
-dataset_repository_factory = ComponentFactory.build_factory(
-    DatasetRepository, dataset_repository_registry
-)
--- /dev/null
+++ ingestify-0.3.0/ingestify/domain/models/dataset/events.py
@@ -0,0 +1,21 @@
+from typing import ClassVar
+
+from pydantic import BaseModel
+
+from ingestify.domain.models.event.domain_event import DomainEvent
+from .dataset import Dataset
+
+
+class DatasetCreated(DomainEvent):
+    dataset: Dataset
+    event_type: ClassVar[str] = "dataset_created"
+
+
+class RevisionAdded(DomainEvent):
+    dataset: Dataset
+    event_type: ClassVar[str] = "revision_added"
+
+
+class MetadataUpdated(DomainEvent):
+    dataset: Dataset
+    event_type: ClassVar[str] = "metadata_updated"