ingestify 0.1.2__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.1.2 → ingestify-0.2.0}/PKG-INFO +1 -1
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/__init__.py +1 -1
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/application/dataset_store.py +44 -24
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/application/ingestion_engine.py +3 -3
- ingestify-0.2.0/ingestify/application/loader.py +165 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/cmdline.py +2 -1
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/__init__.py +1 -6
- ingestify-0.2.0/ingestify/domain/models/base.py +22 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/data_spec_version_collection.py +6 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/__init__.py +3 -5
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/dataset.py +15 -32
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/dataset_repository.py +1 -15
- ingestify-0.2.0/ingestify/domain/models/dataset/dataset_state.py +11 -0
- ingestify-0.2.0/ingestify/domain/models/dataset/events.py +21 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/file.py +21 -34
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/file_collection.py +3 -1
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/file_repository.py +1 -10
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/revision.py +26 -3
- ingestify-0.2.0/ingestify/domain/models/event/domain_event.py +14 -0
- ingestify-0.2.0/ingestify/domain/models/ingestion/ingestion_job.py +292 -0
- ingestify-0.2.0/ingestify/domain/models/ingestion/ingestion_job_summary.py +106 -0
- ingestify-0.1.2/ingestify/domain/models/extract_job.py → ingestify-0.2.0/ingestify/domain/models/ingestion/ingestion_plan.py +4 -4
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/resources/dataset_resource.py +29 -37
- ingestify-0.2.0/ingestify/domain/models/sink.py +9 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/task/task.py +3 -1
- ingestify-0.2.0/ingestify/domain/models/task/task_summary.py +118 -0
- ingestify-0.2.0/ingestify/domain/models/timing.py +16 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/fetch/http.py +5 -0
- ingestify-0.2.0/ingestify/infra/source/statsbomb_github.py +105 -0
- ingestify-0.2.0/ingestify/infra/store/dataset/sqlalchemy/mapping.py +333 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -22
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/main.py +42 -22
- ingestify-0.2.0/ingestify/static/templates/statsbomb_github/README.md +0 -0
- ingestify-0.2.0/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/utils.py +25 -78
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/PKG-INFO +1 -1
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/SOURCES.txt +8 -2
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/requires.txt +1 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/setup.py +1 -0
- ingestify-0.1.2/ingestify/application/loader.py +0 -335
- ingestify-0.1.2/ingestify/domain/models/dataset/events.py +0 -31
- ingestify-0.1.2/ingestify/domain/models/event/domain_event.py +0 -10
- ingestify-0.1.2/ingestify/domain/models/sink.py +0 -16
- ingestify-0.1.2/ingestify/infra/source/statsbomb_github.py +0 -92
- ingestify-0.1.2/ingestify/infra/store/dataset/__init__.py +0 -2
- ingestify-0.1.2/ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
- ingestify-0.1.2/ingestify/infra/store/dataset/sqlalchemy/mapping.py +0 -153
- {ingestify-0.1.2 → ingestify-0.2.0}/README.md +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/application/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.1.2/ingestify/domain/services → ingestify-0.2.0/ingestify/domain/models/ingestion}/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.1.2/ingestify/domain/services/transformers → ingestify-0.2.0/ingestify/domain/services}/__init__.py +0 -0
- {ingestify-0.1.2/ingestify/infra/fetch → ingestify-0.2.0/ingestify/domain/services/transformers}/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/exceptions.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.1.2/ingestify/infra/sink → ingestify-0.2.0/ingestify/infra/fetch}/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.1.2/ingestify/infra/source → ingestify-0.2.0/ingestify/infra/sink}/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/sink/postgresql.py +0 -0
- /ingestify-0.1.2/ingestify/static/templates/statsbomb_github/README.md → /ingestify-0.2.0/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/__init__.py +0 -0
- /ingestify-0.1.2/ingestify/static/templates/wyscout/README.md → /ingestify-0.2.0/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/server.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/source_base.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.1.2 → ingestify-0.2.0}/setup.cfg +0 -0

ingestify/application/dataset_store.py
@@ -5,13 +5,14 @@ import mimetypes
 import os
 import shutil
 from dataclasses import asdict
-from io import BytesIO
+from io import BytesIO
 
-from typing import Dict, List, Optional, Union, Callable, BinaryIO
+from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable
 
 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
 from ingestify.domain.models.dataset.file_collection import FileCollection
+from ingestify.domain.models.dataset.revision import RevisionSource
 from ingestify.domain.models.event import EventBus
 from ingestify.domain.models import (
     Dataset,
@@ -27,7 +28,7 @@ from ingestify.domain.models import (
     Revision,
     DatasetCreated,
 )
-from ingestify.utils import utcnow
+from ingestify.utils import utcnow
 
 
 logger = logging.getLogger(__name__)
@@ -56,6 +57,10 @@ class DatasetStore:
         if self.event_bus:
             self.event_bus.dispatch(event)
 
+    def save_ingestion_job_summary(self, ingestion_job_summary):
+        self.dataset_repository.session.add(ingestion_job_summary)
+        self.dataset_repository.session.commit()
+
     def get_dataset_collection(
         self,
         dataset_type: Optional[str] = None,
@@ -107,7 +112,9 @@ class DatasetStore:
 
         return stream, storage_size, suffix
 
-    def _prepare_read_stream(
+    def _prepare_read_stream(
+        self,
+    ) -> tuple[Callable[[BinaryIO], Awaitable[BytesIO]], str]:
         if self.storage_compression_method == "gzip":
 
             def reader(fh: BinaryIO) -> BytesIO:
@@ -168,7 +175,11 @@ class DatasetStore:
         return modified_files_
 
     def add_revision(
-        self,
+        self,
+        dataset: Dataset,
+        files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
+        description: str = "Update",
     ):
         """
         Create new revision first, so FileRepository can use
@@ -182,46 +193,53 @@ class DatasetStore:
         # It can happen an API tells us data is changed, but it was not changed. In this case
         # we decide to ignore it.
         # Make sure there are files changed before creating a new revision
-
-
-
-
-
-
-            )
+            revision = Revision(
+                revision_id=revision_id,
+                created_at=created_at,
+                description=description,
+                modified_files=persisted_files_,
+                source=revision_source,
             )
 
+            dataset.add_revision(revision)
+
             self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
             self.dispatch(RevisionAdded(dataset=dataset))
             logger.info(
                 f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
             )
-            return True
         else:
             logger.info(
                 f"Ignoring a new revision without changed files -> {dataset.identifier}"
             )
-
+            revision = None
+
+        return revision
 
     def update_dataset(
         self,
         dataset: Dataset,
-
+        name: str,
+        state: DatasetState,
+        metadata: dict,
         files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
     ):
         """The add_revision will also save the dataset."""
         metadata_changed = False
-        if dataset.
+        if dataset.update_metadata(name, metadata, state):
             self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
             metadata_changed = True
 
-        self.add_revision(dataset, files)
+        revision = self.add_revision(dataset, files, revision_source)
 
         if metadata_changed:
             # Dispatch after revision added. Otherwise, the downstream handlers are not able to see
             # the new revision
            self.dispatch(MetadataUpdated(dataset=dataset))
 
+        return revision
+
     def destroy_dataset(self, dataset: Dataset):
         # TODO: remove files. Now we leave some orphaned files around
         self.dataset_repository.destroy(dataset)
@@ -235,6 +253,7 @@ class DatasetStore:
         state: DatasetState,
         metadata: dict,
         files: Dict[str, DraftFile],
+        revision_source: RevisionSource,
         description: str = "Create",
     ):
         now = utcnow()
@@ -251,9 +270,10 @@ class DatasetStore:
             created_at=now,
             updated_at=now,
         )
-        self.add_revision(dataset, files, description)
+        revision = self.add_revision(dataset, files, revision_source, description)
 
         self.dispatch(DatasetCreated(dataset=dataset))
+        return revision
 
     def load_files(
         self,
@@ -302,8 +322,8 @@ class DatasetStore:
 
         try:
             return statsbomb.load(
-                event_data=files.get_file("events").stream,
-                lineup_data=files.get_file("lineups").stream,
+                event_data=(files.get_file("events")).stream,
+                lineup_data=(files.get_file("lineups")).stream,
                 **kwargs,
             )
         except Exception as e:
@@ -333,7 +353,7 @@ class DatasetStore:
         #     filename=filename,
         # )
 
-    def map(
-
-    ):
-
+    # def map(
+    #     self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
+    # ):
+    #     return map_in_pool(fn, dataset_collection, processes)
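
The signature changes above ripple into calling code: add_revision and update_dataset now require a RevisionSource and return the created Revision (or None when no files actually changed) instead of a bool. Below is a minimal, hedged sketch of an adapted caller, not code from the package: `store`, `dataset` and `draft_files` are assumed to come from existing ingestify code, SourceType.SQUASHED is reused only because it is the single SourceType member visible in this diff, and DatasetState.COMPLETE comes from the enum that moved into dataset_state.py.

from typing import Dict, Optional

from ingestify.application.dataset_store import DatasetStore
from ingestify.domain.models.dataset.dataset import Dataset, DatasetState
from ingestify.domain.models.dataset.file import DraftFile
from ingestify.domain.models.dataset.revision import Revision, RevisionSource, SourceType


def apply_update(
    store: DatasetStore, dataset: Dataset, draft_files: Dict[str, DraftFile]
) -> Optional[Revision]:
    # A RevisionSource is now mandatory for every revision written through the store.
    revision_source = RevisionSource(source_type=SourceType.SQUASHED, source_id="")
    # update_dataset returns whatever add_revision returns, so a None result signals
    # that the source reported a change but no file content actually differed.
    return store.update_dataset(
        dataset,
        name=dataset.name,
        state=DatasetState.COMPLETE,
        metadata=dataset.metadata,
        files=draft_files,
        revision_source=revision_source,
    )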

ingestify/application/ingestion_engine.py
@@ -5,7 +5,7 @@ from typing import Optional, List
 
 from .loader import Loader
 from .dataset_store import DatasetStore
-from
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
 
 logger = logging.getLogger(__name__)
 
@@ -18,8 +18,8 @@ class IngestionEngine:
         self.store = store
         self.loader = Loader(self.store)
 
-    def
-        self.loader.
+    def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+        self.loader.add_ingestion_plan(ingestion_plan)
 
     def load(self, dry_run: bool = False, provider: Optional[str] = None):
         self.loader.collect_and_run(dry_run=dry_run, provider=provider)
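
At the engine level, the old ExtractJob-style registration is replaced by IngestionPlan (note the rename of extract_job.py to ingestion/ingestion_plan.py in the file list). The sketch below is an assumption-laden illustration rather than ingestify code: how an IngestionPlan is constructed is not visible in this diff, so plans are passed in ready-made.

from typing import List, Optional

from ingestify.application.ingestion_engine import IngestionEngine
from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan


def run(
    engine: IngestionEngine, plans: List[IngestionPlan], provider: Optional[str] = None
) -> None:
    for plan in plans:
        # Replaces the removed add-job style call on the 0.1.2 engine.
        engine.add_ingestion_plan(plan)
    # dry_run and provider are forwarded to Loader.collect_and_run; a non-None provider
    # skips every plan whose source.provider does not match.
    engine.load(dry_run=False, provider=provider)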

ingestify/application/loader.py (new in 0.2.0)
@@ -0,0 +1,165 @@
+import logging
+import platform
+import uuid
+from multiprocessing import set_start_method
+from typing import List, Optional
+
+from ingestify.domain.models import Selector
+from ingestify.utils import TaskExecutor
+
+from .dataset_store import DatasetStore
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+from ..domain.models.ingestion.ingestion_job import IngestionJob
+from ..exceptions import ConfigurationError
+
+if platform.system() == "Darwin":
+    set_start_method("fork", force=True)
+else:
+    set_start_method("spawn", force=True)
+
+
+logger = logging.getLogger(__name__)
+
+
+class Loader:
+    def __init__(self, store: DatasetStore):
+        self.store = store
+        self.ingestion_plans: List[IngestionPlan] = []
+
+    def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+        self.ingestion_plans.append(ingestion_plan)
+
+    def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
+        # First collect all selectors, before discovering datasets
+        selectors = {}
+        for ingestion_plan in self.ingestion_plans:
+            if provider is not None:
+                if ingestion_plan.source.provider != provider:
+                    logger.info(
+                        f"Skipping {ingestion_plan} because provider doesn't match '{provider}'"
+                    )
+                    continue
+
+            static_selectors = [
+                selector
+                for selector in ingestion_plan.selectors
+                if not selector.is_dynamic
+            ]
+            dynamic_selectors = [
+                selector for selector in ingestion_plan.selectors if selector.is_dynamic
+            ]
+
+            no_selectors = len(static_selectors) == 1 and not bool(static_selectors[0])
+            if dynamic_selectors or no_selectors:
+                if hasattr(ingestion_plan.source, "discover_selectors"):
+                    logger.debug(
+                        f"Discovering selectors from {ingestion_plan.source.__class__.__name__}"
+                    )
+
+                    # TODO: consider making this lazy and fetch once per Source instead of
+                    #       once per IngestionPlan
+                    all_selectors = ingestion_plan.source.discover_selectors(
+                        ingestion_plan.dataset_type
+                    )
+                    if no_selectors:
+                        # When there were no selectors specified, just use all of them
+                        extra_static_selectors = [
+                            Selector.build(
+                                job_selector,
+                                data_spec_versions=ingestion_plan.data_spec_versions,
+                            )
+                            for job_selector in all_selectors
+                        ]
+                        static_selectors = []
+                    else:
+                        extra_static_selectors = []
+                        for dynamic_selector in dynamic_selectors:
+                            dynamic_job_selectors = [
+                                Selector.build(
+                                    job_selector,
+                                    data_spec_versions=ingestion_plan.data_spec_versions,
+                                )
+                                for job_selector in all_selectors
+                                if dynamic_selector.is_match(job_selector)
+                            ]
+                            extra_static_selectors.extend(dynamic_job_selectors)
+                            logger.info(f"Added {len(dynamic_job_selectors)} selectors")
+
+                    static_selectors.extend(extra_static_selectors)
+
+                    logger.info(
+                        f"Discovered {len(extra_static_selectors)} selectors from {ingestion_plan.source.__class__.__name__}"
+                    )
+                else:
+                    if not no_selectors:
+                        # When there are no selectors and no discover_selectors, just pass it through. It might break
+                        # later on
+                        raise ConfigurationError(
+                            f"Dynamic selectors cannot be used for "
+                            f"{ingestion_plan.source.__class__.__name__} because it doesn't support"
+                            f" selector discovery"
+                        )
+
+            # Merge selectors when source, dataset_type and actual selector is the same. This makes
+            # sure there will be only 1 dataset for this combination
+            for selector in static_selectors:
+                key = (
+                    ingestion_plan.source.name,
+                    ingestion_plan.dataset_type,
+                    selector.key,
+                )
+                if existing_selector := selectors.get(key):
+                    existing_selector[1].data_spec_versions.merge(
+                        selector.data_spec_versions
+                    )
+                else:
+                    selectors[key] = (ingestion_plan, selector)
+
+        """
+        Data is denormalized:
+
+        It actually looks like:
+        - IngestionPlan #1
+            - Selector 1.1
+            - Selector 1.2
+            - Selector 1.3
+        - IngestionPlan #2
+            - Selector 2.1
+            - Selector 2.2
+
+        We process this as:
+        - IngestionPlan #1, Selector 1.1
+        - IngestionPlan #1, Selector 1.2
+        - IngestionPlan #1, Selector 1.3
+        - IngestionPlan #2, Selector 2.1
+        - IngestionPlan #2, Selector 2.2
+
+        IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
+        """
+        for ingestion_plan, selector in selectors.values():
+            logger.debug(
+                f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
+            )
+
+            ingestion_job = IngestionJob(
+                ingestion_job_id=str(uuid.uuid1()),
+                ingestion_plan=ingestion_plan,
+                selector=selector,
+            )
+
+            with TaskExecutor(dry_run=dry_run) as task_executor:
+                ingestion_job_summary = ingestion_job.execute(
+                    self.store, task_executor=task_executor
+                )
+
+            # TODO: handle task_summaries
+            # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
+            # next run to determine where to resume.
+            # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
+            #         extra information to determine how/where to resume
+            ingestion_job_summary.set_finished()
+
+            ingestion_job_summary.output_report()
+            self.store.save_ingestion_job_summary(ingestion_job_summary)
+
+        logger.info("Done")
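
The merge step in collect_and_run is easy to miss: plan/selector pairs are deduplicated on (source name, dataset type, selector key), and when two plans hit the same key only their data-spec versions are merged. Below is a self-contained illustration of that rule in plain Python; the sample keys and version strings are invented for the example and are not taken from ingestify.

from typing import Dict, Set, Tuple

# (source name, dataset type, selector key) -> {data feed: requested spec versions}
pairs = [
    ("statsbomb_github", "match", "competition_id=11", {"events": {"v4"}}),
    ("statsbomb_github", "match", "competition_id=11", {"events": {"v8"}}),
    ("statsbomb_github", "match", "competition_id=43", {"events": {"v4"}}),
]

merged: Dict[Tuple[str, str, str], Dict[str, Set[str]]] = {}
for source_name, dataset_type, selector_key, versions in pairs:
    key = (source_name, dataset_type, selector_key)
    if existing := merged.get(key):
        # Same logical dataset selected twice: merge the requested spec versions.
        for feed, specs in versions.items():
            existing.setdefault(feed, set()).update(specs)
    else:
        merged[key] = {feed: set(specs) for feed, specs in versions.items()}

print(merged)
# Two entries remain: competition_id=11 carries events {v4, v8}, competition_id=43 carries {v4}.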

ingestify/cmdline.py
@@ -12,6 +12,7 @@ from ingestify.exceptions import ConfigurationError
 from ingestify.main import get_engine
 
 from ingestify import __version__
+from ingestify.utils import try_number
 
 logger = logging.getLogger(__name__)
 #
@@ -174,7 +175,7 @@ def delete_dataset(
     if "=" in dataset_id:
         selector = {
             # TODO: this `int` will might break stuff. Issue here is the int != str
-            _[0]:
+            _[0]: try_number(_[1])
             for _ in [_.split("=") for _ in dataset_id.split("/")]
         }
     else:

ingestify/domain/models/__init__.py
@@ -11,10 +11,8 @@ from .dataset import (
     LoadedFile,
     Selector,
     Revision,
-    dataset_repository_factory,
-    file_repository_factory,
 )
-from .sink import Sink
+from .sink import Sink
 from .source import Source
 from .task import Task, TaskSet
 from .data_spec_version_collection import DataSpecVersionCollection
@@ -35,11 +33,8 @@ __all__ = [
     "FileRepository",
     "FileCollection",
     "DatasetRepository",
-    "dataset_repository_factory",
-    "file_repository_factory",
     "TaskSet",
     "Task",
     "Sink",
-    "sink_factory",
     "DataSpecVersionCollection",
 ]

ingestify/domain/models/base.py (new in 0.2.0)
@@ -0,0 +1,22 @@
+from functools import partial
+from typing import ClassVar, Any, Optional
+
+import pydantic
+from pydantic import BaseModel as PydanticBaseModel, ConfigDict
+
+
+# class BaseModel(PydanticBaseModel):
+#     model_config = ConfigDict(arbitrary_types_allowed=True)
+#
+#     _sa_instance_state: Optional[dict] = None
+from sqlalchemy.orm import MappedAsDataclass
+
+
+class BaseModel(
+    MappedAsDataclass,
+    # DeclarativeBase,
+    dataclass_callable=partial(
+        pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
+    ),
+):
+    pass
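
base.py combines SQLAlchemy 2.0's MappedAsDataclass with a pydantic dataclass as the dataclass_callable, so domain models get pydantic validation while remaining ORM-mappable. The following is a standalone sketch of that same pattern, not ingestify code: it adds DeclarativeBase, which base.py leaves commented out (the ORM mapping is presumably applied elsewhere, see the new sqlalchemy/mapping.py in the file list), and the Widget model and table name are invented for the example.

from functools import partial

import pydantic
from pydantic import ConfigDict
from sqlalchemy.orm import DeclarativeBase, Mapped, MappedAsDataclass, mapped_column


class Base(
    MappedAsDataclass,
    DeclarativeBase,  # added here to make the sketch standalone; base.py comments this out
    dataclass_callable=partial(
        pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
    ),
):
    pass


class Widget(Base):  # hypothetical model, not part of ingestify
    __tablename__ = "widget"

    widget_id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column()


# Construction goes through the pydantic dataclass, so field types are validated,
# while SQLAlchemy still maps the class to the "widget" table.
w = Widget(widget_id=1, name="example")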

ingestify/domain/models/data_spec_version_collection.py
@@ -16,6 +16,12 @@ class DataSpecVersionCollection(dict):
 
         return cls(items_)
 
+    def to_dict(self):
+        return {
+            data_feed_key: list(data_spec_versions)
+            for data_feed_key, data_spec_versions in self.items()
+        }
+
     def copy(self):
         return DataSpecVersionCollection(copy.deepcopy(self))
 

ingestify/domain/models/dataset/__init__.py
@@ -1,8 +1,8 @@
+from .file import DraftFile, File, LoadedFile
 from .collection import DatasetCollection
 from .dataset import Dataset
-from .dataset_repository import DatasetRepository
-from .
-from .file_repository import FileRepository, file_repository_factory
+from .dataset_repository import DatasetRepository
+from .file_repository import FileRepository
 from .file_collection import FileCollection
 from .identifier import Identifier
 from .selector import Selector
@@ -16,12 +16,10 @@ __all__ = [
     "Identifier",
     "DatasetCollection",
     "DatasetCreated",
-    "dataset_repository_factory",
     "File",
     "DraftFile",
     "LoadedFile",
     "DatasetRepository",
     "FileRepository",
-    "file_repository_factory",
     "FileCollection",
 ]

ingestify/domain/models/dataset/dataset.py
@@ -1,70 +1,52 @@
-from dataclasses import dataclass, field
 from datetime import datetime
 from enum import Enum
 from typing import List, Optional
+from pydantic import Field
 
 from ingestify.utils import utcnow
-
+from .dataset_state import DatasetState
 from .file import DraftFile
 from .identifier import Identifier
-from .revision import Revision
-
-
-class DatasetState(Enum):
-    SCHEDULED = "SCHEDULED"
-    PARTIAL = "PARTIAL"
-    COMPLETE = "COMPLETE"
-
-    @property
-    def is_complete(self):
-        return self == DatasetState.COMPLETE
+from .revision import Revision, RevisionSource, SourceType
+from ..base import BaseModel
 
-    def __str__(self):
-        return self.value
 
-
-@dataclass
-class Dataset:
+class Dataset(BaseModel):
     bucket: str  # This must be set by the DatasetRepository
-
     dataset_id: str
     name: str
     state: DatasetState
-
     dataset_type: str
     provider: str
-
     identifier: Identifier
     metadata: dict
-
     created_at: datetime
     updated_at: datetime
-
-    revisions: List[Revision] = field(default_factory=list)
+    revisions: List[Revision] = Field(default_factory=list)
 
     @property
     def is_complete(self):
         return self.state.is_complete
 
-    def next_revision_id(self):
+    def next_revision_id(self) -> int:
         return len(self.revisions)
 
     def add_revision(self, revision: Revision):
         self.revisions.append(revision)
         self.updated_at = utcnow()
 
-    def
+    def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
         changed = False
-        if self.name !=
-            self.name =
+        if self.name != name:
+            self.name = name
            changed = True
 
-        if self.metadata !=
-            self.metadata =
+        if self.metadata != metadata:
+            self.metadata = metadata
            changed = True
 
-        if self.state !=
-            self.state =
+        if self.state != state:
+            self.state = state
            changed = True
 
        if changed:
@@ -101,4 +83,5 @@ class Dataset:
             description="Squashed revision",
             is_squashed=True,
             modified_files=list(files.values()),
+            source=RevisionSource(source_type=SourceType.SQUASHED, source_id=""),
         )

ingestify/domain/models/dataset/dataset_repository.py
@@ -1,16 +1,12 @@
 from abc import ABC, abstractmethod
 from typing import Optional, List, Union
 
-from ingestify.utils import ComponentFactory, ComponentRegistry
-
 from .collection import DatasetCollection
 from .dataset import Dataset
 from .selector import Selector
 
-dataset_repository_registry = ComponentRegistry()
-
 
-class DatasetRepository(ABC
+class DatasetRepository(ABC):
     @abstractmethod
     def get_dataset_collection(
         self,
@@ -34,13 +30,3 @@ class DatasetRepository(ABC, metaclass=dataset_repository_registry.metaclass):
     @abstractmethod
     def next_identity(self):
         pass
-
-    @classmethod
-    @abstractmethod
-    def supports(cls, url: str) -> bool:
-        pass
-
-
-dataset_repository_factory = ComponentFactory.build_factory(
-    DatasetRepository, dataset_repository_registry
-)

ingestify/domain/models/dataset/events.py (new in 0.2.0)
@@ -0,0 +1,21 @@
+from typing import ClassVar
+
+from pydantic import BaseModel
+
+from ingestify.domain.models.event.domain_event import DomainEvent
+from .dataset import Dataset
+
+
+class DatasetCreated(DomainEvent):
+    dataset: Dataset
+    event_type: ClassVar[str] = "dataset_created"
+
+
+class RevisionAdded(DomainEvent):
+    dataset: Dataset
+    event_type: ClassVar[str] = "revision_added"
+
+
+class MetadataUpdated(DomainEvent):
+    dataset: Dataset
+    event_type: ClassVar[str] = "metadata_updated"