ingestify 0.3.3.tar.gz → 0.4.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.3.3 → ingestify-0.4.0}/PKG-INFO +16 -16
- {ingestify-0.3.3 → ingestify-0.4.0}/README.md +15 -15
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/__init__.py +1 -1
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/dataset_store.py +4 -4
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/ingestion_engine.py +7 -2
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/loader.py +14 -1
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/cmdline.py +20 -2
- ingestify-0.4.0/ingestify/domain/models/base.py +5 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/collection_metadata.py +2 -1
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset.py +18 -1
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/file.py +5 -5
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/revision.py +6 -2
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_job.py +69 -45
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +48 -40
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/task/task_summary.py +11 -32
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/exceptions.py +4 -0
- ingestify-0.4.0/ingestify/infra/serialization/__init__.py +22 -0
- ingestify-0.4.0/ingestify/infra/store/dataset/sqlalchemy/repository.py +483 -0
- ingestify-0.3.3/ingestify/infra/store/dataset/sqlalchemy/mapping.py → ingestify-0.4.0/ingestify/infra/store/dataset/sqlalchemy/tables.py +103 -79
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/utils.py +48 -16
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/PKG-INFO +16 -16
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/SOURCES.txt +1 -1
- ingestify-0.3.3/ingestify/domain/models/base.py +0 -22
- ingestify-0.3.3/ingestify/infra/serialization/__init__.py +0 -50
- ingestify-0.3.3/ingestify/infra/store/dataset/sqlalchemy/repository.py +0 -239
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/fetch/http.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/main.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/server.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/source_base.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/README.md +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/setup.cfg +0 -0
- {ingestify-0.3.3 → ingestify-0.4.0}/setup.py +0 -0
{ingestify-0.3.3 → ingestify-0.4.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.3.3
+Version: 0.4.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -215,23 +215,23 @@ dataset_collection = store.get_dataset_collection(
 store.map(
     lambda dataset: (
         store
-
-
-
-
-
-
-            "*",
-            match_id=dataset.
-            competition_id=dataset.
-            season_id=dataset.
-
+
+        # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
+        .load_with_kloppy(dataset)
+
+        # Convert it into a polars dataframe using all columns in the original data and some more additional ones
+        .to_df(
+            "*",
+            match_id=dataset.dataset_resource_id.match_id,
+            competition_id=dataset.dataset_resource_id.competition_id,
+            season_id=dataset.dataset_resource_id.season_id,
+
             engine="polars"
         )
-
-
-
-            f"/tmp/files/blaat/{dataset.
+
+        # Write to parquet format
+        .write_parquet(
+            f"/tmp/files/blaat/{dataset.dataset_resource_id.match_id}.parquet"
         )
     ),
     dataset_collection,
{ingestify-0.3.3 → ingestify-0.4.0}/README.md
@@ -205,23 +205,23 @@ dataset_collection = store.get_dataset_collection(
 store.map(
     lambda dataset: (
         store
-
-
-
-
-
-
-            "*",
-            match_id=dataset.
-            competition_id=dataset.
-            season_id=dataset.
-
+
+        # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
+        .load_with_kloppy(dataset)
+
+        # Convert it into a polars dataframe using all columns in the original data and some more additional ones
+        .to_df(
+            "*",
+            match_id=dataset.dataset_resource_id.match_id,
+            competition_id=dataset.dataset_resource_id.competition_id,
+            season_id=dataset.dataset_resource_id.season_id,
+
             engine="polars"
         )
-
-
-
-            f"/tmp/files/blaat/{dataset.
+
+        # Write to parquet format
+        .write_parquet(
+            f"/tmp/files/blaat/{dataset.dataset_resource_id.match_id}.parquet"
        )
     ),
     dataset_collection,
{ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/dataset_store.py
@@ -58,8 +58,7 @@ class DatasetStore:
         self.event_bus.dispatch(event)
 
     def save_ingestion_job_summary(self, ingestion_job_summary):
-        self.dataset_repository.
-        self.dataset_repository.session.commit()
+        self.dataset_repository.save_ingestion_job_summary(ingestion_job_summary)
 
     def get_dataset_collection(
         self,
@@ -271,6 +270,7 @@ class DatasetStore:
             metadata=metadata,
             created_at=now,
             updated_at=now,
+            last_modified_at=None,  # Not known at this moment
         )
         revision = self.add_revision(dataset, files, revision_source, description)
 
@@ -298,8 +298,8 @@ class DatasetStore:
             )
 
             loaded_file = LoadedFile(
-
-                **
+                stream_=get_stream if lazy else get_stream(file),
+                **file.model_dump(),
             )
             files[file.file_id] = loaded_file
         return FileCollection(files, auto_rewind=auto_rewind)
{ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/ingestion_engine.py
@@ -21,8 +21,13 @@ class IngestionEngine:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.loader.add_ingestion_plan(ingestion_plan)
 
-    def load(
-        self
+    def load(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+    ):
+        self.loader.collect_and_run(dry_run=dry_run, provider=provider, source=source)
 
     def list_datasets(self, as_count: bool = False):
         """Consider moving this to DataStore"""
{ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/loader.py
@@ -29,7 +29,12 @@ class Loader:
     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
         self.ingestion_plans.append(ingestion_plan)
 
-    def collect_and_run(
+    def collect_and_run(
+        self,
+        dry_run: bool = False,
+        provider: Optional[str] = None,
+        source: Optional[str] = None,
+    ):
         # First collect all selectors, before discovering datasets
         selectors = {}
         for ingestion_plan in self.ingestion_plans:
@@ -42,6 +47,13 @@ class Loader:
                 )
                 continue
 
+            if source is not None:
+                if ingestion_plan.source.name != source:
+                    logger.info(
+                        f"Skipping {ingestion_plan} because source doesn't match '{source}'"
+                    )
+                    continue
+
             static_selectors = [
                 selector
                 for selector in ingestion_plan.selectors
@@ -60,6 +72,7 @@ class Loader:
 
             # TODO: consider making this lazy and fetch once per Source instead of
             # once per IngestionPlan
+            # TODO: Log exception when `discover_selectors` fails
             all_selectors = ingestion_plan.source.discover_selectors(
                 ingestion_plan.dataset_type
             )
{ingestify-0.3.3 → ingestify-0.4.0}/ingestify/cmdline.py
@@ -58,7 +58,14 @@ def cli():
     help="bucket",
     type=str,
 )
-@click.option(
+@click.option(
+    "--debug",
+    "debug",
+    required=False,
+    help="Debugging enabled",
+    is_flag=True,
+    type=bool,
+)
 @click.option(
     "--dry-run",
     "dry_run",
@@ -74,11 +81,19 @@ def cli():
     help="Provider - only run tasks for a single provider",
     type=str,
 )
+@click.option(
+    "--source",
+    "source",
+    required=False,
+    help="Source - only run tasks for a single source",
+    type=str,
+)
 def run(
     config_file: str,
     bucket: Optional[str],
     dry_run: Optional[bool],
     provider: Optional[str],
+    source: Optional[str],
     debug: Optional[bool],
 ):
     try:
@@ -90,7 +105,10 @@ def run(
         logger.exception(f"Failed due a configuration error: {e}")
         sys.exit(1)
 
-
+    if debug:
+        logging.getLogger("root").setLevel(logging.DEBUG)
+
+    engine.load(dry_run=dry_run, provider=provider, source=source)
 
     logger.info("Done")
 
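Taken together, the cmdline.py hunks add a --debug flag and a --source filter and hand both straight to engine.load(...). The snippet below is a minimal, self-contained sketch of that wiring pattern; the stub Engine class stands in for ingestify's IngestionEngine, the option names match the diff, and everything else is illustrative rather than ingestify's actual CLI.

import logging
from typing import Optional

import click


class Engine:
    """Stub standing in for ingestify's IngestionEngine; not the real class."""

    def load(self, dry_run: bool = False, provider: Optional[str] = None, source: Optional[str] = None):
        logging.getLogger(__name__).info(
            "load(dry_run=%s, provider=%s, source=%s)", dry_run, provider, source
        )


@click.command()
@click.option("--debug", "debug", required=False, help="Debugging enabled", is_flag=True, type=bool)
@click.option("--dry-run", "dry_run", required=False, is_flag=True, type=bool)
@click.option("--provider", "provider", required=False, type=str)
@click.option("--source", "source", required=False, type=str)
def run(debug: bool, dry_run: bool, provider: Optional[str], source: Optional[str]):
    logging.basicConfig(level=logging.INFO)
    if debug:
        # Same escalation the diff adds to ingestify's run command.
        logging.getLogger("root").setLevel(logging.DEBUG)
    Engine().load(dry_run=dry_run, provider=provider, source=source)


if __name__ == "__main__":
    run()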
{ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/collection_metadata.py
@@ -6,7 +6,8 @@ from typing import Optional
 @dataclass
 class DatasetCollectionMetadata:
     # This can be useful to figure out if a backfill is required
-
+    # TODO - Note: not stored at Dataset level and requires joined query to retrieve
+    # first_modified: Optional[datetime]
 
     # Use the last modified to only retrieve datasets that are changed
     last_modified: Optional[datetime]
{ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset.py
@@ -1,7 +1,7 @@
 from datetime import datetime
 from enum import Enum
 from typing import List, Optional
-from pydantic import Field
+from pydantic import Field, field_validator
 
 from ingestify.utils import utcnow
 from .dataset_state import DatasetState
@@ -22,7 +22,17 @@ class Dataset(BaseModel):
     metadata: dict
     created_at: datetime
     updated_at: datetime
+
     revisions: List[Revision] = Field(default_factory=list)
+    # The last_modified_at is equal to the max modified_at of all files in all revisions
+    last_modified_at: Optional[datetime]
+
+    @field_validator("identifier", mode="before")
+    @classmethod
+    def parse_identifier(cls, value):
+        if not isinstance(value, Identifier):
+            return Identifier(value)
+        return value
 
     @property
     def is_complete(self):
@@ -35,6 +45,13 @@ class Dataset(BaseModel):
         self.revisions.append(revision)
         self.updated_at = utcnow()
 
+        if self.last_modified_at:
+            self.last_modified_at = max(
+                self.last_modified_at, revision.last_modified_at
+            )
+        else:
+            self.last_modified_at = revision.last_modified_at
+
     def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
         changed = False
         if self.name != name:
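The field_validator("identifier", mode="before") added to Dataset above coerces raw values into an Identifier before regular validation runs. Below is a self-contained sketch of that pydantic v2 pattern; the Identifier class here is a toy stand-in, since ingestify's real Identifier is not shown in these hunks.

from pydantic import BaseModel, ConfigDict, field_validator


class Identifier:
    """Toy stand-in for an identifier wrapper, e.g. built from {'match_id': 1234}."""

    def __init__(self, attributes: dict):
        self.attributes = dict(attributes)


class ToyDataset(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    identifier: Identifier

    @field_validator("identifier", mode="before")
    @classmethod
    def parse_identifier(cls, value):
        # Accept a plain dict (e.g. from JSON or a database row) and wrap it.
        if not isinstance(value, Identifier):
            return Identifier(value)
        return value


print(ToyDataset(identifier={"match_id": 1234}).identifier.attributes)  # {'match_id': 1234}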
{ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/file.py
@@ -116,18 +116,18 @@ class LoadedFile(BaseModel):
     data_serialization_format: Optional[str]  # Example: 'json'
     storage_compression_method: Optional[str]  # Example: 'gzip'
     storage_path: Path
-
+    stream_: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
     revision_id: Optional[int] = None  # This can be used when a Revision is squashed
 
     def load_stream(self):
-        if callable(self.
-            self.
+        if callable(self.stream_):
+            self.stream_ = self.stream_(self)
 
     @property
     def stream(self):
-        if callable(self.
+        if callable(self.stream_):
             raise Exception("You should load the stream first using `load_stream`")
-        return self.
+        return self.stream_
 
 
 __all__ = ["File", "DraftFile", "LoadedFile"]
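LoadedFile.stream_ can now hold either an already-open binary stream or a callable that produces one; load_stream() resolves the callable, and the stream property refuses access until that has happened. A self-contained toy version of the same lazy-loading pattern (not ingestify's actual class):

from io import BytesIO
from typing import BinaryIO, Callable, Union


class LazyFile:
    def __init__(self, stream_: Union[BinaryIO, Callable[["LazyFile"], BinaryIO]]):
        self.stream_ = stream_

    def load_stream(self) -> None:
        # Resolve the callable into a real stream; a no-op if it is already a stream.
        if callable(self.stream_):
            self.stream_ = self.stream_(self)

    @property
    def stream(self) -> BinaryIO:
        if callable(self.stream_):
            raise Exception("You should load the stream first using `load_stream`")
        return self.stream_


lazy = LazyFile(stream_=lambda f: BytesIO(b"fetched on demand"))
lazy.load_stream()
print(lazy.stream.read())  # b'fetched on demand'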
{ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/revision.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 from enum import Enum
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 from typing_extensions import TypedDict
 
@@ -32,10 +32,14 @@ class Revision(BaseModel):
     created_at: datetime
     description: str
     modified_files: List[File]
-    source: RevisionSource
+    source: Optional[RevisionSource]
     is_squashed: bool = False
     state: RevisionState = RevisionState.PENDING_VALIDATION
 
+    @property
+    def last_modified_at(self):
+        return max(file.modified_at for file in self.modified_files)
+
     @property
     def modified_files_map(self) -> Dict[str, File]:
         return {file.file_id: file for file in self.modified_files}
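Between Revision.last_modified_at above (the max modified_at over its files) and the Dataset.add_revision change earlier in this diff, the new last_modified_at bubbles up from files to revisions to the dataset. A minimal sketch of that propagation, using plain dataclasses rather than ingestify's pydantic models:

from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import List, Optional


@dataclass
class File:
    modified_at: datetime


@dataclass
class Revision:
    modified_files: List[File]

    @property
    def last_modified_at(self) -> datetime:
        # Max over the files in this revision, as in the diff.
        return max(file.modified_at for file in self.modified_files)


@dataclass
class Dataset:
    revisions: List[Revision] = field(default_factory=list)
    last_modified_at: Optional[datetime] = None

    def add_revision(self, revision: Revision) -> None:
        self.revisions.append(revision)
        if self.last_modified_at:
            self.last_modified_at = max(self.last_modified_at, revision.last_modified_at)
        else:
            self.last_modified_at = revision.last_modified_at


dataset = Dataset()
dataset.add_revision(Revision([File(datetime(2024, 1, 1, tzinfo=timezone.utc))]))
dataset.add_revision(Revision([File(datetime(2024, 3, 1, tzinfo=timezone.utc))]))
print(dataset.last_modified_at)  # 2024-03-01 00:00:00+00:00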
{ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_job.py
@@ -2,6 +2,7 @@ import itertools
 import json
 import logging
 import uuid
+from enum import Enum
 from typing import Optional, Iterator
 
 from ingestify import retrieve_http
@@ -17,6 +18,7 @@ from ingestify.domain.models.resources.dataset_resource import (
     DatasetResource,
 )
 from ingestify.domain.models.task.task_summary import TaskSummary
+from ingestify.exceptions import SaveError
 from ingestify.utils import TaskExecutor, chunker
 
 logger = logging.getLogger(__name__)
@@ -120,21 +122,27 @@ class UpdateDatasetTask(Task):
         with TaskSummary.update(
             self.task_id, dataset_identifier=dataset_identifier
         ) as task_summary:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            files = {
+                file_id: task_summary.record_load_file(
+                    lambda: load_file(file_resource, dataset=self.dataset),
+                    metadata={"file_id": file_id},
+                )
+                for file_id, file_resource in self.dataset_resource.files.items()
+            }
+
+            try:
+                revision = self.store.update_dataset(
+                    dataset=self.dataset,
+                    name=self.dataset_resource.name,
+                    state=self.dataset_resource.state,
+                    metadata=self.dataset_resource.metadata,
+                    files=files,
+                    revision_source=revision_source,
+                )
+                task_summary.set_stats_from_revision(revision)
+            except Exception as e:
+                raise SaveError("Could not update dataset") from e
 
         return task_summary
 
@@ -159,24 +167,28 @@ class CreateDatasetTask(Task):
         )
 
         with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            files = {
+                file_id: task_summary.record_load_file(
+                    lambda: load_file(file_resource, dataset=None),
+                    metadata={"file_id": file_id},
+                )
+                for file_id, file_resource in self.dataset_resource.files.items()
+            }
+            try:
+                revision = self.store.create_dataset(
+                    dataset_type=self.dataset_resource.dataset_type,
+                    provider=self.dataset_resource.provider,
+                    dataset_identifier=dataset_identifier,
+                    name=self.dataset_resource.name,
+                    state=self.dataset_resource.state,
+                    metadata=self.dataset_resource.metadata,
+                    files=files,
+                    revision_source=revision_source,
+                )
 
-
+                task_summary.set_stats_from_revision(revision)
+            except Exception as e:
+                raise SaveError("Could not create dataset") from e
 
         return task_summary
 
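Both tasks now wrap store failures in a SaveError raised from the original exception. SaveError comes from the four lines added to ingestify/exceptions.py, whose definition is not shown in this diff; the sketch below assumes a plain Exception subclass and only illustrates the chaining pattern.

class SaveError(Exception):
    pass


def save(dataset):
    raise IOError("disk full")  # stand-in for a failing store call


try:
    try:
        save({"id": 1})
    except Exception as e:
        # `raise ... from e` keeps the original error as __cause__, so a task
        # summary can still report why the save failed.
        raise SaveError("Could not create dataset") from e
except SaveError as err:
    print(err, "| caused by:", err.__cause__)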
@@ -209,6 +221,7 @@ class IngestionJob:
         with ingestion_job_summary.record_timing("get_dataset_collection"):
             dataset_collection_metadata = store.get_dataset_collection(
                 dataset_type=self.ingestion_plan.dataset_type,
+                provider=self.ingestion_plan.source.provider,
                 data_spec_versions=self.selector.data_spec_versions,
                 selector=self.selector,
                 metadata_only=True,
@@ -218,27 +231,38 @@ class IngestionJob:
         # There are two different, but similar flows here:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
-
-
-
-
-
-
-
-
+        try:
+            with ingestion_job_summary.record_timing("find_datasets"):
+                dataset_resources = self.ingestion_plan.source.find_datasets(
+                    dataset_type=self.ingestion_plan.dataset_type,
+                    data_spec_versions=self.selector.data_spec_versions,
+                    dataset_collection_metadata=dataset_collection_metadata,
+                    **self.selector.custom_attributes,
+                )
 
-
+            # We need to include the to_batches as that will start the generator
+            batches = to_batches(dataset_resources)
+        except Exception as e:
+            logger.exception("Failed to find datasets")
 
-
+            ingestion_job_summary.set_exception(e)
+            yield ingestion_job_summary
+            return
+
+        finish_task_timer = ingestion_job_summary.start_timing("tasks")
 
         while True:
             try:
                 batch = next(batches)
             except StopIteration:
                 break
-            except Exception:
-
-
+            except Exception as e:
+                logger.exception("Failed to fetch next batch")
+
+                finish_task_timer()
+                ingestion_job_summary.set_exception(e)
+                yield ingestion_job_summary
+                return
 
             dataset_identifiers = [
                 Identifier.create_from_selector(
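The comment block above distinguishes sources whose find_datasets returns a complete list from sources that yield batches lazily; to_batches normalizes both so the while True / next(batches) loop can treat them uniformly. Ingestify's real to_batches is not shown in this diff (and, per the comment, it apparently also starts the underlying generator eagerly, which this sketch does not replicate), so the helper below is only an assumed illustration of the list-vs-iterator normalization.

from typing import Iterator, List, TypeVar, Union

T = TypeVar("T")


def to_batches(result: Union[List[T], Iterator[List[T]]]) -> Iterator[List[T]]:
    # Hypothetical normalizer: a plain list becomes one batch, an iterator of
    # batches is passed through unchanged. Not ingestify's real implementation.
    if isinstance(result, list):
        yield result
    else:
        yield from result


def lazy_source() -> Iterator[List[int]]:
    yield [1, 2]
    yield [3]


for batch in to_batches([1, 2, 3]):
    print("eager batch:", batch)
for batch in to_batches(lazy_source()):
    print("lazy batch:", batch)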
{ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py
RENAMED
@@ -1,24 +1,31 @@
 import uuid
 from contextlib import contextmanager
 from datetime import datetime, timedelta
+from enum import Enum
 from typing import Optional, List, TYPE_CHECKING
 from pydantic import Field
 
 from ingestify.domain import Selector, DataSpecVersionCollection
 from ingestify.domain.models.base import BaseModel
-from ingestify.domain.models.task.task_summary import TaskSummary,
+from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
 from ingestify.domain.models.timing import Timing
-from ingestify.utils import utcnow
+from ingestify.utils import utcnow, HasTiming
 
 if TYPE_CHECKING:
     from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
 
 
+class IngestionJobState(str, Enum):
+    RUNNING = "RUNNING"
+    FINISHED = "FINISHED"
+    FAILED = "FAILED"
+
+
 def format_duration(duration: timedelta):
     return f"{duration.total_seconds():.2f}sec"
 
 
-class IngestionJobSummary(BaseModel):
+class IngestionJobSummary(BaseModel, HasTiming):
     ingestion_job_summary_id: str
     ingestion_job_id: str
 
@@ -30,8 +37,8 @@ class IngestionJobSummary(BaseModel):
     selector: Selector
 
     started_at: datetime = Field(default_factory=utcnow)
-
-
+    ended_at: Optional[datetime] = None
+    state: IngestionJobState = IngestionJobState.RUNNING
     task_summaries: List[TaskSummary] = Field(default_factory=list)
 
     skipped_datasets: int = 0
@@ -52,20 +59,6 @@ class IngestionJobSummary(BaseModel):
         )
         return cls(**args)
 
-    @contextmanager
-    def record_timing(self, name: str):
-        start = utcnow()
-        yield
-        self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
-    def start_timing(self, name):
-        start = utcnow()
-
-        def finish():
-            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
-        return finish
-
     def add_task_summaries(self, task_summaries: List[TaskSummary]):
         self.task_summaries.extend(task_summaries)
 
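The record_timing context manager and start_timing helper removed here move behind the HasTiming mixin that this module now imports from ingestify.utils (see the +48/-16 change to utils.py). The real mixin is not shown in this diff, so the sketch below simply mirrors the deleted methods, with a stand-in Timing dataclass and utcnow helper.

from contextlib import contextmanager
from dataclasses import dataclass
from datetime import datetime, timezone
from typing import Callable, List


def utcnow() -> datetime:
    return datetime.now(timezone.utc)


@dataclass
class Timing:
    name: str
    started_at: datetime
    ended_at: datetime


class HasTiming:
    timings: List[Timing]

    @contextmanager
    def record_timing(self, name: str):
        # Time a block and append the result to self.timings, as the removed method did.
        start = utcnow()
        yield
        self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

    def start_timing(self, name: str) -> Callable[[], None]:
        # Return a callback that closes the timing when invoked.
        start = utcnow()

        def finish():
            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

        return finish


class JobSummary(HasTiming):
    def __init__(self):
        self.timings = []


job = JobSummary()
with job.record_timing("find_datasets"):
    pass
finish = job.start_timing("tasks")
finish()
print([t.name for t in job.timings])  # ['find_datasets', 'tasks']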
@@ -75,46 +68,61 @@ class IngestionJobSummary(BaseModel):
     def task_count(self):
         return len(self.task_summaries)
 
-    def
+    def _set_ended(self):
         self.failed_tasks = len(
-            [task for task in self.task_summaries if task.
+            [task for task in self.task_summaries if task.state == TaskState.FAILED]
         )
         self.successful_tasks = len(
-            [task for task in self.task_summaries if task.
+            [task for task in self.task_summaries if task.state == TaskState.FINISHED]
         )
         self.ignored_successful_tasks = len(
             [
                 task
                 for task in self.task_summaries
-                if task.
+                if task.state == TaskState.FINISHED_IGNORED
             ]
         )
-        self.
+        self.ended_at = utcnow()
+
+        # Only keep failed tasks. Rest isn't interesting
+        self.task_summaries = [
+            task for task in self.task_summaries if task.state == TaskState.FAILED
+        ]
+
+    def set_finished(self):
+        self.state = IngestionJobState.FINISHED
+        self._set_ended()
+
+    def set_exception(self, e: Exception):
+        self.state = IngestionJobState.FAILED
+        self._set_ended()
 
     @property
     def duration(self) -> timedelta:
-        return self.
+        return self.ended_at - self.started_at
 
     def output_report(self):
-        print(
-
-
-        print(
-        print(f"
-        print(f"
-        print(f"
-        print(f"
+        print(
+            f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
+        )
+        print("********************************")
+        print(f"* - IngestionPlan:")
+        print(f"* Source: {self.source_name}")
+        print(f"* Provider: {self.provider}")
+        print(f"* DatasetType: {self.dataset_type}")
+        print(f"* - Selector: {self.selector}")
+        print(f"* - Timings: ")
         for timing in self.timings:
-            print(f" - {timing.name}: {format_duration(timing.duration)}")
+            print(f"* - {timing.name}: {format_duration(timing.duration)}")
         print(
-            f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+            f"* - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
         )
 
-        print(f" - Failed tasks: {self.failed_tasks}")
-        print(f" - Successful tasks: {self.successful_tasks}")
-        print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
-        print(f" - Skipped datasets: {self.skipped_datasets}")
-        print("
+        print(f"* - Failed tasks: {self.failed_tasks}")
+        print(f"* - Successful tasks: {self.successful_tasks}")
+        print(f"* - Successful ignored tasks: {self.ignored_successful_tasks}")
+        print(f"* - Skipped datasets: {self.skipped_datasets}")
+        print("********************************")
 
     def __enter__(self):
         return self