ingestify 0.3.2.tar.gz → 0.3.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.3.2 → ingestify-0.3.4}/PKG-INFO +16 -16
- {ingestify-0.3.2 → ingestify-0.3.4}/README.md +15 -15
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/__init__.py +1 -1
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/dataset_store.py +3 -4
- ingestify-0.3.4/ingestify/domain/models/base.py +5 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/dataset.py +8 -1
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/file.py +5 -5
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/revision.py +2 -2
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/ingestion/ingestion_job.py +70 -44
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/ingestion/ingestion_job_summary.py +29 -11
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/task/task_summary.py +8 -8
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/exceptions.py +4 -0
- ingestify-0.3.4/ingestify/infra/serialization/__init__.py +33 -0
- ingestify-0.3.4/ingestify/infra/store/dataset/sqlalchemy/repository.py +484 -0
- ingestify-0.3.2/ingestify/infra/store/dataset/sqlalchemy/mapping.py → ingestify-0.3.4/ingestify/infra/store/dataset/sqlalchemy/tables.py +90 -73
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/file/s3_file_repository.py +5 -1
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/PKG-INFO +16 -16
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/SOURCES.txt +1 -1
- ingestify-0.3.2/ingestify/domain/models/base.py +0 -22
- ingestify-0.3.2/ingestify/infra/serialization/__init__.py +0 -50
- ingestify-0.3.2/ingestify/infra/store/dataset/sqlalchemy/repository.py +0 -239
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/ingestion_engine.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/loader.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/cmdline.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/data_spec_version_collection.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/fetch/http.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/source/wyscout.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/main.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/server.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/source_base.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/statsbomb_github/README.md +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/statsbomb_github/query.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/.env +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/.gitignore +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/README.md +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/database/README.md +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/query.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/utils.py +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/setup.cfg +0 -0
- {ingestify-0.3.2 → ingestify-0.3.4}/setup.py +0 -0
{ingestify-0.3.2 → ingestify-0.3.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.3.2
+Version: 0.3.4
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -215,23 +215,23 @@ dataset_collection = store.get_dataset_collection(
 store.map(
     lambda dataset: (
         store
-
-
-
-
-
-
-            "*",
-            match_id=dataset.
-            competition_id=dataset.
-            season_id=dataset.
-
+
+        # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
+        .load_with_kloppy(dataset)
+
+        # Convert it into a polars dataframe using all columns in the original data and some more additional ones
+        .to_df(
+            "*",
+            match_id=dataset.dataset_resource_id.match_id,
+            competition_id=dataset.dataset_resource_id.competition_id,
+            season_id=dataset.dataset_resource_id.season_id,
+
             engine="polars"
         )
-
-
-
-            f"/tmp/files/blaat/{dataset.
+
+        # Write to parquet format
+        .write_parquet(
+            f"/tmp/files/blaat/{dataset.dataset_resource_id.match_id}.parquet"
         )
     ),
     dataset_collection,
{ingestify-0.3.2 → ingestify-0.3.4}/README.md

@@ -205,23 +205,23 @@ dataset_collection = store.get_dataset_collection(
 store.map(
     lambda dataset: (
         store
-
-
-
-
-
-
-            "*",
-            match_id=dataset.
-            competition_id=dataset.
-            season_id=dataset.
-
+
+        # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
+        .load_with_kloppy(dataset)
+
+        # Convert it into a polars dataframe using all columns in the original data and some more additional ones
+        .to_df(
+            "*",
+            match_id=dataset.dataset_resource_id.match_id,
+            competition_id=dataset.dataset_resource_id.competition_id,
+            season_id=dataset.dataset_resource_id.season_id,
+
             engine="polars"
         )
-
-
-
-            f"/tmp/files/blaat/{dataset.
+
+        # Write to parquet format
+        .write_parquet(
+            f"/tmp/files/blaat/{dataset.dataset_resource_id.match_id}.parquet"
         )
     ),
     dataset_collection,
{ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/dataset_store.py

@@ -58,8 +58,7 @@ class DatasetStore:
         self.event_bus.dispatch(event)
 
     def save_ingestion_job_summary(self, ingestion_job_summary):
-        self.dataset_repository.
-        self.dataset_repository.session.commit()
+        self.dataset_repository.save_ingestion_job_summary(ingestion_job_summary)
 
     def get_dataset_collection(
         self,
@@ -298,8 +297,8 @@ class DatasetStore:
             )
 
             loaded_file = LoadedFile(
-
-                **
+                stream_=get_stream if lazy else get_stream(file),
+                **file.model_dump(),
             )
             files[file.file_id] = loaded_file
         return FileCollection(files, auto_rewind=auto_rewind)
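The store-level change moves transaction handling out of the application layer: `save_ingestion_job_summary` now delegates persistence entirely to the repository instead of calling `session.commit()` itself. Below is a minimal sketch of that delegation pattern; the classes are simplified stand-ins, not ingestify's real implementation (the actual repository lives in the new 484-line `sqlalchemy/repository.py`).

```python
# Hedged sketch: simplified stand-ins for DatasetStore and its repository.
class SqlAlchemyDatasetRepository:
    def __init__(self, session):
        self.session = session

    def save_ingestion_job_summary(self, summary):
        # The repository owns both the write and the commit
        self.session.add(summary)
        self.session.commit()


class DatasetStore:
    def __init__(self, dataset_repository):
        self.dataset_repository = dataset_repository

    def save_ingestion_job_summary(self, ingestion_job_summary):
        # The store no longer touches the SQLAlchemy session directly
        self.dataset_repository.save_ingestion_job_summary(ingestion_job_summary)
```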
{ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/dataset.py

@@ -1,7 +1,7 @@
 from datetime import datetime
 from enum import Enum
 from typing import List, Optional
-from pydantic import Field
+from pydantic import Field, field_validator
 
 from ingestify.utils import utcnow
 from .dataset_state import DatasetState
@@ -24,6 +24,13 @@ class Dataset(BaseModel):
     updated_at: datetime
     revisions: List[Revision] = Field(default_factory=list)
 
+    @field_validator("identifier", mode="before")
+    @classmethod
+    def parse_identifier(cls, value):
+        if not isinstance(value, Identifier):
+            return Identifier(value)
+        return value
+
     @property
     def is_complete(self):
         return self.state.is_complete
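The new `parse_identifier` hook is a standard Pydantic v2 "before" validator: it coerces raw values (for example plain dicts read back from storage) into an `Identifier` before field validation runs. A self-contained sketch of the pattern, using a simplified stand-in `Identifier` rather than ingestify's real class:

```python
from pydantic import BaseModel, ConfigDict, field_validator


class Identifier:
    """Simplified stand-in for ingestify's Identifier (a bag of key attributes)."""

    def __init__(self, attributes):
        self.attributes = dict(attributes)


class Dataset(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    identifier: Identifier

    @field_validator("identifier", mode="before")
    @classmethod
    def parse_identifier(cls, value):
        # Coerce plain dicts into Identifier before the is-instance check runs
        if not isinstance(value, Identifier):
            return Identifier(value)
        return value


dataset = Dataset(identifier={"competition_id": 11, "season_id": 90})
print(type(dataset.identifier).__name__)  # Identifier
```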
{ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/file.py

@@ -116,18 +116,18 @@ class LoadedFile(BaseModel):
     data_serialization_format: Optional[str]  # Example: 'json'
     storage_compression_method: Optional[str]  # Example: 'gzip'
     storage_path: Path
-
+    stream_: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
     revision_id: Optional[int] = None  # This can be used when a Revision is squashed
 
     def load_stream(self):
-        if callable(self.
-            self.
+        if callable(self.stream_):
+            self.stream_ = self.stream_(self)
 
     @property
     def stream(self):
-        if callable(self.
+        if callable(self.stream_):
             raise Exception("You should load the stream first using `load_stream`")
-        return self.
+        return self.stream_
 
 
 __all__ = ["File", "DraftFile", "LoadedFile"]
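The renamed `stream_` field supports lazy loading: it holds either an already-open stream or a callable that produces one on demand, and `load_stream` resolves the callable exactly once. A self-contained sketch of that pattern (a simplified stand-in, not the real `LoadedFile`):

```python
from io import BytesIO
from typing import Callable, Union


class LoadedFile:
    """Simplified stand-in illustrating the lazy stream_ pattern."""

    def __init__(self, stream_: Union[BytesIO, Callable[["LoadedFile"], BytesIO]]):
        self.stream_ = stream_

    def load_stream(self):
        # Resolve the callable once; afterwards stream_ holds a real stream
        if callable(self.stream_):
            self.stream_ = self.stream_(self)

    @property
    def stream(self) -> BytesIO:
        if callable(self.stream_):
            raise Exception("You should load the stream first using `load_stream`")
        return self.stream_


lazy_file = LoadedFile(stream_=lambda file: BytesIO(b"payload"))
lazy_file.load_stream()
print(lazy_file.stream.read())  # b'payload'
```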
{ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/revision.py

@@ -1,6 +1,6 @@
 from datetime import datetime
 from enum import Enum
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 from typing_extensions import TypedDict
 
@@ -32,7 +32,7 @@ class Revision(BaseModel):
     created_at: datetime
     description: str
     modified_files: List[File]
-    source: RevisionSource
+    source: Optional[RevisionSource]
     is_squashed: bool = False
     state: RevisionState = RevisionState.PENDING_VALIDATION
 
{ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/ingestion/ingestion_job.py

@@ -2,6 +2,7 @@ import itertools
 import json
 import logging
 import uuid
+from enum import Enum
 from typing import Optional, Iterator
 
 from ingestify import retrieve_http
@@ -17,6 +18,7 @@ from ingestify.domain.models.resources.dataset_resource import (
     DatasetResource,
 )
 from ingestify.domain.models.task.task_summary import TaskSummary
+from ingestify.exceptions import SaveError
 from ingestify.utils import TaskExecutor, chunker
 
 logger = logging.getLogger(__name__)
@@ -120,21 +122,27 @@ class UpdateDatasetTask(Task):
         with TaskSummary.update(
             self.task_id, dataset_identifier=dataset_identifier
         ) as task_summary:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            files = {
+                file_id: task_summary.record_load_file(
+                    lambda: load_file(file_resource, dataset=self.dataset),
+                    metadata={"file_id": file_id},
+                )
+                for file_id, file_resource in self.dataset_resource.files.items()
+            }
+
+            try:
+                revision = self.store.update_dataset(
+                    dataset=self.dataset,
+                    name=self.dataset_resource.name,
+                    state=self.dataset_resource.state,
+                    metadata=self.dataset_resource.metadata,
+                    files=files,
+                    revision_source=revision_source,
+                )
+                task_summary.set_stats_from_revision(revision)
+            except Exception as e:
+                raise SaveError("Could not update dataset") from e
 
         return task_summary
 
@@ -159,24 +167,28 @@ class CreateDatasetTask(Task):
         )
 
         with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            files = {
+                file_id: task_summary.record_load_file(
+                    lambda: load_file(file_resource, dataset=None),
+                    metadata={"file_id": file_id},
+                )
+                for file_id, file_resource in self.dataset_resource.files.items()
+            }
+            try:
+                revision = self.store.create_dataset(
+                    dataset_type=self.dataset_resource.dataset_type,
+                    provider=self.dataset_resource.provider,
+                    dataset_identifier=dataset_identifier,
+                    name=self.dataset_resource.name,
+                    state=self.dataset_resource.state,
+                    metadata=self.dataset_resource.metadata,
+                    files=files,
+                    revision_source=revision_source,
+                )
 
-
+                task_summary.set_stats_from_revision(revision)
+            except Exception as e:
+                raise SaveError("Could not create dataset") from e
 
         return task_summary
 
@@ -202,6 +214,9 @@ class IngestionJob:
         self, store: DatasetStore, task_executor: TaskExecutor
     ) -> Iterator[IngestionJobSummary]:
         is_first_chunk = True
+        ingestion_job_exception = (
+            None  # Indicate if there was an exception during the IngestionJob itself
+        )
         ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
         # Process all items in batches. Yield a IngestionJobSummary per batch
 
@@ -219,26 +234,37 @@ class IngestionJob:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
         with ingestion_job_summary.record_timing("find_datasets"):
-
-
-
-
-
-
-
+            try:
+                dataset_resources = self.ingestion_plan.source.find_datasets(
+                    dataset_type=self.ingestion_plan.dataset_type,
+                    data_spec_versions=self.selector.data_spec_versions,
+                    dataset_collection_metadata=dataset_collection_metadata,
+                    **self.selector.custom_attributes,
+                )
 
-
+                # We need to include the to_batches as that will start the generator
+                batches = to_batches(dataset_resources)
+            except Exception as e:
+                logger.exception("Failed to find datasets")
 
-
+                ingestion_job_summary.set_exception(e)
+                yield ingestion_job_summary
+                return
+
+        finish_task_timer = ingestion_job_summary.start_timing("tasks")
 
         while True:
             try:
                 batch = next(batches)
            except StopIteration:
                break
-            except Exception:
-
-
+            except Exception as e:
+                logger.exception("Failed to fetch next batch")
+
+                finish_task_timer()
+                ingestion_job_summary.set_exception(e)
+                yield ingestion_job_summary
+                return
 
            dataset_identifiers = [
                Identifier.create_from_selector(
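The net effect of these hunks is that `IngestionJob.execute` no longer lets discovery or batching errors escape the generator: it marks the summary as failed, yields it, and stops. A self-contained sketch of that generator pattern, using simplified stand-in classes rather than the real ingestify types:

```python
from dataclasses import dataclass, field
from typing import Callable, Iterable, Iterator, List


@dataclass
class JobSummary:
    """Simplified stand-in for IngestionJobSummary."""

    state: str = "RUNNING"
    errors: List[str] = field(default_factory=list)

    def set_exception(self, e: Exception):
        self.state = "FAILED"
        self.errors.append(str(e))


def execute(find_datasets: Callable[[], Iterable]) -> Iterator[JobSummary]:
    summary = JobSummary()
    try:
        batches = iter(find_datasets())
    except Exception as e:
        # Mark the summary as failed, hand it to the caller, and stop the generator
        summary.set_exception(e)
        yield summary
        return

    for _batch in batches:
        pass  # run tasks per batch; the real code yields a summary per batch

    summary.state = "FINISHED"
    yield summary


def broken_source():
    raise RuntimeError("upstream unavailable")


print(next(execute(broken_source)).state)  # FAILED
```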
{ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/ingestion/ingestion_job_summary.py

@@ -1,12 +1,13 @@
 import uuid
 from contextlib import contextmanager
 from datetime import datetime, timedelta
+from enum import Enum
 from typing import Optional, List, TYPE_CHECKING
 from pydantic import Field
 
 from ingestify.domain import Selector, DataSpecVersionCollection
 from ingestify.domain.models.base import BaseModel
-from ingestify.domain.models.task.task_summary import TaskSummary,
+from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
 from ingestify.domain.models.timing import Timing
 from ingestify.utils import utcnow
 
@@ -14,6 +15,12 @@ if TYPE_CHECKING:
     from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
 
 
+class IngestionJobState(str, Enum):
+    RUNNING = "RUNNING"
+    FINISHED = "FINISHED"
+    FAILED = "FAILED"
+
+
 def format_duration(duration: timedelta):
     return f"{duration.total_seconds():.2f}sec"
 
@@ -30,7 +37,8 @@ class IngestionJobSummary(BaseModel):
     selector: Selector
 
     started_at: datetime = Field(default_factory=utcnow)
-
+    ended_at: Optional[datetime] = None
+    state: IngestionJobState = IngestionJobState.RUNNING
     timings: List[Timing] = Field(default_factory=list)
     task_summaries: List[TaskSummary] = Field(default_factory=list)
 
@@ -55,8 +63,10 @@ class IngestionJobSummary(BaseModel):
     @contextmanager
     def record_timing(self, name: str):
         start = utcnow()
-
-
+        try:
+            yield
+        finally:
+            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
 
     def start_timing(self, name):
         start = utcnow()
@@ -75,28 +85,36 @@ class IngestionJobSummary(BaseModel):
     def task_count(self):
         return len(self.task_summaries)
 
-    def 
+    def _set_ended(self):
         self.failed_tasks = len(
-            [task for task in self.task_summaries if task.
+            [task for task in self.task_summaries if task.state == TaskState.FAILED]
         )
         self.successful_tasks = len(
-            [task for task in self.task_summaries if task.
+            [task for task in self.task_summaries if task.state == TaskState.FINISHED]
         )
         self.ignored_successful_tasks = len(
             [
                 task
                 for task in self.task_summaries
-                if task.
+                if task.state == TaskState.FINISHED_IGNORED
             ]
         )
-        self.
+        self.ended_at = utcnow()
+
+    def set_finished(self):
+        self.state = IngestionJobState.FINISHED
+        self._set_ended()
+
+    def set_exception(self, e: Exception):
+        self.state = IngestionJobState.FAILED
+        self._set_ended()
 
     @property
     def duration(self) -> timedelta:
-        return self.
+        return self.ended_at - self.started_at
 
     def output_report(self):
-        print(f"\nIngestionJobSummary
+        print(f"\nIngestionJobSummary {self.state} in {format_duration(self.duration)}")
         print("--------------------")
         print(f" - IngestionPlan:")
         print(f"    Source: {self.source_name}")
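Worth noting in the `record_timing` change: wrapping the `yield` in `try`/`finally` means a timing entry is appended even when the timed block raises, which matters now that the timed discovery code can fail. A self-contained sketch of the pattern with a simplified stand-in class:

```python
from contextlib import contextmanager
from datetime import datetime, timezone


class Summary:
    """Simplified stand-in for IngestionJobSummary's timing bookkeeping."""

    def __init__(self):
        self.timings = []

    @contextmanager
    def record_timing(self, name: str):
        start = datetime.now(timezone.utc)
        try:
            yield
        finally:
            # Recorded even if the body raised
            self.timings.append((name, start, datetime.now(timezone.utc)))


summary = Summary()
try:
    with summary.record_timing("find_datasets"):
        raise RuntimeError("boom")
except RuntimeError:
    pass

print(len(summary.timings))  # 1 -- the timing was still recorded
```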
{ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/task/task_summary.py

@@ -16,7 +16,7 @@ from ingestify.utils import utcnow
 logger = logging.getLogger(__name__)
 
 
-class 
+class TaskState(str, Enum):
     RUNNING = "RUNNING"
     FINISHED = "FINISHED"
     FINISHED_IGNORED = "FINISHED_IGNORED"  # Finished, but didn't produce any new data
@@ -37,7 +37,7 @@ class TaskSummary(BaseModel):
     persisted_file_count: int = 0
     bytes_retrieved: int = 0
     last_modified: Optional[datetime] = None
-
+    state: TaskState = TaskState.RUNNING
     timings: List[Timing] = Field(default_factory=list)
 
     @field_validator("dataset_identifier", mode="before")
@@ -83,10 +83,10 @@ class TaskSummary(BaseModel):
         try:
             yield task_summary
 
-            task_summary.
+            task_summary.set_state(TaskState.FINISHED)
         except Exception as e:
             logger.exception(f"Failed to execute task.")
-            task_summary.
+            task_summary.set_state(TaskState.FAILED)
 
             # When the error comes from our own code, make sure it will be raised to the highest level
             # raise
@@ -111,8 +111,8 @@ class TaskSummary(BaseModel):
                 file.modified_at for file in revision.modified_files
             )
         else:
-            self.
+            self.state = TaskState.FINISHED_IGNORED
 
-    def 
-        if self.
-            self.
+    def set_state(self, state: TaskState):
+        if self.state == TaskState.RUNNING:
+            self.state = state
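The renamed `state` field plus the guarded `set_state` make task status transitions one-way: once a task has left `RUNNING` (for example `FINISHED_IGNORED`, set when a revision produced no new data), the generic `FINISHED`/`FAILED` written at context-manager exit no longer overwrites it. A self-contained sketch of that guard:

```python
from enum import Enum


class TaskState(str, Enum):
    RUNNING = "RUNNING"
    FINISHED = "FINISHED"
    FINISHED_IGNORED = "FINISHED_IGNORED"  # Finished, but didn't produce any new data
    FAILED = "FAILED"


class TaskSummary:
    """Simplified stand-in showing only the state transition guard."""

    def __init__(self):
        self.state = TaskState.RUNNING

    def set_state(self, state: TaskState):
        # Only a running task may transition; later generic updates are ignored
        if self.state == TaskState.RUNNING:
            self.state = state


task = TaskSummary()
task.state = TaskState.FINISHED_IGNORED  # set while recording revision stats
task.set_state(TaskState.FINISHED)       # no-op: task already left RUNNING
print(task.state)                        # TaskState.FINISHED_IGNORED
```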
ingestify-0.3.4/ingestify/infra/serialization/__init__.py (new file)

@@ -0,0 +1,33 @@
+import json
+from datetime import datetime
+from typing import Type, Any, TypeVar
+
+from dataclass_factory import Schema, Factory, NameStyle
+from dataclass_factory.schema_helpers import type_checker
+
+from ingestify.domain import DatasetCreated, Identifier
+from ingestify.domain.models.dataset.events import MetadataUpdated, RevisionAdded
+from ingestify.domain.models.event import DomainEvent
+
+
+event_types = {
+    DatasetCreated.event_type: DatasetCreated,
+    RevisionAdded.event_type: RevisionAdded,
+}
+
+
+def deserialize(event_dict: dict) -> DomainEvent:
+    event_cls = event_types[event_dict["event_type"]]
+    event_dict["dataset"]["identifier"] = Identifier(
+        **event_dict["dataset"]["identifier"]
+    )
+
+    return event_cls.model_validate(event_dict)
+
+
+def serialize(event: DomainEvent) -> dict:
+    event_dict = event.model_dump(mode="json")
+
+    # Make sure event_type is always part of the event_dict. Pydantic might skip it when the type is ClassVar
+    event_dict["event_type"] = event.event_type
+    return event_dict
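The new serialization module round-trips domain events through a small registry keyed on `event_type` together with Pydantic's `model_dump`/`model_validate`. A self-contained sketch of that registry pattern, with simplified stand-in event classes rather than ingestify's real `DomainEvent` types:

```python
from typing import ClassVar

from pydantic import BaseModel


class DatasetCreated(BaseModel):
    """Simplified stand-in event."""

    event_type: ClassVar[str] = "dataset_created"
    dataset_id: str


class RevisionAdded(BaseModel):
    """Simplified stand-in event."""

    event_type: ClassVar[str] = "revision_added"
    dataset_id: str
    revision_id: int


event_types = {cls.event_type: cls for cls in (DatasetCreated, RevisionAdded)}


def serialize(event: BaseModel) -> dict:
    event_dict = event.model_dump(mode="json")
    # ClassVars are skipped by model_dump, so add the discriminator back explicitly
    event_dict["event_type"] = event.event_type
    return event_dict


def deserialize(event_dict: dict) -> BaseModel:
    event_cls = event_types[event_dict["event_type"]]
    return event_cls.model_validate(event_dict)


event = RevisionAdded(dataset_id="abc", revision_id=2)
assert deserialize(serialize(event)) == event
```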