ingestify 0.3.3-py3-none-any.whl → 0.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +3 -4
- ingestify/domain/models/base.py +2 -19
- ingestify/domain/models/dataset/dataset.py +8 -1
- ingestify/domain/models/dataset/file.py +5 -5
- ingestify/domain/models/dataset/revision.py +2 -2
- ingestify/domain/models/ingestion/ingestion_job.py +70 -44
- ingestify/domain/models/ingestion/ingestion_job_summary.py +29 -11
- ingestify/domain/models/task/task_summary.py +8 -8
- ingestify/exceptions.py +4 -0
- ingestify/infra/serialization/__init__.py +23 -40
- ingestify/infra/store/dataset/sqlalchemy/repository.py +264 -19
- ingestify/infra/store/dataset/sqlalchemy/{mapping.py → tables.py} +89 -72
- {ingestify-0.3.3.dist-info → ingestify-0.3.4.dist-info}/METADATA +16 -16
- {ingestify-0.3.3.dist-info → ingestify-0.3.4.dist-info}/RECORD +18 -18
- {ingestify-0.3.3.dist-info → ingestify-0.3.4.dist-info}/WHEEL +0 -0
- {ingestify-0.3.3.dist-info → ingestify-0.3.4.dist-info}/entry_points.txt +0 -0
- {ingestify-0.3.3.dist-info → ingestify-0.3.4.dist-info}/top_level.txt +0 -0
ingestify/__init__.py
CHANGED
ingestify/application/dataset_store.py
CHANGED

@@ -58,8 +58,7 @@ class DatasetStore:
         self.event_bus.dispatch(event)

     def save_ingestion_job_summary(self, ingestion_job_summary):
-        self.dataset_repository.
-        self.dataset_repository.session.commit()
+        self.dataset_repository.save_ingestion_job_summary(ingestion_job_summary)

     def get_dataset_collection(
         self,
@@ -298,8 +297,8 @@ class DatasetStore:
             )

             loaded_file = LoadedFile(
-
-                **
+                stream_=get_stream if lazy else get_stream(file),
+                **file.model_dump(),
             )
             files[file.file_id] = loaded_file
         return FileCollection(files, auto_rewind=auto_rewind)
ingestify/domain/models/base.py
CHANGED
@@ -1,22 +1,5 @@
-from functools import partial
-from typing import ClassVar, Any, Optional
-
-import pydantic
 from pydantic import BaseModel as PydanticBaseModel, ConfigDict


-
-
-#
-# _sa_instance_state: Optional[dict] = None
-from sqlalchemy.orm import MappedAsDataclass
-
-
-class BaseModel(
-    MappedAsDataclass,
-    # DeclarativeBase,
-    dataclass_callable=partial(
-        pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
-    ),
-):
-    pass
+class BaseModel(PydanticBaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True, from_attributes=True)
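The new base class leans on Pydantic's from_attributes=True, which is what later lets domain models be validated straight from SQLAlchemy result rows. A minimal standalone sketch of that behaviour, with illustrative names that are not part of ingestify:

    from pydantic import BaseModel as PydanticBaseModel, ConfigDict

    class BaseModel(PydanticBaseModel):
        model_config = ConfigDict(arbitrary_types_allowed=True, from_attributes=True)

    class Thing(BaseModel):
        name: str
        size: int

    class Row:
        # Mimics a driver/ORM row object: it only exposes attributes, not a dict.
        def __init__(self):
            self.name = "example"
            self.size = 42

    # from_attributes=True lets model_validate read plain attribute access.
    thing = Thing.model_validate(Row())
    assert thing.size == 42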
ingestify/domain/models/dataset/dataset.py
CHANGED

@@ -1,7 +1,7 @@
 from datetime import datetime
 from enum import Enum
 from typing import List, Optional
-from pydantic import Field
+from pydantic import Field, field_validator

 from ingestify.utils import utcnow
 from .dataset_state import DatasetState
@@ -24,6 +24,13 @@ class Dataset(BaseModel):
     updated_at: datetime
     revisions: List[Revision] = Field(default_factory=list)

+    @field_validator("identifier", mode="before")
+    @classmethod
+    def parse_identifier(cls, value):
+        if not isinstance(value, Identifier):
+            return Identifier(value)
+        return value
+
     @property
     def is_complete(self):
         return self.state.is_complete
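The mode="before" validator is what allows a raw mapping coming from a database row or event payload to be coerced into an Identifier before normal validation runs. A small self-contained sketch of the same pattern; the Identifier below is a stand-in, not the ingestify class:

    from pydantic import BaseModel, ConfigDict, field_validator

    class Identifier:
        """Stand-in: wraps the key/value attributes identifying a dataset."""
        def __init__(self, attributes):
            self.attributes = dict(attributes)

    class Dataset(BaseModel):
        model_config = ConfigDict(arbitrary_types_allowed=True)

        identifier: Identifier

        @field_validator("identifier", mode="before")
        @classmethod
        def parse_identifier(cls, value):
            # Raw dicts (e.g. a JSON column) get wrapped; Identifier instances pass through.
            if not isinstance(value, Identifier):
                return Identifier(value)
            return value

    dataset = Dataset(identifier={"match_id": 128, "season_id": 42})
    assert isinstance(dataset.identifier, Identifier)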
ingestify/domain/models/dataset/file.py
CHANGED

@@ -116,18 +116,18 @@ class LoadedFile(BaseModel):
     data_serialization_format: Optional[str]  # Example: 'json'
     storage_compression_method: Optional[str]  # Example: 'gzip'
     storage_path: Path
-
+    stream_: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
     revision_id: Optional[int] = None  # This can be used when a Revision is squashed

     def load_stream(self):
-        if callable(self.
-            self.
+        if callable(self.stream_):
+            self.stream_ = self.stream_(self)

     @property
     def stream(self):
-        if callable(self.
+        if callable(self.stream_):
             raise Exception("You should load the stream first using `load_stream`")
-        return self.
+        return self.stream_


 __all__ = ["File", "DraftFile", "LoadedFile"]
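The renamed stream_ field keeps the lazy-loading contract: it holds either an open binary stream or a callable that produces one, and load_stream resolves the callable exactly once. A rough standalone sketch of that contract, with illustrative names rather than the ingestify implementation:

    from io import BytesIO
    from typing import Callable, Union

    class LazyFile:
        def __init__(self, stream_: Union[BytesIO, Callable]):
            self.stream_ = stream_

        def load_stream(self):
            if callable(self.stream_):
                # Resolve the deferred loader exactly once.
                self.stream_ = self.stream_(self)

        @property
        def stream(self):
            if callable(self.stream_):
                raise Exception("You should load the stream first using `load_stream`")
            return self.stream_

    f = LazyFile(stream_=lambda file: BytesIO(b"payload"))
    f.load_stream()
    assert f.stream.read() == b"payload"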
ingestify/domain/models/dataset/revision.py
CHANGED

@@ -1,6 +1,6 @@
 from datetime import datetime
 from enum import Enum
-from typing import Dict, List
+from typing import Dict, List, Optional

 from typing_extensions import TypedDict

@@ -32,7 +32,7 @@ class Revision(BaseModel):
     created_at: datetime
     description: str
     modified_files: List[File]
-    source: RevisionSource
+    source: Optional[RevisionSource]
     is_squashed: bool = False
     state: RevisionState = RevisionState.PENDING_VALIDATION

ingestify/domain/models/ingestion/ingestion_job.py
CHANGED

@@ -2,6 +2,7 @@ import itertools
 import json
 import logging
 import uuid
+from enum import Enum
 from typing import Optional, Iterator

 from ingestify import retrieve_http
@@ -17,6 +18,7 @@ from ingestify.domain.models.resources.dataset_resource import (
     DatasetResource,
 )
 from ingestify.domain.models.task.task_summary import TaskSummary
+from ingestify.exceptions import SaveError
 from ingestify.utils import TaskExecutor, chunker

 logger = logging.getLogger(__name__)
@@ -120,21 +122,27 @@ class UpdateDatasetTask(Task):
         with TaskSummary.update(
             self.task_id, dataset_identifier=dataset_identifier
         ) as task_summary:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            files = {
+                file_id: task_summary.record_load_file(
+                    lambda: load_file(file_resource, dataset=self.dataset),
+                    metadata={"file_id": file_id},
+                )
+                for file_id, file_resource in self.dataset_resource.files.items()
+            }
+
+            try:
+                revision = self.store.update_dataset(
+                    dataset=self.dataset,
+                    name=self.dataset_resource.name,
+                    state=self.dataset_resource.state,
+                    metadata=self.dataset_resource.metadata,
+                    files=files,
+                    revision_source=revision_source,
+                )
+                task_summary.set_stats_from_revision(revision)
+            except Exception as e:
+                raise SaveError("Could not update dataset") from e

         return task_summary

@@ -159,24 +167,28 @@ class CreateDatasetTask(Task):
         )

         with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            files = {
+                file_id: task_summary.record_load_file(
+                    lambda: load_file(file_resource, dataset=None),
+                    metadata={"file_id": file_id},
+                )
+                for file_id, file_resource in self.dataset_resource.files.items()
+            }
+            try:
+                revision = self.store.create_dataset(
+                    dataset_type=self.dataset_resource.dataset_type,
+                    provider=self.dataset_resource.provider,
+                    dataset_identifier=dataset_identifier,
+                    name=self.dataset_resource.name,
+                    state=self.dataset_resource.state,
+                    metadata=self.dataset_resource.metadata,
+                    files=files,
+                    revision_source=revision_source,
+                )

-
+                task_summary.set_stats_from_revision(revision)
+            except Exception as e:
+                raise SaveError("Could not create dataset") from e

         return task_summary

@@ -202,6 +214,9 @@ class IngestionJob:
         self, store: DatasetStore, task_executor: TaskExecutor
     ) -> Iterator[IngestionJobSummary]:
         is_first_chunk = True
+        ingestion_job_exception = (
+            None  # Indicate if there was an exception during the IngestionJob itself
+        )
         ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
         # Process all items in batches. Yield a IngestionJobSummary per batch

@@ -219,26 +234,37 @@ class IngestionJob:
         # 1. The discover_datasets returns a list, and the entire list can be processed at once
         # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
         with ingestion_job_summary.record_timing("find_datasets"):
-
-
-
-
-
-
-
+            try:
+                dataset_resources = self.ingestion_plan.source.find_datasets(
+                    dataset_type=self.ingestion_plan.dataset_type,
+                    data_spec_versions=self.selector.data_spec_versions,
+                    dataset_collection_metadata=dataset_collection_metadata,
+                    **self.selector.custom_attributes,
+                )

-
+                # We need to include the to_batches as that will start the generator
+                batches = to_batches(dataset_resources)
+            except Exception as e:
+                logger.exception("Failed to find datasets")

-
+                ingestion_job_summary.set_exception(e)
+                yield ingestion_job_summary
+                return
+
+        finish_task_timer = ingestion_job_summary.start_timing("tasks")

         while True:
             try:
                 batch = next(batches)
             except StopIteration:
                 break
-            except Exception:
-
-
+            except Exception as e:
+                logger.exception("Failed to fetch next batch")
+
+                finish_task_timer()
+                ingestion_job_summary.set_exception(e)
+                yield ingestion_job_summary
+                return

             dataset_identifiers = [
                 Identifier.create_from_selector(
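Both tasks now wrap the store call and re-raise a SaveError with "from e", so the original failure stays attached as __cause__. A sketch of that chaining using a stand-in SaveError (the real one lives in ingestify.exceptions):

    class SaveError(Exception):
        pass

    def update_dataset():
        raise ValueError("database unavailable")  # simulated low-level failure

    try:
        try:
            update_dataset()
        except Exception as e:
            raise SaveError("Could not update dataset") from e
    except SaveError as err:
        # The low-level error is preserved for logging / debugging.
        assert isinstance(err.__cause__, ValueError)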
ingestify/domain/models/ingestion/ingestion_job_summary.py
CHANGED

@@ -1,12 +1,13 @@
 import uuid
 from contextlib import contextmanager
 from datetime import datetime, timedelta
+from enum import Enum
 from typing import Optional, List, TYPE_CHECKING
 from pydantic import Field

 from ingestify.domain import Selector, DataSpecVersionCollection
 from ingestify.domain.models.base import BaseModel
-from ingestify.domain.models.task.task_summary import TaskSummary,
+from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
 from ingestify.domain.models.timing import Timing
 from ingestify.utils import utcnow

@@ -14,6 +15,12 @@ if TYPE_CHECKING:
     from ingestify.domain.models.ingestion.ingestion_job import IngestionJob


+class IngestionJobState(str, Enum):
+    RUNNING = "RUNNING"
+    FINISHED = "FINISHED"
+    FAILED = "FAILED"
+
+
 def format_duration(duration: timedelta):
     return f"{duration.total_seconds():.2f}sec"

@@ -30,7 +37,8 @@ class IngestionJobSummary(BaseModel):
     selector: Selector

     started_at: datetime = Field(default_factory=utcnow)
-
+    ended_at: Optional[datetime] = None
+    state: IngestionJobState = IngestionJobState.RUNNING
     timings: List[Timing] = Field(default_factory=list)
     task_summaries: List[TaskSummary] = Field(default_factory=list)

@@ -55,8 +63,10 @@ class IngestionJobSummary(BaseModel):
     @contextmanager
     def record_timing(self, name: str):
         start = utcnow()
-
-
+        try:
+            yield
+        finally:
+            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

     def start_timing(self, name):
         start = utcnow()
@@ -75,28 +85,36 @@ class IngestionJobSummary(BaseModel):
     def task_count(self):
         return len(self.task_summaries)

-    def
+    def _set_ended(self):
         self.failed_tasks = len(
-            [task for task in self.task_summaries if task.
+            [task for task in self.task_summaries if task.state == TaskState.FAILED]
         )
         self.successful_tasks = len(
-            [task for task in self.task_summaries if task.
+            [task for task in self.task_summaries if task.state == TaskState.FINISHED]
         )
         self.ignored_successful_tasks = len(
             [
                 task
                 for task in self.task_summaries
-                if task.
+                if task.state == TaskState.FINISHED_IGNORED
             ]
         )
-        self.
+        self.ended_at = utcnow()
+
+    def set_finished(self):
+        self.state = IngestionJobState.FINISHED
+        self._set_ended()
+
+    def set_exception(self, e: Exception):
+        self.state = IngestionJobState.FAILED
+        self._set_ended()

     @property
     def duration(self) -> timedelta:
-        return self.
+        return self.ended_at - self.started_at

     def output_report(self):
-        print(f"\nIngestionJobSummary
+        print(f"\nIngestionJobSummary {self.state} in {format_duration(self.duration)}")
         print("--------------------")
         print(f" - IngestionPlan:")
         print(f"   Source: {self.source_name}")
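record_timing now appends the timing entry in a "finally" block, so a body that raises (as find_datasets may) is still timed before the exception propagates. A simplified standalone sketch, with Timing/utcnow reduced to stand-ins:

    from contextlib import contextmanager
    from datetime import datetime, timezone

    def utcnow():
        return datetime.now(timezone.utc)

    class Recorder:
        def __init__(self):
            self.timings = []

        @contextmanager
        def record_timing(self, name: str):
            start = utcnow()
            try:
                yield
            finally:
                # Runs whether the body succeeded or raised.
                self.timings.append((name, start, utcnow()))

    recorder = Recorder()
    try:
        with recorder.record_timing("find_datasets"):
            raise RuntimeError("boom")
    except RuntimeError:
        pass
    assert len(recorder.timings) == 1  # timing recorded despite the exception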
ingestify/domain/models/task/task_summary.py
CHANGED

@@ -16,7 +16,7 @@ from ingestify.utils import utcnow
 logger = logging.getLogger(__name__)


-class
+class TaskState(str, Enum):
     RUNNING = "RUNNING"
     FINISHED = "FINISHED"
     FINISHED_IGNORED = "FINISHED_IGNORED"  # Finished, but didn't produce any new data
@@ -37,7 +37,7 @@ class TaskSummary(BaseModel):
     persisted_file_count: int = 0
     bytes_retrieved: int = 0
     last_modified: Optional[datetime] = None
-
+    state: TaskState = TaskState.RUNNING
     timings: List[Timing] = Field(default_factory=list)

     @field_validator("dataset_identifier", mode="before")
@@ -83,10 +83,10 @@ class TaskSummary(BaseModel):
         try:
             yield task_summary

-            task_summary.
+            task_summary.set_state(TaskState.FINISHED)
         except Exception as e:
             logger.exception(f"Failed to execute task.")
-            task_summary.
+            task_summary.set_state(TaskState.FAILED)

             # When the error comes from our own code, make sure it will be raised to the highest level
             # raise
@@ -111,8 +111,8 @@ class TaskSummary(BaseModel):
                 file.modified_at for file in revision.modified_files
             )
         else:
-            self.
+            self.state = TaskState.FINISHED_IGNORED

-    def
-        if self.
-            self.
+    def set_state(self, state: TaskState):
+        if self.state == TaskState.RUNNING:
+            self.state = state
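set_state only moves a summary out of RUNNING once, so the context manager's closing FINISHED call cannot overwrite a terminal state such as FINISHED_IGNORED that was set earlier. A sketch of that guard with a simplified TaskSummary:

    from enum import Enum

    class TaskState(str, Enum):
        RUNNING = "RUNNING"
        FINISHED = "FINISHED"
        FINISHED_IGNORED = "FINISHED_IGNORED"
        FAILED = "FAILED"

    class TaskSummary:
        def __init__(self):
            self.state = TaskState.RUNNING

        def set_state(self, state: TaskState):
            # Only the first transition away from RUNNING sticks.
            if self.state == TaskState.RUNNING:
                self.state = state

    summary = TaskSummary()
    summary.set_state(TaskState.FINISHED_IGNORED)
    summary.set_state(TaskState.FINISHED)  # ignored: state is already terminal
    assert summary.state == TaskState.FINISHED_IGNORED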
ingestify/exceptions.py
CHANGED
ingestify/infra/serialization/__init__.py
CHANGED

@@ -7,44 +7,27 @@ from dataclass_factory.schema_helpers import type_checker

 from ingestify.domain import DatasetCreated, Identifier
 from ingestify.domain.models.dataset.events import MetadataUpdated, RevisionAdded
+from ingestify.domain.models.event import DomainEvent

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            pre_parse=type_checker(RevisionAdded.event_type, "event_type")
-        ),
-        # ClipSelectionContent: Schema(pre_parse=type_checker(ClipSelectionContent.content_type, field="contentType")),
-        # TeamInfoImageContent: Schema(pre_parse=type_checker(TeamInfoImageContent.content_type, field="contentType")),
-        # StaticVideoContent: Schema(pre_parse=type_checker(StaticVideoContent.content_type, field="contentType"))
-    },
-    default_schema=Schema(),
-)
-
-T = TypeVar("T")
-
-
-def serialize(data: T, class_: Type[T] = None) -> Any:
-    return factory.dump(data, class_)
-
-
-def unserialize(data: Any, class_: Type[T]) -> T:
-    return factory.load(data, class_)
+
+event_types = {
+    DatasetCreated.event_type: DatasetCreated,
+    RevisionAdded.event_type: RevisionAdded,
+}
+
+
+def deserialize(event_dict: dict) -> DomainEvent:
+    event_cls = event_types[event_dict["event_type"]]
+    event_dict["dataset"]["identifier"] = Identifier(
+        **event_dict["dataset"]["identifier"]
+    )
+
+    return event_cls.model_validate(event_dict)
+
+
+def serialize(event: DomainEvent) -> dict:
+    event_dict = event.model_dump(mode="json")
+
+    # Make sure event_type is always part of the event_dict. Pydantic might skip it when the type is ClassVar
+    event_dict["event_type"] = event.event_type
+    return event_dict
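The dataclass-factory setup is replaced by a small registry keyed on event_type. A self-contained sketch of the resulting round trip, with simplified stand-ins for DomainEvent and DatasetCreated (the real deserialize additionally rebuilds the dataset identifier):

    from typing import ClassVar
    from pydantic import BaseModel

    class DomainEvent(BaseModel):
        pass

    class DatasetCreated(DomainEvent):
        event_type: ClassVar[str] = "dataset_created"
        dataset_id: str

    event_types = {DatasetCreated.event_type: DatasetCreated}

    def serialize(event: DomainEvent) -> dict:
        event_dict = event.model_dump(mode="json")
        # ClassVar fields are not dumped by Pydantic, so add the type tag back.
        event_dict["event_type"] = event.event_type
        return event_dict

    def deserialize(event_dict: dict) -> DomainEvent:
        event_cls = event_types[event_dict["event_type"]]
        return event_cls.model_validate(event_dict)

    payload = serialize(DatasetCreated(dataset_id="abc"))
    assert isinstance(deserialize(payload), DatasetCreated)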
ingestify/infra/store/dataset/sqlalchemy/repository.py
CHANGED

@@ -1,13 +1,24 @@
+import itertools
 import json
 import uuid
+from collections import defaultdict
 from typing import Optional, Union, List

-from sqlalchemy import
+from sqlalchemy import (
+    create_engine,
+    func,
+    text,
+    tuple_,
+    Table,
+    insert,
+    Transaction,
+    Connection,
+)
 from sqlalchemy.engine import make_url
 from sqlalchemy.exc import NoSuchModuleError
 from sqlalchemy.orm import Session, joinedload

-from ingestify.domain import File
+from ingestify.domain import File, Revision
 from ingestify.domain.models import (
     Dataset,
     DatasetCollection,
@@ -15,11 +26,22 @@ from ingestify.domain.models import (
     Identifier,
     Selector,
 )
+from ingestify.domain.models.base import BaseModel
 from ingestify.domain.models.dataset.collection_metadata import (
     DatasetCollectionMetadata,
 )
-
-from .
+from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobSummary
+from ingestify.domain.models.task.task_summary import TaskSummary
+from ingestify.exceptions import IngestifyError
+
+from .tables import (
+    metadata,
+    dataset_table,
+    file_table,
+    revision_table,
+    ingestion_job_summary_table,
+    task_summary_table,
+)


 def parse_value(v):
@@ -113,6 +135,31 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def session(self):
         return self.session_provider.get()

+    def _upsert(self, connection: Connection, table: Table, entities: list[dict]):
+        dialect = self.session.bind.dialect.name
+        if dialect == "mysql":
+            from sqlalchemy.dialects.mysql import insert
+        elif dialect == "postgresql":
+            from sqlalchemy.dialects.postgresql import insert
+        elif dialect == "sqlite":
+            from sqlalchemy.dialects.sqlite import insert
+        else:
+            raise IngestifyError(f"Don't know how to do an upsert in {dialect}")
+
+        stmt = insert(table).values(entities)
+
+        primary_key_columns = [column for column in table.columns if column.primary_key]
+
+        set_ = {
+            name: getattr(stmt.excluded, name)
+            for name, column in table.columns.items()
+            if column not in primary_key_columns
+        }
+
+        stmt = stmt.on_conflict_do_update(index_elements=primary_key_columns, set_=set_)
+
+        connection.execute(stmt)
+
     def _filter_query(
         self,
         query,
@@ -122,11 +169,11 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         dataset_id: Optional[Union[str, List[str]]] = None,
         selector: Optional[Union[Selector, List[Selector]]] = None,
     ):
-        query = query.filter(
+        query = query.filter(dataset_table.c.bucket == bucket)
         if dataset_type:
-            query = query.filter(
+            query = query.filter(dataset_table.c.dataset_type == dataset_type)
         if provider:
-            query = query.filter(
+            query = query.filter(dataset_table.c.provider == provider)
         if dataset_id is not None:
             if isinstance(dataset_id, list):
                 if len(dataset_id) == 0:
@@ -134,9 +181,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                     # return an empty DatasetCollection
                     return DatasetCollection()

-                query = query.filter(
+                query = query.filter(dataset_table.c.dataset_id.in_(dataset_id))
             else:
-                query = query.filter(
+                query = query.filter(dataset_table.c.dataset_id == dataset_id)

         dialect = self.session.bind.dialect.name

@@ -175,7 +222,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                 else:
                     column = column.as_string()
             else:
-                column = func.json_extract(
+                column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
             columns.append(column)

         values = []
@@ -189,6 +236,60 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             query = query.filter(text(where))
         return query

+    def load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
+        if not dataset_ids:
+            return []
+
+        dataset_rows = list(
+            self.session.query(dataset_table).filter(
+                dataset_table.c.dataset_id.in_(dataset_ids)
+            )
+        )
+        revisions_per_dataset = {}
+        rows = (
+            self.session.query(revision_table)
+            .filter(revision_table.c.dataset_id.in_(dataset_ids))
+            .order_by(revision_table.c.dataset_id)
+        )
+
+        for dataset_id, revisions in itertools.groupby(
+            rows, key=lambda row: row.dataset_id
+        ):
+            revisions_per_dataset[dataset_id] = list(revisions)
+
+        files_per_revision = {}
+        rows = (
+            self.session.query(file_table)
+            .filter(file_table.c.dataset_id.in_(dataset_ids))
+            .order_by(file_table.c.dataset_id, file_table.c.revision_id)
+        )
+
+        for (dataset_id, revision_id), files in itertools.groupby(
+            rows, key=lambda row: (row.dataset_id, row.revision_id)
+        ):
+            files_per_revision[(dataset_id, revision_id)] = list(files)
+
+        datasets = []
+        for dataset_row in dataset_rows:
+            dataset_id = dataset_row.dataset_id
+            revisions = []
+            for revision_row in revisions_per_dataset.get(dataset_id, []):
+                files = [
+                    File.model_validate(file_row)
+                    for file_row in files_per_revision.get(
+                        (dataset_id, revision_row.revision_id), []
+                    )
+                ]
+                revision = Revision.model_validate(
+                    {**revision_row._mapping, "modified_files": files}
+                )
+                revisions.append(revision)
+
+            datasets.append(
+                Dataset.model_validate({**dataset_row._mapping, "revisions": revisions})
+            )
+        return datasets
+
     def get_dataset_collection(
         self,
         bucket: str,
@@ -209,17 +310,20 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         )

         if not metadata_only:
-            dataset_query = apply_query_filter(
-
+            dataset_query = apply_query_filter(
+                self.session.query(dataset_table.c.dataset_id)
+            )
+            dataset_ids = [row.dataset_id for row in dataset_query]
+            datasets = self.load_datasets(dataset_ids)
         else:
             datasets = []

         metadata_result_row = apply_query_filter(
             self.session.query(
-                func.min(
-                func.max(
+                func.min(file_table.c.modified_at).label("first_modified_at"),
+                func.max(file_table.c.modified_at).label("last_modified_at"),
                 func.count().label("row_count"),
-            ).join(
+            ).join(dataset_table, dataset_table.c.dataset_id == file_table.c.dataset_id)
         ).first()
         dataset_collection_metadata = DatasetCollectionMetadata(*metadata_result_row)

@@ -228,12 +332,153 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def save(self, bucket: str, dataset: Dataset):
         # Just make sure
         dataset.bucket = bucket
-
-        self.
+
+        self._save([dataset])
+
+    def connect(self):
+        return self.session_provider.engine.connect()
+
+    def _save(self, datasets: list[Dataset]):
+        """Only do upserts. Never delete. Rows get only deleted when an entire Dataset is removed."""
+        datasets_entities = []
+        revision_entities = []
+        file_entities = []
+
+        for dataset in datasets:
+            datasets_entities.append(dataset.model_dump(exclude={"revisions"}))
+            for revision in dataset.revisions:
+                revision_entities.append(
+                    {
+                        **revision.model_dump(
+                            exclude={"is_squashed", "modified_files"}
+                        ),
+                        "dataset_id": dataset.dataset_id,
+                    }
+                )
+                for file in revision.modified_files:
+                    file_entities.append(
+                        {
+                            **file.model_dump(),
+                            "dataset_id": dataset.dataset_id,
+                            "revision_id": revision.revision_id,
+                        }
+                    )
+
+        with self.connect() as connection:
+            try:
+                self._upsert(connection, dataset_table, datasets_entities)
+                self._upsert(connection, revision_table, revision_entities)
+                self._upsert(connection, file_table, file_entities)
+            except Exception:
+                connection.rollback()
+                raise
+            else:
+                connection.commit()

     def destroy(self, dataset: Dataset):
-        self.
-
+        with self.connect() as connection:
+            try:
+                # Delete modified files related to the dataset
+                file_table.delete().where(
+                    file_table.c.dataset_id == dataset.dataset_id
+                ).execute()
+
+                # Delete revisions related to the dataset
+                revision_table.delete().where(
+                    revision_table.c.dataset_id == dataset.dataset_id
+                ).execute()
+
+                # Delete the dataset itself
+                dataset_table.delete().where(
+                    dataset_table.c.dataset_id == dataset.dataset_id
+                ).execute()
+
+                connection.commit()
+            except Exception:
+                connection.rollback()
+                raise

     def next_identity(self):
         return str(uuid.uuid4())
+
+    # TODO: consider moving the IngestionJobSummary methods to a different Repository
+    def save_ingestion_job_summary(self, ingestion_job_summary: IngestionJobSummary):
+        ingestion_job_summary_entities = [
+            ingestion_job_summary.model_dump(exclude={"task_summaries"})
+        ]
+        task_summary_entities = []
+        for task_summary in ingestion_job_summary.task_summaries:
+            task_summary_entities.append(
+                {
+                    **task_summary.model_dump(),
+                    "ingestion_job_summary_id": ingestion_job_summary.ingestion_job_summary_id,
+                }
+            )
+
+        with self.session_provider.engine.connect() as connection:
+            try:
+                self._upsert(
+                    connection,
+                    ingestion_job_summary_table,
+                    ingestion_job_summary_entities,
+                )
+                if task_summary_entities:
+                    self._upsert(connection, task_summary_table, task_summary_entities)
+            except Exception:
+                connection.rollback()
+                raise
+            else:
+                connection.commit()
+
+    def load_ingestion_job_summaries(self) -> list[IngestionJobSummary]:
+        ingestion_job_summary_ids = [
+            row.ingestion_job_summary_id
+            for row in self.session.query(
+                ingestion_job_summary_table.c.ingestion_job_summary_id
+            )
+        ]
+
+        ingestion_job_summary_rows = list(
+            self.session.query(ingestion_job_summary_table).filter(
+                ingestion_job_summary_table.c.ingestion_job_summary_id.in_(
+                    ingestion_job_summary_ids
+                )
+            )
+        )
+
+        task_summary_entities_per_job_summary = {}
+        rows = (
+            self.session.query(task_summary_table)
+            .filter(
+                task_summary_table.c.ingestion_job_summary_id.in_(
+                    ingestion_job_summary_ids
+                )
+            )
+            .order_by(task_summary_table.c.ingestion_job_summary_id)
+        )
+
+        for ingestion_job_summary_id, task_summaries_rows in itertools.groupby(
+            rows, key=lambda row: row.ingestion_job_summary_id
+        ):
+            task_summary_entities_per_job_summary[ingestion_job_summary_id] = list(
+                task_summaries_rows
+            )
+
+        ingestion_job_summaries = []
+        for ingestion_job_summary_row in ingestion_job_summary_rows:
+            task_summaries = [
+                TaskSummary.model_validate(row)
+                for row in task_summary_entities_per_job_summary.get(
+                    ingestion_job_summary_row.ingestion_job_summary_id, []
+                )
+            ]
+
+            ingestion_job_summaries.append(
+                IngestionJobSummary.model_validate(
+                    {
+                        **ingestion_job_summary_row._mapping,
+                        "task_summaries": task_summaries,
+                    }
+                )
+            )
+        return ingestion_job_summaries
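The new _upsert builds a dialect-specific "INSERT ... ON CONFLICT DO UPDATE", keyed on the table's primary key and updating every other column from the excluded row. A runnable sketch of the same statement against SQLite (PostgreSQL exposes the identical on_conflict_do_update API; MySQL needs its own on_duplicate_key_update variant), using an illustrative table rather than ingestify's schema:

    from sqlalchemy import Column, MetaData, String, Table, create_engine, select
    from sqlalchemy.dialects.sqlite import insert

    metadata = MetaData()
    dataset_table = Table(
        "dataset",
        metadata,
        Column("dataset_id", String(255), primary_key=True),
        Column("name", String(255)),
    )

    engine = create_engine("sqlite:///:memory:")
    metadata.create_all(engine)

    def upsert(connection, table, entities):
        stmt = insert(table).values(entities)
        primary_key_columns = [c for c in table.columns if c.primary_key]
        # Update every non-primary-key column from the proposed (excluded) row.
        set_ = {
            name: getattr(stmt.excluded, name)
            for name, column in table.columns.items()
            if column not in primary_key_columns
        }
        stmt = stmt.on_conflict_do_update(index_elements=primary_key_columns, set_=set_)
        connection.execute(stmt)

    with engine.connect() as conn:
        upsert(conn, dataset_table, [{"dataset_id": "d1", "name": "first"}])
        upsert(conn, dataset_table, [{"dataset_id": "d1", "name": "renamed"}])
        conn.commit()
        assert conn.execute(select(dataset_table.c.name)).scalar_one() == "renamed"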
ingestify/infra/store/dataset/sqlalchemy/{mapping.py → tables.py}
CHANGED

@@ -1,5 +1,4 @@
 import datetime
-from dataclasses import is_dataclass, asdict
 from pathlib import Path
 from typing import Optional

@@ -15,17 +14,13 @@ from sqlalchemy import (
     String,
     Table,
     TypeDecorator,
-    Boolean,
 )
-from sqlalchemy.orm import registry, relationship

-from ingestify.domain import
-from ingestify.domain.models import Dataset, File, Revision
+from ingestify.domain import Identifier, DataSpecVersionCollection, Selector
 from ingestify.domain.models.dataset.dataset import DatasetState
-from ingestify.domain.models.ingestion.ingestion_job_summary import
-
-
-from ingestify.domain.models.task.task_summary import TaskSummary, Operation, TaskStatus
+from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobState
+
+from ingestify.domain.models.task.task_summary import Operation, TaskState
 from ingestify.domain.models.timing import Timing
 from ingestify.domain.models.dataset.revision import RevisionState

@@ -106,7 +101,7 @@ class RevisionStateString(TypeDecorator):

     def process_result_value(self, value, dialect):
         if not value:
-            return
+            return RevisionState.PENDING_VALIDATION

         return RevisionState[value]

@@ -124,20 +119,31 @@ class OperationString(TypeDecorator):
         return Operation[value]


-class
+class TaskStateString(TypeDecorator):
     impl = String(255)

-    def process_bind_param(self, value:
+    def process_bind_param(self, value: TaskState, dialect):
         return value.value

     def process_result_value(self, value, dialect):
         if not value:
             return value

-        return
+        return TaskState[value]
+

+class IngestionJobStateString(TypeDecorator):
+    impl = String(255)
+
+    def process_bind_param(self, value: IngestionJobState, dialect):
+        return value.value
+
+    def process_result_value(self, value, dialect):
+        if not value:
+            return value
+
+        return IngestionJobState[value]

-mapper_registry = registry()

 metadata = MetaData()

@@ -193,40 +199,7 @@ file_table = Table(
     ),
 )

-
-mapper_registry.map_imperatively(
-    Dataset,
-    dataset_table,
-    properties={
-        "revisions": relationship(
-            Revision,
-            backref="dataset",
-            order_by=revision_table.c.revision_id,
-            lazy="selectin",
-            cascade="all, delete-orphan",
-        ),
-    },
-)
-
-mapper_registry.map_imperatively(
-    Revision,
-    revision_table,
-    properties={
-        "modified_files": relationship(
-            File,
-            order_by=file_table.c.file_id,
-            primaryjoin="and_(Revision.revision_id==File.revision_id, Revision.dataset_id==File.dataset_id)",
-            lazy="selectin",
-            cascade="all, delete-orphan",
-        )
-    },
-)
-
-
-mapper_registry.map_imperatively(File, file_table)
-
-
-ingestion_job_summary = Table(
+ingestion_job_summary_table = Table(
     "ingestion_job_summary",
     metadata,
     Column("ingestion_job_summary_id", String(255), primary_key=True),
@@ -238,18 +211,25 @@ ingestion_job_summary = Table(
     Column(
         "data_spec_versions",
         JSONType(
-            serializer=lambda data_spec_versions:
+            serializer=lambda data_spec_versions: {
+                key: list(value) for key, value in data_spec_versions.items()
+            },
             deserializer=lambda data_spec_versions: DataSpecVersionCollection.from_dict(
                 data_spec_versions
             ),
         ),
     ),
     Column(
-        "selector",
+        "selector",
+        JSONType(
+            serializer=lambda selector: selector.filtered_attributes,
+            deserializer=lambda selector: Selector(**selector),
+        ),
     ),
     Column("started_at", TZDateTime(6)),
-    Column("
+    Column("ended_at", TZDateTime(6)),
     # Some task counters
+    Column("state", IngestionJobStateString),
     Column("successful_tasks", Integer),
     Column("ignored_successful_tasks", Integer),
     Column("skipped_datasets", Integer),
@@ -258,7 +238,10 @@ ingestion_job_summary = Table(
         "timings",
         JSONType(
             serializer=lambda timings: [
-
+                # Timing is probably already a dictionary. Load it into Timing first, so it can be dumped
+                # in json mode
+                Timing.model_validate(timing).model_dump(mode="json")
+                for timing in timings
             ],
             deserializer=lambda timings: [
                 Timing.model_validate(timing) for timing in timings
@@ -299,12 +282,13 @@ task_summary_table = Table(
     Column("persisted_file_count", Integer),
     Column("bytes_retrieved", Integer),
     Column("last_modified", TZDateTime(6)),
-    Column("
+    Column("state", TaskStateString),
     Column(
         "timings",
         JSONType(
             serializer=lambda timings: [
-                timing.model_dump(mode="json")
+                Timing.model_validate(timing).model_dump(mode="json")
+                for timing in timings
             ],
             deserializer=lambda timings: [
                 Timing.model_validate(timing) for timing in timings
@@ -316,21 +300,54 @@ task_summary_table = Table(
     # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
     # Column("source", JSONType()),
 )
-
-
-mapper_registry
-
-
-
-
-
-
-
-
-
-
-)
-
-
-
+#
+#
+# mapper_registry = registry()
+#
+# mapper_registry.map_imperatively(
+#     Dataset,
+#     dataset_table,
+#     properties={
+#         "revisions": relationship(
+#             Revision,
+#             backref="dataset",
+#             order_by=revision_table.c.revision_id,
+#             lazy="selectin",
+#             cascade="all, delete-orphan",
+#         ),
+#     },
+# )
+#
+# mapper_registry.map_imperatively(
+#     Revision,
+#     revision_table,
+#     properties={
+#         "modified_files": relationship(
+#             File,
+#             order_by=file_table.c.file_id,
+#             primaryjoin="and_(Revision.revision_id==File.revision_id, Revision.dataset_id==File.dataset_id)",
+#             lazy="selectin",
+#             cascade="all, delete-orphan",
+#         )
+#     },
+# )
+#
+#
+# mapper_registry.map_imperatively(File, file_table)
+#
+# mapper_registry.map_imperatively(
+#     IngestionJobSummary,
+#     ingestion_job_summary,
+#     properties={
+#         "task_summaries": relationship(
+#             TaskSummary,
+#             backref="ingestion_job_summary",
+#             # order_by=task_summary_table.c.revision_id,
+#             lazy="selectin",
+#             cascade="all, delete-orphan",
+#         ),
+#     },
+# )
+#
+#
+# mapper_registry.map_imperatively(TaskSummary, task_summary_table)
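The enum columns ("state" on both the ingestion-job-summary and task-summary tables) go through TypeDecorators that store the enum's string value and look the member back up by name when reading. A minimal sketch of that pattern with a stand-in enum:

    from enum import Enum
    from sqlalchemy import String, TypeDecorator

    class TaskState(str, Enum):
        RUNNING = "RUNNING"
        FINISHED = "FINISHED"
        FAILED = "FAILED"

    class TaskStateString(TypeDecorator):
        impl = String(255)
        cache_ok = True

        def process_bind_param(self, value, dialect):
            # Persist the plain string value of the enum member.
            return value.value if value is not None else None

        def process_result_value(self, value, dialect):
            if not value:
                return value
            # Members are keyed by name, which equals the stored value here.
            return TaskState[value]

    # Usage in a table definition: Column("state", TaskStateString)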
{ingestify-0.3.3.dist-info → ingestify-0.3.4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.3.3
+Version: 0.3.4
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -227,23 +227,23 @@ dataset_collection = store.get_dataset_collection(
 store.map(
     lambda dataset: (
         store
-
-
-
-
-
-
-        "*",
-        match_id=dataset.
-        competition_id=dataset.
-        season_id=dataset.
-
+
+        # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
+        .load_with_kloppy(dataset)
+
+        # Convert it into a polars dataframe using all columns in the original data and some more additional ones
+        .to_df(
+            "*",
+            match_id=dataset.dataset_resource_id.match_id,
+            competition_id=dataset.dataset_resource_id.competition_id,
+            season_id=dataset.dataset_resource_id.season_id,
+
             engine="polars"
         )
-
-
-
-        f"/tmp/files/blaat/{dataset.
+
+        # Write to parquet format
+        .write_parquet(
+            f"/tmp/files/blaat/{dataset.dataset_resource_id.match_id}.parquet"
+        )
     ),
     dataset_collection,
{ingestify-0.3.3.dist-info → ingestify-0.3.4.dist-info}/RECORD
CHANGED

@@ -1,18 +1,18 @@
-ingestify/__init__.py,sha256=
+ingestify/__init__.py,sha256=lyBZ_P8y4qlkE1e11F4T41fSTp8WbReifRxX9UGizxA,301
 ingestify/cmdline.py,sha256=bIuyPgGEw4wIglNzpG9zp7TsJozsP8NSVsCe4eAyWUg,7189
-ingestify/exceptions.py,sha256=
+ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
 ingestify/main.py,sha256=Xr0VbGgstPO7doDX18xqk4lBb4W2sbGWtQuXZaARsHA,8763
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
 ingestify/utils.py,sha256=HETGhAoUlutLG0cQR63nac2JbFei9gnktDHeBQoYWfU,5692
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/application/dataset_store.py,sha256=
+ingestify/application/dataset_store.py,sha256=c10EIxzOfO4ksKwPOI9jcOn33j54QWu_qXOMLwe-Y-A,11617
 ingestify/application/ingestion_engine.py,sha256=PtMjKMpvfqB802G5zfKLzyamdH7qFOXl3x6_97y8w60,2288
 ingestify/application/loader.py,sha256=v8ZcpMDEml9k_uFPFqT4WaCjXED_OIpAr7g0Pz5Hp6Y,7153
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
 ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
-ingestify/domain/models/base.py,sha256=
+ingestify/domain/models/base.py,sha256=4gKbREajxJHlS-VwKoosNtHVupZ4eDLKMqnJ4ib0aS8,184
 ingestify/domain/models/data_spec_version_collection.py,sha256=CAXlO4W2AOOWAPdPAuymqBHnJpiYtkr2z7fYFJ3HSCk,1372
 ingestify/domain/models/fetch_policy.py,sha256=d7K1TzliNJXxqaqzqEOQWLhvgIvmmqhUQEliXvSUcTs,1405
 ingestify/domain/models/sink.py,sha256=OBVfFMpB7puJmHg4q2KYx4qgoAnlmX8xKWYnPi8a9pc,178
@@ -21,15 +21,15 @@ ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0T
 ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
 ingestify/domain/models/dataset/collection.py,sha256=E2utQ6oyaFFrfQFMiwP9J_I7Wm21z0sRvE4Zc3QEs20,1310
 ingestify/domain/models/dataset/collection_metadata.py,sha256=gI5cb9M0QRsheIr2jA71wOyWfI5lGx5ES2Qw7rbDIoA,371
-ingestify/domain/models/dataset/dataset.py,sha256=
+ingestify/domain/models/dataset/dataset.py,sha256=6iQgBApRK08GhxArnJjjE9SuJMMOsKx_gI6JDHy5nZc,2970
 ingestify/domain/models/dataset/dataset_repository.py,sha256=kUjiqW58kOUOli1gZCLR5xw4dBX0bqI1UJsf16hgNsQ,812
 ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4FVk_T-ZNUDezkvt7VzY,220
 ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
-ingestify/domain/models/dataset/file.py,sha256=
+ingestify/domain/models/dataset/file.py,sha256=1Thdv6A1YmC1UfutaRf2q3FGHQYO0SWEptCxur6Ahfs,4144
 ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
 ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
-ingestify/domain/models/dataset/revision.py,sha256=
+ingestify/domain/models/dataset/revision.py,sha256=HPOZpVmQSwdcsr90RNVlOQ7c1_W7grzi5E1NOiEK92g,1331
 ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
 ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
 ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
@@ -39,15 +39,15 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256=
-ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=H3vnEUS3izuNJfmD7ZGbznemX9r2JZ1po7D7D9ArzwM,13230
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=cgm8kLoX3eK9SkBYe5HhwA7kg5FAyN4kfTCJrVHaRlc,4702
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
 ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
 ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
 ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
 ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
-ingestify/domain/models/task/task_summary.py,sha256=
+ingestify/domain/models/task/task_summary.py,sha256=Ncf6ij_aLkElZOsBgep-kd82FyzHjr5xjhAbAXNRJUs,3757
 ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/domain/services/identifier_key_transformer.py,sha256=y4GS9u9Ej1MO2jUhAxWbifp0mrE_MqTHvVVcoQzSKb4,4034
 ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -55,7 +55,7 @@ ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-
 ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
 ingestify/infra/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/fetch/http.py,sha256=ldaXy6alBbI9z63H97lXfYZNT0ZCBkTac1W6-acNjjY,4127
-ingestify/infra/serialization/__init__.py,sha256
+ingestify/infra/serialization/__init__.py,sha256=-i8XLJDI2hwlX65JITcIzuOaGLJaNekgG9OfA6L7Enc,1035
 ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
 ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -64,8 +64,8 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/
-ingestify/infra/store/dataset/sqlalchemy/
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=86BqLhj5pB45iNSfYWbuMNwo-9KnGbbSYtdD8WJw_qo,16003
+ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=b73jqpW-_QubtZpFJv7BTKdTsKbufESP0O1uJCmFfBE,10106
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.3.
-ingestify-0.3.
-ingestify-0.3.
-ingestify-0.3.
-ingestify-0.3.
+ingestify-0.3.4.dist-info/METADATA,sha256=v5rEF3343auBHwK8K5Zu0C8tTYfm0WjGtyZs0SmY3xg,18854
+ingestify-0.3.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.3.4.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.3.4.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.3.4.dist-info/RECORD,,

{ingestify-0.3.3.dist-info → ingestify-0.3.4.dist-info}/WHEEL
File without changes

{ingestify-0.3.3.dist-info → ingestify-0.3.4.dist-info}/entry_points.txt
File without changes

{ingestify-0.3.3.dist-info → ingestify-0.3.4.dist-info}/top_level.txt
File without changes