ingestify 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ingestify/__init__.py CHANGED
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
  from .infra import retrieve_http
  from .source_base import Source, DatasetResource

- __version__ = "0.3.3"
+ __version__ = "0.3.4"
@@ -58,8 +58,7 @@ class DatasetStore:
  self.event_bus.dispatch(event)

  def save_ingestion_job_summary(self, ingestion_job_summary):
- self.dataset_repository.session.add(ingestion_job_summary)
- self.dataset_repository.session.commit()
+ self.dataset_repository.save_ingestion_job_summary(ingestion_job_summary)

  def get_dataset_collection(
  self,
@@ -298,8 +297,8 @@ class DatasetStore:
  )

  loaded_file = LoadedFile(
- _stream=get_stream if lazy else get_stream(file),
- **asdict(file),
+ stream_=get_stream if lazy else get_stream(file),
+ **file.model_dump(),
  )
  files[file.file_id] = loaded_file
  return FileCollection(files, auto_rewind=auto_rewind)
@@ -1,22 +1,5 @@
- from functools import partial
- from typing import ClassVar, Any, Optional
-
- import pydantic
  from pydantic import BaseModel as PydanticBaseModel, ConfigDict


- # class BaseModel(PydanticBaseModel):
- # model_config = ConfigDict(arbitrary_types_allowed=True)
- #
- # _sa_instance_state: Optional[dict] = None
- from sqlalchemy.orm import MappedAsDataclass
-
-
- class BaseModel(
- MappedAsDataclass,
- # DeclarativeBase,
- dataclass_callable=partial(
- pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
- ),
- ):
- pass
+ class BaseModel(PydanticBaseModel):
+ model_config = ConfigDict(arbitrary_types_allowed=True, from_attributes=True)
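
The replacement BaseModel drops the SQLAlchemy MappedAsDataclass hybrid and becomes a plain pydantic model; `from_attributes=True` is what later lets `model_validate` be fed SQLAlchemy rows directly. A minimal sketch of that behaviour, with an illustrative `RowLike` class standing in for a database row (not an ingestify class):

from pydantic import BaseModel, ConfigDict


class RowLike:
    """Stands in for a SQLAlchemy row or ORM object: data lives in attributes."""
    def __init__(self, dataset_id: str, provider: str):
        self.dataset_id = dataset_id
        self.provider = provider


class DatasetModel(BaseModel):
    model_config = ConfigDict(from_attributes=True)

    dataset_id: str
    provider: str


# Without from_attributes=True this would raise a ValidationError,
# because RowLike is not a dict.
model = DatasetModel.model_validate(RowLike("abc-123", "statsbomb"))
print(model.model_dump())  # {'dataset_id': 'abc-123', 'provider': 'statsbomb'}
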
@@ -1,7 +1,7 @@
  from datetime import datetime
  from enum import Enum
  from typing import List, Optional
- from pydantic import Field
+ from pydantic import Field, field_validator

  from ingestify.utils import utcnow
  from .dataset_state import DatasetState
@@ -24,6 +24,13 @@ class Dataset(BaseModel):
  updated_at: datetime
  revisions: List[Revision] = Field(default_factory=list)

+ @field_validator("identifier", mode="before")
+ @classmethod
+ def parse_identifier(cls, value):
+ if not isinstance(value, Identifier):
+ return Identifier(value)
+ return value
+
  @property
  def is_complete(self):
  return self.state.is_complete
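
The `mode="before"` validator runs on the raw input before field parsing, so identifier dicts coming out of JSON columns are wrapped into an `Identifier` while existing instances pass through untouched. A self-contained sketch of the same pattern; the `Identifier` class below is a simplified stand-in, not ingestify's implementation:

from pydantic import BaseModel, ConfigDict, field_validator


class Identifier:
    """Stand-in for ingestify's dict-like Identifier."""
    def __init__(self, attributes: dict):
        self.attributes = dict(attributes)


class Dataset(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    identifier: Identifier

    @field_validator("identifier", mode="before")
    @classmethod
    def parse_identifier(cls, value):
        # Raw dicts (e.g. from a JSON column) get wrapped; instances pass through.
        if not isinstance(value, Identifier):
            return Identifier(value)
        return value


dataset = Dataset.model_validate({"identifier": {"match_id": 42}})
print(type(dataset.identifier).__name__)  # Identifier
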
@@ -116,18 +116,18 @@ class LoadedFile(BaseModel):
  data_serialization_format: Optional[str] # Example: 'json'
  storage_compression_method: Optional[str] # Example: 'gzip'
  storage_path: Path
- _stream: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
+ stream_: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
  revision_id: Optional[int] = None # This can be used when a Revision is squashed

  def load_stream(self):
- if callable(self._stream):
- self._stream = self._stream(self)
+ if callable(self.stream_):
+ self.stream_ = self.stream_(self)

  @property
  def stream(self):
- if callable(self._stream):
+ if callable(self.stream_):
  raise Exception("You should load the stream first using `load_stream`")
- return self._stream
+ return self.stream_


  __all__ = ["File", "DraftFile", "LoadedFile"]
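
The `_stream` to `stream_` rename matters because pydantic v2 treats underscore-prefixed names as private attributes rather than fields, so they are skipped during validation and excluded from `model_dump()`. A small illustration with generic field names:

from pydantic import BaseModel


class WithPrivate(BaseModel):
    _stream: object = None   # private attribute: not a field, excluded from dumps


class WithField(BaseModel):
    stream_: object = None   # regular field: validated and included in dumps


print(WithPrivate().model_dump())               # {}
print(WithField(stream_="data").model_dump())   # {'stream_': 'data'}
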
@@ -1,6 +1,6 @@
  from datetime import datetime
  from enum import Enum
- from typing import Dict, List
+ from typing import Dict, List, Optional

  from typing_extensions import TypedDict

@@ -32,7 +32,7 @@ class Revision(BaseModel):
  created_at: datetime
  description: str
  modified_files: List[File]
- source: RevisionSource
+ source: Optional[RevisionSource]
  is_squashed: bool = False
  state: RevisionState = RevisionState.PENDING_VALIDATION

@@ -2,6 +2,7 @@ import itertools
  import json
  import logging
  import uuid
+ from enum import Enum
  from typing import Optional, Iterator

  from ingestify import retrieve_http
@@ -17,6 +18,7 @@ from ingestify.domain.models.resources.dataset_resource import (
  DatasetResource,
  )
  from ingestify.domain.models.task.task_summary import TaskSummary
+ from ingestify.exceptions import SaveError
  from ingestify.utils import TaskExecutor, chunker

  logger = logging.getLogger(__name__)
@@ -120,21 +122,27 @@ class UpdateDatasetTask(Task):
  with TaskSummary.update(
  self.task_id, dataset_identifier=dataset_identifier
  ) as task_summary:
- revision = self.store.update_dataset(
- dataset=self.dataset,
- name=self.dataset_resource.name,
- state=self.dataset_resource.state,
- metadata=self.dataset_resource.metadata,
- files={
- file_id: task_summary.record_load_file(
- lambda: load_file(file_resource, dataset=self.dataset),
- metadata={"file_id": file_id},
- )
- for file_id, file_resource in self.dataset_resource.files.items()
- },
- revision_source=revision_source,
- )
- task_summary.set_stats_from_revision(revision)
+
+ files = {
+ file_id: task_summary.record_load_file(
+ lambda: load_file(file_resource, dataset=self.dataset),
+ metadata={"file_id": file_id},
+ )
+ for file_id, file_resource in self.dataset_resource.files.items()
+ }
+
+ try:
+ revision = self.store.update_dataset(
+ dataset=self.dataset,
+ name=self.dataset_resource.name,
+ state=self.dataset_resource.state,
+ metadata=self.dataset_resource.metadata,
+ files=files,
+ revision_source=revision_source,
+ )
+ task_summary.set_stats_from_revision(revision)
+ except Exception as e:
+ raise SaveError("Could not update dataset") from e

  return task_summary

@@ -159,24 +167,28 @@ class CreateDatasetTask(Task):
  )

  with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
- revision = self.store.create_dataset(
- dataset_type=self.dataset_resource.dataset_type,
- provider=self.dataset_resource.provider,
- dataset_identifier=dataset_identifier,
- name=self.dataset_resource.name,
- state=self.dataset_resource.state,
- metadata=self.dataset_resource.metadata,
- files={
- file_id: task_summary.record_load_file(
- lambda: load_file(file_resource, dataset=None),
- metadata={"file_id": file_id},
- )
- for file_id, file_resource in self.dataset_resource.files.items()
- },
- revision_source=revision_source,
- )
+ files = {
+ file_id: task_summary.record_load_file(
+ lambda: load_file(file_resource, dataset=None),
+ metadata={"file_id": file_id},
+ )
+ for file_id, file_resource in self.dataset_resource.files.items()
+ }
+ try:
+ revision = self.store.create_dataset(
+ dataset_type=self.dataset_resource.dataset_type,
+ provider=self.dataset_resource.provider,
+ dataset_identifier=dataset_identifier,
+ name=self.dataset_resource.name,
+ state=self.dataset_resource.state,
+ metadata=self.dataset_resource.metadata,
+ files=files,
+ revision_source=revision_source,
+ )

- task_summary.set_stats_from_revision(revision)
+ task_summary.set_stats_from_revision(revision)
+ except Exception as e:
+ raise SaveError("Could not create dataset") from e

  return task_summary
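
Both tasks now collect the files up front and wrap any failure of the store call in the new `SaveError`, chaining the original exception with `raise ... from e`. A sketch of the chaining behaviour; `update_dataset` below is a fake stand-in that simply fails:

class IngestifyError(Exception):
    pass


class SaveError(IngestifyError):
    pass


def update_dataset():
    raise ValueError("unique constraint violated")  # stand-in for a database error


try:
    try:
        update_dataset()
    except Exception as e:
        # Chain the low-level error so tracebacks still show the root cause.
        raise SaveError("Could not update dataset") from e
except SaveError as err:
    print(err)            # Could not update dataset
    print(err.__cause__)  # unique constraint violated
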

@@ -202,6 +214,9 @@ class IngestionJob:
  self, store: DatasetStore, task_executor: TaskExecutor
  ) -> Iterator[IngestionJobSummary]:
  is_first_chunk = True
+ ingestion_job_exception = (
+ None # Indicate if there was an exception during the IngestionJob itself
+ )
  ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
  # Process all items in batches. Yield a IngestionJobSummary per batch

@@ -219,26 +234,37 @@
  # 1. The discover_datasets returns a list, and the entire list can be processed at once
  # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
  with ingestion_job_summary.record_timing("find_datasets"):
- # Timing might be incorrect as it is an iterator
- dataset_resources = self.ingestion_plan.source.find_datasets(
- dataset_type=self.ingestion_plan.dataset_type,
- data_spec_versions=self.selector.data_spec_versions,
- dataset_collection_metadata=dataset_collection_metadata,
- **self.selector.custom_attributes,
- )
+ try:
+ dataset_resources = self.ingestion_plan.source.find_datasets(
+ dataset_type=self.ingestion_plan.dataset_type,
+ data_spec_versions=self.selector.data_spec_versions,
+ dataset_collection_metadata=dataset_collection_metadata,
+ **self.selector.custom_attributes,
+ )

- finish_task_timer = ingestion_job_summary.start_timing("tasks")
+ # We need to include the to_batches as that will start the generator
+ batches = to_batches(dataset_resources)
+ except Exception as e:
+ logger.exception("Failed to find datasets")

- batches = to_batches(dataset_resources)
+ ingestion_job_summary.set_exception(e)
+ yield ingestion_job_summary
+ return
+
+ finish_task_timer = ingestion_job_summary.start_timing("tasks")

  while True:
  try:
  batch = next(batches)
  except StopIteration:
  break
- except Exception:
- # TODO: handle exception on IngestionJob level
- raise
+ except Exception as e:
+ logger.exception("Failed to fetch next batch")
+
+ finish_task_timer()
+ ingestion_job_summary.set_exception(e)
+ yield ingestion_job_summary
+ return

  dataset_identifiers = [
  Identifier.create_from_selector(
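
Because `find_datasets` may return either a list or a generator, errors can surface when the call is made or only later when `next()` pulls a batch; both paths now record the exception on the summary and yield it instead of crashing the whole run. A minimal sketch of that pull-and-catch loop, with an illustrative generator that fails mid-iteration:

def find_datasets():
    yield ["dataset-1", "dataset-2"]
    raise RuntimeError("upstream API returned 500")  # fails while iterating


batches = find_datasets()
while True:
    try:
        batch = next(batches)
    except StopIteration:
        break  # normal end of the generator
    except Exception as e:
        print(f"failed while fetching next batch: {e}")
        break  # record the failure and stop, instead of re-raising
    print(f"processing {batch}")
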
@@ -1,12 +1,13 @@
  import uuid
  from contextlib import contextmanager
  from datetime import datetime, timedelta
+ from enum import Enum
  from typing import Optional, List, TYPE_CHECKING
  from pydantic import Field

  from ingestify.domain import Selector, DataSpecVersionCollection
  from ingestify.domain.models.base import BaseModel
- from ingestify.domain.models.task.task_summary import TaskSummary, TaskStatus
+ from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
  from ingestify.domain.models.timing import Timing
  from ingestify.utils import utcnow

@@ -14,6 +15,12 @@ if TYPE_CHECKING:
  from ingestify.domain.models.ingestion.ingestion_job import IngestionJob


+ class IngestionJobState(str, Enum):
+ RUNNING = "RUNNING"
+ FINISHED = "FINISHED"
+ FAILED = "FAILED"
+
+
  def format_duration(duration: timedelta):
  return f"{duration.total_seconds():.2f}sec"
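
One property of the new `IngestionJobState` worth noting: the `str` mixin keeps the states interchangeable with plain strings, which simplifies JSON and database round-trips. A tiny illustration:

from enum import Enum


class IngestionJobState(str, Enum):
    RUNNING = "RUNNING"
    FINISHED = "FINISHED"
    FAILED = "FAILED"


# Members compare equal to their plain string values and can be rebuilt from them.
print(IngestionJobState.FAILED == "FAILED")  # True
print(IngestionJobState("RUNNING").name)     # RUNNING
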

@@ -30,7 +37,8 @@ class IngestionJobSummary(BaseModel):
  selector: Selector

  started_at: datetime = Field(default_factory=utcnow)
- finished_at: Optional[datetime] = None
+ ended_at: Optional[datetime] = None
+ state: IngestionJobState = IngestionJobState.RUNNING
  timings: List[Timing] = Field(default_factory=list)
  task_summaries: List[TaskSummary] = Field(default_factory=list)

@@ -55,8 +63,10 @@ class IngestionJobSummary(BaseModel):
  @contextmanager
  def record_timing(self, name: str):
  start = utcnow()
- yield
- self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+ try:
+ yield
+ finally:
+ self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

  def start_timing(self, name):
  start = utcnow()
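
Moving the `yield` into try/finally means a Timing entry is appended even when the timed block raises, which the new failure paths above depend on. The same shape in isolation, using a generic perf_counter-based timer rather than ingestify's Timing model:

import time
from contextlib import contextmanager


@contextmanager
def record_timing(name: str, timings: list):
    start = time.perf_counter()
    try:
        yield
    finally:
        # Runs on success *and* when the timed block raises.
        timings.append((name, time.perf_counter() - start))


timings = []
try:
    with record_timing("find_datasets", timings):
        raise RuntimeError("source unavailable")
except RuntimeError:
    pass

print(timings)  # [('find_datasets', <elapsed seconds>)]
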
@@ -75,28 +85,36 @@ class IngestionJobSummary(BaseModel):
  def task_count(self):
  return len(self.task_summaries)

- def set_finished(self):
+ def _set_ended(self):
  self.failed_tasks = len(
- [task for task in self.task_summaries if task.status == TaskStatus.FAILED]
+ [task for task in self.task_summaries if task.state == TaskState.FAILED]
  )
  self.successful_tasks = len(
- [task for task in self.task_summaries if task.status == TaskStatus.FINISHED]
+ [task for task in self.task_summaries if task.state == TaskState.FINISHED]
  )
  self.ignored_successful_tasks = len(
  [
  task
  for task in self.task_summaries
- if task.status == TaskStatus.FINISHED_IGNORED
+ if task.state == TaskState.FINISHED_IGNORED
  ]
  )
- self.finished_at = utcnow()
+ self.ended_at = utcnow()
+
+ def set_finished(self):
+ self.state = IngestionJobState.FINISHED
+ self._set_ended()
+
+ def set_exception(self, e: Exception):
+ self.state = IngestionJobState.FAILED
+ self._set_ended()

  @property
  def duration(self) -> timedelta:
- return self.finished_at - self.started_at
+ return self.ended_at - self.started_at

  def output_report(self):
- print(f"\nIngestionJobSummary finished in {format_duration(self.duration)}")
+ print(f"\nIngestionJobSummary {self.state} in {format_duration(self.duration)}")
  print("--------------------")
  print(f" - IngestionPlan:")
  print(f" Source: {self.source_name}")
@@ -16,7 +16,7 @@ from ingestify.utils import utcnow
  logger = logging.getLogger(__name__)


- class TaskStatus(str, Enum):
+ class TaskState(str, Enum):
  RUNNING = "RUNNING"
  FINISHED = "FINISHED"
  FINISHED_IGNORED = "FINISHED_IGNORED" # Finished, but didn't produce any new data
@@ -37,7 +37,7 @@ class TaskSummary(BaseModel):
  persisted_file_count: int = 0
  bytes_retrieved: int = 0
  last_modified: Optional[datetime] = None
- status: TaskStatus = TaskStatus.RUNNING
+ state: TaskState = TaskState.RUNNING
  timings: List[Timing] = Field(default_factory=list)

  @field_validator("dataset_identifier", mode="before")
@@ -83,10 +83,10 @@ class TaskSummary(BaseModel):
  try:
  yield task_summary

- task_summary.set_status(TaskStatus.FINISHED)
+ task_summary.set_state(TaskState.FINISHED)
  except Exception as e:
  logger.exception(f"Failed to execute task.")
- task_summary.set_status(TaskStatus.FAILED)
+ task_summary.set_state(TaskState.FAILED)

  # When the error comes from our own code, make sure it will be raised to the highest level
  # raise
@@ -111,8 +111,8 @@ class TaskSummary(BaseModel):
  file.modified_at for file in revision.modified_files
  )
  else:
- self.status = TaskStatus.FINISHED_IGNORED
+ self.state = TaskState.FINISHED_IGNORED

- def set_status(self, status: TaskStatus):
- if self.status == TaskStatus.RUNNING:
- self.status = status
+ def set_state(self, state: TaskState):
+ if self.state == TaskState.RUNNING:
+ self.state = state
ingestify/exceptions.py CHANGED
@@ -8,3 +8,7 @@ class ConfigurationError(IngestifyError):

  class DuplicateFile(IngestifyError):
  pass
+
+
+ class SaveError(IngestifyError):
+ pass
@@ -7,44 +7,27 @@ from dataclass_factory.schema_helpers import type_checker

  from ingestify.domain import DatasetCreated, Identifier
  from ingestify.domain.models.dataset.events import MetadataUpdated, RevisionAdded
+ from ingestify.domain.models.event import DomainEvent

- isotime_schema = Schema(
- parser=lambda x: datetime.fromisoformat(x.replace("Z", "+00:00")), # type: ignore
- serializer=lambda x: datetime.isoformat(x).replace("+00:00", "Z"),
- )
-
- identifier_schema = Schema(
- # json.loads(x) for backwards compatibility
- parser=lambda x: Identifier(x if isinstance(x, dict) else json.loads(x)),
- serializer=lambda x: dict(x),
- )
-
- factory = Factory(
- schemas={
- datetime: isotime_schema,
- Identifier: identifier_schema,
- DatasetCreated: Schema(
- pre_parse=type_checker(DatasetCreated.event_type, "event_type")
- ),
- MetadataUpdated: Schema(
- pre_parse=type_checker(MetadataUpdated.event_type, "event_type")
- ),
- RevisionAdded: Schema(
- pre_parse=type_checker(RevisionAdded.event_type, "event_type")
- ),
- # ClipSelectionContent: Schema(pre_parse=type_checker(ClipSelectionContent.content_type, field="contentType")),
- # TeamInfoImageContent: Schema(pre_parse=type_checker(TeamInfoImageContent.content_type, field="contentType")),
- # StaticVideoContent: Schema(pre_parse=type_checker(StaticVideoContent.content_type, field="contentType"))
- },
- default_schema=Schema(),
- )
-
- T = TypeVar("T")
-
-
- def serialize(data: T, class_: Type[T] = None) -> Any:
- return factory.dump(data, class_)
-
-
- def unserialize(data: Any, class_: Type[T]) -> T:
- return factory.load(data, class_)
+
+ event_types = {
+ DatasetCreated.event_type: DatasetCreated,
+ RevisionAdded.event_type: RevisionAdded,
+ }
+
+
+ def deserialize(event_dict: dict) -> DomainEvent:
+ event_cls = event_types[event_dict["event_type"]]
+ event_dict["dataset"]["identifier"] = Identifier(
+ **event_dict["dataset"]["identifier"]
+ )
+
+ return event_cls.model_validate(event_dict)
+
+
+ def serialize(event: DomainEvent) -> dict:
+ event_dict = event.model_dump(mode="json")
+
+ # Make sure event_type is always part of the event_dict. Pydantic might skip it when the type is ClassVar
+ event_dict["event_type"] = event.event_type
+ return event_dict
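
The dataclass_factory-based Factory is replaced by plain pydantic dumps plus an `event_type` lookup table. A simplified round-trip sketch of that approach; the event classes below are stand-ins, not ingestify's DomainEvent subclasses:

from typing import ClassVar

from pydantic import BaseModel


class DatasetCreated(BaseModel):
    event_type: ClassVar[str] = "dataset_created"
    dataset_id: str


class RevisionAdded(BaseModel):
    event_type: ClassVar[str] = "revision_added"
    dataset_id: str
    revision_id: int


event_types = {cls.event_type: cls for cls in (DatasetCreated, RevisionAdded)}


def serialize(event: BaseModel) -> dict:
    event_dict = event.model_dump(mode="json")
    # ClassVars are not dumped by pydantic, so add the discriminator explicitly.
    event_dict["event_type"] = event.event_type
    return event_dict


def deserialize(event_dict: dict) -> BaseModel:
    return event_types[event_dict["event_type"]].model_validate(event_dict)


payload = serialize(RevisionAdded(dataset_id="abc", revision_id=2))
print(deserialize(payload))  # dataset_id='abc' revision_id=2
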
@@ -1,13 +1,24 @@
+ import itertools
  import json
  import uuid
+ from collections import defaultdict
  from typing import Optional, Union, List

- from sqlalchemy import create_engine, func, text, tuple_
+ from sqlalchemy import (
+ create_engine,
+ func,
+ text,
+ tuple_,
+ Table,
+ insert,
+ Transaction,
+ Connection,
+ )
  from sqlalchemy.engine import make_url
  from sqlalchemy.exc import NoSuchModuleError
  from sqlalchemy.orm import Session, joinedload

- from ingestify.domain import File
+ from ingestify.domain import File, Revision
  from ingestify.domain.models import (
  Dataset,
  DatasetCollection,
@@ -15,11 +26,22 @@ from ingestify.domain.models import (
  Identifier,
  Selector,
  )
+ from ingestify.domain.models.base import BaseModel
  from ingestify.domain.models.dataset.collection_metadata import (
  DatasetCollectionMetadata,
  )
-
- from .mapping import dataset_table, metadata
+ from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobSummary
+ from ingestify.domain.models.task.task_summary import TaskSummary
+ from ingestify.exceptions import IngestifyError
+
+ from .tables import (
+ metadata,
+ dataset_table,
+ file_table,
+ revision_table,
+ ingestion_job_summary_table,
+ task_summary_table,
+ )


  def parse_value(v):
@@ -113,6 +135,31 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  def session(self):
  return self.session_provider.get()

+ def _upsert(self, connection: Connection, table: Table, entities: list[dict]):
+ dialect = self.session.bind.dialect.name
+ if dialect == "mysql":
+ from sqlalchemy.dialects.mysql import insert
+ elif dialect == "postgresql":
+ from sqlalchemy.dialects.postgresql import insert
+ elif dialect == "sqlite":
+ from sqlalchemy.dialects.sqlite import insert
+ else:
+ raise IngestifyError(f"Don't know how to do an upsert in {dialect}")
+
+ stmt = insert(table).values(entities)
+
+ primary_key_columns = [column for column in table.columns if column.primary_key]
+
+ set_ = {
+ name: getattr(stmt.excluded, name)
+ for name, column in table.columns.items()
+ if column not in primary_key_columns
+ }
+
+ stmt = stmt.on_conflict_do_update(index_elements=primary_key_columns, set_=set_)
+
+ connection.execute(stmt)
+
  def _filter_query(
  self,
  query,
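
The new `_upsert` imports the dialect-specific `insert` so it can emit a native upsert. Note that `on_conflict_do_update` is provided by SQLAlchemy's sqlite and postgresql dialects, while the mysql dialect exposes `on_duplicate_key_update` instead, so the mysql branch above may need that variant. A runnable SQLite-only sketch of the same idea (assuming SQLAlchemy 2.x):

from sqlalchemy import Column, MetaData, String, Table, create_engine
from sqlalchemy.dialects.sqlite import insert

metadata = MetaData()
dataset_table = Table(
    "dataset",
    metadata,
    Column("dataset_id", String(255), primary_key=True),
    Column("name", String(255)),
)

engine = create_engine("sqlite:///:memory:")
metadata.create_all(engine)


def upsert(connection, table, entities):
    stmt = insert(table).values(entities)
    pk_cols = [c for c in table.columns if c.primary_key]
    # Update every non-primary-key column with the value from the attempted insert.
    set_ = {
        name: getattr(stmt.excluded, name)
        for name, column in table.columns.items()
        if column not in pk_cols
    }
    connection.execute(stmt.on_conflict_do_update(index_elements=pk_cols, set_=set_))


with engine.connect() as conn:
    upsert(conn, dataset_table, [{"dataset_id": "d1", "name": "first"}])
    upsert(conn, dataset_table, [{"dataset_id": "d1", "name": "renamed"}])  # updates instead of failing
    conn.commit()
    print(conn.execute(dataset_table.select()).all())  # [('d1', 'renamed')]
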
@@ -122,11 +169,11 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  dataset_id: Optional[Union[str, List[str]]] = None,
  selector: Optional[Union[Selector, List[Selector]]] = None,
  ):
- query = query.filter(Dataset.bucket == bucket)
+ query = query.filter(dataset_table.c.bucket == bucket)
  if dataset_type:
- query = query.filter(Dataset.dataset_type == dataset_type)
+ query = query.filter(dataset_table.c.dataset_type == dataset_type)
  if provider:
- query = query.filter(Dataset.provider == provider)
+ query = query.filter(dataset_table.c.provider == provider)
  if dataset_id is not None:
  if isinstance(dataset_id, list):
  if len(dataset_id) == 0:
@@ -134,9 +181,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  # return an empty DatasetCollection
  return DatasetCollection()

- query = query.filter(Dataset.dataset_id.in_(dataset_id))
+ query = query.filter(dataset_table.c.dataset_id.in_(dataset_id))
  else:
- query = query.filter(Dataset.dataset_id == dataset_id)
+ query = query.filter(dataset_table.c.dataset_id == dataset_id)

  dialect = self.session.bind.dialect.name

@@ -175,7 +222,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  else:
  column = column.as_string()
  else:
- column = func.json_extract(Dataset.identifier, f"$.{k}")
+ column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
  columns.append(column)

  values = []
@@ -189,6 +236,60 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  query = query.filter(text(where))
  return query

+ def load_datasets(self, dataset_ids: list[str]) -> list[Dataset]:
+ if not dataset_ids:
+ return []
+
+ dataset_rows = list(
+ self.session.query(dataset_table).filter(
+ dataset_table.c.dataset_id.in_(dataset_ids)
+ )
+ )
+ revisions_per_dataset = {}
+ rows = (
+ self.session.query(revision_table)
+ .filter(revision_table.c.dataset_id.in_(dataset_ids))
+ .order_by(revision_table.c.dataset_id)
+ )
+
+ for dataset_id, revisions in itertools.groupby(
+ rows, key=lambda row: row.dataset_id
+ ):
+ revisions_per_dataset[dataset_id] = list(revisions)
+
+ files_per_revision = {}
+ rows = (
+ self.session.query(file_table)
+ .filter(file_table.c.dataset_id.in_(dataset_ids))
+ .order_by(file_table.c.dataset_id, file_table.c.revision_id)
+ )
+
+ for (dataset_id, revision_id), files in itertools.groupby(
+ rows, key=lambda row: (row.dataset_id, row.revision_id)
+ ):
+ files_per_revision[(dataset_id, revision_id)] = list(files)
+
+ datasets = []
+ for dataset_row in dataset_rows:
+ dataset_id = dataset_row.dataset_id
+ revisions = []
+ for revision_row in revisions_per_dataset.get(dataset_id, []):
+ files = [
+ File.model_validate(file_row)
+ for file_row in files_per_revision.get(
+ (dataset_id, revision_row.revision_id), []
+ )
+ ]
+ revision = Revision.model_validate(
+ {**revision_row._mapping, "modified_files": files}
+ )
+ revisions.append(revision)
+
+ datasets.append(
+ Dataset.model_validate({**dataset_row._mapping, "revisions": revisions})
+ )
+ return datasets
+
  def get_dataset_collection(
  self,
  bucket: str,
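
load_datasets rebuilds each Dataset aggregate from three flat queries; the `.order_by(...)` calls are what make the `itertools.groupby` calls correct, because groupby only merges adjacent rows. A short reminder of that behaviour:

import itertools

rows = [
    {"dataset_id": "a", "revision_id": 1},
    {"dataset_id": "b", "revision_id": 1},
    {"dataset_id": "a", "revision_id": 2},
]

# Unsorted input: "a" shows up as two separate groups.
print([k for k, _ in itertools.groupby(rows, key=lambda r: r["dataset_id"])])
# ['a', 'b', 'a']

# Sorted input (what ORDER BY dataset_id provides): one group per key.
rows.sort(key=lambda r: r["dataset_id"])
print([k for k, _ in itertools.groupby(rows, key=lambda r: r["dataset_id"])])
# ['a', 'b']
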
@@ -209,17 +310,20 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  )

  if not metadata_only:
- dataset_query = apply_query_filter(self.session.query(Dataset))
- datasets = list(dataset_query)
+ dataset_query = apply_query_filter(
+ self.session.query(dataset_table.c.dataset_id)
+ )
+ dataset_ids = [row.dataset_id for row in dataset_query]
+ datasets = self.load_datasets(dataset_ids)
  else:
  datasets = []

  metadata_result_row = apply_query_filter(
  self.session.query(
- func.min(File.modified_at).label("first_modified_at"),
- func.max(File.modified_at).label("last_modified_at"),
+ func.min(file_table.c.modified_at).label("first_modified_at"),
+ func.max(file_table.c.modified_at).label("last_modified_at"),
  func.count().label("row_count"),
- ).join(Dataset, Dataset.dataset_id == File.dataset_id)
+ ).join(dataset_table, dataset_table.c.dataset_id == file_table.c.dataset_id)
  ).first()
  dataset_collection_metadata = DatasetCollectionMetadata(*metadata_result_row)

@@ -228,12 +332,153 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  def save(self, bucket: str, dataset: Dataset):
  # Just make sure
  dataset.bucket = bucket
- self.session.add(dataset)
- self.session.commit()
+
+ self._save([dataset])
+
+ def connect(self):
+ return self.session_provider.engine.connect()
+
+ def _save(self, datasets: list[Dataset]):
+ """Only do upserts. Never delete. Rows get only deleted when an entire Dataset is removed."""
+ datasets_entities = []
+ revision_entities = []
+ file_entities = []
+
+ for dataset in datasets:
+ datasets_entities.append(dataset.model_dump(exclude={"revisions"}))
+ for revision in dataset.revisions:
+ revision_entities.append(
+ {
+ **revision.model_dump(
+ exclude={"is_squashed", "modified_files"}
+ ),
+ "dataset_id": dataset.dataset_id,
+ }
+ )
+ for file in revision.modified_files:
+ file_entities.append(
+ {
+ **file.model_dump(),
+ "dataset_id": dataset.dataset_id,
+ "revision_id": revision.revision_id,
+ }
+ )
+
+ with self.connect() as connection:
+ try:
+ self._upsert(connection, dataset_table, datasets_entities)
+ self._upsert(connection, revision_table, revision_entities)
+ self._upsert(connection, file_table, file_entities)
+ except Exception:
+ connection.rollback()
+ raise
+ else:
+ connection.commit()

  def destroy(self, dataset: Dataset):
- self.session.delete(dataset)
- self.session.commit()
+ with self.connect() as connection:
+ try:
+ # Delete modified files related to the dataset
+ file_table.delete().where(
+ file_table.c.dataset_id == dataset.dataset_id
+ ).execute()
+
+ # Delete revisions related to the dataset
+ revision_table.delete().where(
+ revision_table.c.dataset_id == dataset.dataset_id
+ ).execute()
+
+ # Delete the dataset itself
+ dataset_table.delete().where(
+ dataset_table.c.dataset_id == dataset.dataset_id
+ ).execute()
+
+ connection.commit()
+ except Exception:
+ connection.rollback()
+ raise

  def next_identity(self):
  return str(uuid.uuid4())
+
+ # TODO: consider moving the IngestionJobSummary methods to a different Repository
+ def save_ingestion_job_summary(self, ingestion_job_summary: IngestionJobSummary):
+ ingestion_job_summary_entities = [
+ ingestion_job_summary.model_dump(exclude={"task_summaries"})
+ ]
+ task_summary_entities = []
+ for task_summary in ingestion_job_summary.task_summaries:
+ task_summary_entities.append(
+ {
+ **task_summary.model_dump(),
+ "ingestion_job_summary_id": ingestion_job_summary.ingestion_job_summary_id,
+ }
+ )
+
+ with self.session_provider.engine.connect() as connection:
+ try:
+ self._upsert(
+ connection,
+ ingestion_job_summary_table,
+ ingestion_job_summary_entities,
+ )
+ if task_summary_entities:
+ self._upsert(connection, task_summary_table, task_summary_entities)
+ except Exception:
+ connection.rollback()
+ raise
+ else:
+ connection.commit()
+
+ def load_ingestion_job_summaries(self) -> list[IngestionJobSummary]:
+ ingestion_job_summary_ids = [
+ row.ingestion_job_summary_id
+ for row in self.session.query(
+ ingestion_job_summary_table.c.ingestion_job_summary_id
+ )
+ ]
+
+ ingestion_job_summary_rows = list(
+ self.session.query(ingestion_job_summary_table).filter(
+ ingestion_job_summary_table.c.ingestion_job_summary_id.in_(
+ ingestion_job_summary_ids
+ )
+ )
+ )
+
+ task_summary_entities_per_job_summary = {}
+ rows = (
+ self.session.query(task_summary_table)
+ .filter(
+ task_summary_table.c.ingestion_job_summary_id.in_(
+ ingestion_job_summary_ids
+ )
+ )
+ .order_by(task_summary_table.c.ingestion_job_summary_id)
+ )
+
+ for ingestion_job_summary_id, task_summaries_rows in itertools.groupby(
+ rows, key=lambda row: row.ingestion_job_summary_id
+ ):
+ task_summary_entities_per_job_summary[ingestion_job_summary_id] = list(
+ task_summaries_rows
+ )
+
+ ingestion_job_summaries = []
+ for ingestion_job_summary_row in ingestion_job_summary_rows:
+ task_summaries = [
+ TaskSummary.model_validate(row)
+ for row in task_summary_entities_per_job_summary.get(
+ ingestion_job_summary_row.ingestion_job_summary_id, []
+ )
+ ]
+
+ ingestion_job_summaries.append(
+ IngestionJobSummary.model_validate(
+ {
+ **ingestion_job_summary_row._mapping,
+ "task_summaries": task_summaries,
+ }
+ )
+ )
+ return ingestion_job_summaries
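
Persistence now goes through an explicit Connection with manual commit/rollback instead of the ORM session. One caveat: the chained `.delete().where(...).execute()` calls in `destroy` rely on the old implicit-execution style, which SQLAlchemy 2.x no longer supports; the statements would normally be passed to `connection.execute(...)` as in the sketch below (assuming the SQLAlchemy 2.x API, with an illustrative table):

from sqlalchemy import Column, MetaData, String, Table, create_engine

metadata = MetaData()
file_table = Table(
    "file",
    metadata,
    Column("file_id", String(255), primary_key=True),
    Column("dataset_id", String(255)),
)

engine = create_engine("sqlite:///:memory:")
metadata.create_all(engine)

with engine.connect() as connection:
    try:
        connection.execute(file_table.insert(), [{"file_id": "f1", "dataset_id": "d1"}])
        connection.execute(file_table.delete().where(file_table.c.dataset_id == "d1"))
    except Exception:
        connection.rollback()   # undo everything written on this connection
        raise
    else:
        connection.commit()     # make both statements visible to other connections
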
@@ -1,5 +1,4 @@
  import datetime
- from dataclasses import is_dataclass, asdict
  from pathlib import Path
  from typing import Optional

@@ -15,17 +14,13 @@ from sqlalchemy import (
  String,
  Table,
  TypeDecorator,
- Boolean,
  )
- from sqlalchemy.orm import registry, relationship

- from ingestify.domain import Selector, Identifier, DataSpecVersionCollection
- from ingestify.domain.models import Dataset, File, Revision
+ from ingestify.domain import Identifier, DataSpecVersionCollection, Selector
  from ingestify.domain.models.dataset.dataset import DatasetState
- from ingestify.domain.models.ingestion.ingestion_job_summary import (
- IngestionJobSummary,
- )
- from ingestify.domain.models.task.task_summary import TaskSummary, Operation, TaskStatus
+ from ingestify.domain.models.ingestion.ingestion_job_summary import IngestionJobState
+
+ from ingestify.domain.models.task.task_summary import Operation, TaskState
  from ingestify.domain.models.timing import Timing
  from ingestify.domain.models.dataset.revision import RevisionState

@@ -106,7 +101,7 @@ class RevisionStateString(TypeDecorator):

  def process_result_value(self, value, dialect):
  if not value:
- return value
+ return RevisionState.PENDING_VALIDATION

  return RevisionState[value]

@@ -124,20 +119,31 @@ class OperationString(TypeDecorator):
  return Operation[value]


- class TaskStatusString(TypeDecorator):
+ class TaskStateString(TypeDecorator):
  impl = String(255)

- def process_bind_param(self, value: TaskStatus, dialect):
+ def process_bind_param(self, value: TaskState, dialect):
  return value.value

  def process_result_value(self, value, dialect):
  if not value:
  return value

- return TaskStatus[value]
+ return TaskState[value]
+

+ class IngestionJobStateString(TypeDecorator):
+ impl = String(255)
+
+ def process_bind_param(self, value: IngestionJobState, dialect):
+ return value.value
+
+ def process_result_value(self, value, dialect):
+ if not value:
+ return value
+
+ return IngestionJobState[value]

- mapper_registry = registry()

  metadata = MetaData()
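
TaskStateString and the new IngestionJobStateString follow the same TypeDecorator pattern: persist the enum's value as a VARCHAR and rebuild the member by name on the way out, which works because the names and values are identical. A compact stand-alone version of that pattern:

import enum

from sqlalchemy import String, TypeDecorator


class TaskState(str, enum.Enum):
    RUNNING = "RUNNING"
    FINISHED = "FINISHED"
    FAILED = "FAILED"


class TaskStateString(TypeDecorator):
    impl = String(255)
    cache_ok = True  # the decorator has no per-instance state, so statement caching is safe

    def process_bind_param(self, value, dialect):
        # Python -> database: store the plain string value.
        return value.value if value is not None else None

    def process_result_value(self, value, dialect):
        # Database -> Python: look the enum member up again by name.
        return TaskState[value] if value else value
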

@@ -193,40 +199,7 @@ file_table = Table(
  ),
  )

-
- mapper_registry.map_imperatively(
- Dataset,
- dataset_table,
- properties={
- "revisions": relationship(
- Revision,
- backref="dataset",
- order_by=revision_table.c.revision_id,
- lazy="selectin",
- cascade="all, delete-orphan",
- ),
- },
- )
-
- mapper_registry.map_imperatively(
- Revision,
- revision_table,
- properties={
- "modified_files": relationship(
- File,
- order_by=file_table.c.file_id,
- primaryjoin="and_(Revision.revision_id==File.revision_id, Revision.dataset_id==File.dataset_id)",
- lazy="selectin",
- cascade="all, delete-orphan",
- )
- },
- )
-
-
- mapper_registry.map_imperatively(File, file_table)
-
-
- ingestion_job_summary = Table(
+ ingestion_job_summary_table = Table(
  "ingestion_job_summary",
  metadata,
  Column("ingestion_job_summary_id", String(255), primary_key=True),
@@ -238,18 +211,25 @@ ingestion_job_summary = Table(
  Column(
  "data_spec_versions",
  JSONType(
- serializer=lambda data_spec_versions: data_spec_versions.to_dict(),
+ serializer=lambda data_spec_versions: {
+ key: list(value) for key, value in data_spec_versions.items()
+ },
  deserializer=lambda data_spec_versions: DataSpecVersionCollection.from_dict(
  data_spec_versions
  ),
  ),
  ),
  Column(
- "selector", JSONType(serializer=lambda selector: selector.filtered_attributes)
+ "selector",
+ JSONType(
+ serializer=lambda selector: selector.filtered_attributes,
+ deserializer=lambda selector: Selector(**selector),
+ ),
  ),
  Column("started_at", TZDateTime(6)),
- Column("finished_at", TZDateTime(6)),
+ Column("ended_at", TZDateTime(6)),
  # Some task counters
+ Column("state", IngestionJobStateString),
  Column("successful_tasks", Integer),
  Column("ignored_successful_tasks", Integer),
  Column("skipped_datasets", Integer),
@@ -258,7 +238,10 @@ ingestion_job_summary = Table(
  "timings",
  JSONType(
  serializer=lambda timings: [
- timing.model_dump(mode="json") for timing in timings
+ # Timing is probably already a dictionary. Load it into Timing first, so it can be dumped
+ # in json mode
+ Timing.model_validate(timing).model_dump(mode="json")
+ for timing in timings
  ],
  deserializer=lambda timings: [
  Timing.model_validate(timing) for timing in timings
@@ -299,12 +282,13 @@ task_summary_table = Table(
  Column("persisted_file_count", Integer),
  Column("bytes_retrieved", Integer),
  Column("last_modified", TZDateTime(6)),
- Column("status", TaskStatusString),
+ Column("state", TaskStateString),
  Column(
  "timings",
  JSONType(
  serializer=lambda timings: [
- timing.model_dump(mode="json") for timing in timings
+ Timing.model_validate(timing).model_dump(mode="json")
+ for timing in timings
  ],
  deserializer=lambda timings: [
  Timing.model_validate(timing) for timing in timings
@@ -316,21 +300,54 @@ task_summary_table = Table(
  # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
  # Column("source", JSONType()),
  )
-
-
- mapper_registry.map_imperatively(
- IngestionJobSummary,
- ingestion_job_summary,
- properties={
- "task_summaries": relationship(
- TaskSummary,
- backref="ingestion_job_summary",
- # order_by=task_summary_table.c.revision_id,
- lazy="selectin",
- cascade="all, delete-orphan",
- ),
- },
- )
-
-
- mapper_registry.map_imperatively(TaskSummary, task_summary_table)
+ #
+ #
+ # mapper_registry = registry()
+ #
+ # mapper_registry.map_imperatively(
+ # Dataset,
+ # dataset_table,
+ # properties={
+ # "revisions": relationship(
+ # Revision,
+ # backref="dataset",
+ # order_by=revision_table.c.revision_id,
+ # lazy="selectin",
+ # cascade="all, delete-orphan",
+ # ),
+ # },
+ # )
+ #
+ # mapper_registry.map_imperatively(
+ # Revision,
+ # revision_table,
+ # properties={
+ # "modified_files": relationship(
+ # File,
+ # order_by=file_table.c.file_id,
+ # primaryjoin="and_(Revision.revision_id==File.revision_id, Revision.dataset_id==File.dataset_id)",
+ # lazy="selectin",
+ # cascade="all, delete-orphan",
+ # )
+ # },
+ # )
+ #
+ #
+ # mapper_registry.map_imperatively(File, file_table)
+ #
+ # mapper_registry.map_imperatively(
+ # IngestionJobSummary,
+ # ingestion_job_summary,
+ # properties={
+ # "task_summaries": relationship(
+ # TaskSummary,
+ # backref="ingestion_job_summary",
+ # # order_by=task_summary_table.c.revision_id,
+ # lazy="selectin",
+ # cascade="all, delete-orphan",
+ # ),
+ # },
+ # )
+ #
+ #
+ # mapper_registry.map_imperatively(TaskSummary, task_summary_table)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ingestify
- Version: 0.3.3
+ Version: 0.3.4
  Summary: Data Ingestion Framework
  Author: Koen Vossen
  Author-email: info@koenvossen.nl
@@ -227,23 +227,23 @@ dataset_collection = store.get_dataset_collection(
  store.map(
  lambda dataset: (
  store
-
- # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
- .load_with_kloppy(dataset)
-
- # Convert it into a polars dataframe using all columns in the original data and some more additional ones
- .to_df(
- "*",
- match_id=dataset.identifier.match_id,
- competition_id=dataset.identifier.competition_id,
- season_id=dataset.identifier.season_id,
-
+
+ # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
+ .load_with_kloppy(dataset)
+
+ # Convert it into a polars dataframe using all columns in the original data and some more additional ones
+ .to_df(
+ "*",
+ match_id=dataset.dataset_resource_id.match_id,
+ competition_id=dataset.dataset_resource_id.competition_id,
+ season_id=dataset.dataset_resource_id.season_id,
+
  engine="polars"
  )
-
- # Write to parquet format
- .write_parquet(
- f"/tmp/files/blaat/{dataset.identifier.match_id}.parquet"
+
+ # Write to parquet format
+ .write_parquet(
+ f"/tmp/files/blaat/{dataset.dataset_resource_id.match_id}.parquet"
  )
  ),
  dataset_collection,
@@ -1,18 +1,18 @@
- ingestify/__init__.py,sha256=skDa1VfOP7IslAz1tXtfTAwPzohhFlzwGkD_1wV8m50,301
+ ingestify/__init__.py,sha256=lyBZ_P8y4qlkE1e11F4T41fSTp8WbReifRxX9UGizxA,301
  ingestify/cmdline.py,sha256=bIuyPgGEw4wIglNzpG9zp7TsJozsP8NSVsCe4eAyWUg,7189
- ingestify/exceptions.py,sha256=wMMuajl4AkQRfW60TLN7btJmQaH8-lUczXyW_2g9kOU,143
+ ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
  ingestify/main.py,sha256=Xr0VbGgstPO7doDX18xqk4lBb4W2sbGWtQuXZaARsHA,8763
  ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
  ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
  ingestify/utils.py,sha256=HETGhAoUlutLG0cQR63nac2JbFei9gnktDHeBQoYWfU,5692
  ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ingestify/application/dataset_store.py,sha256=5CZ2v_fjqhNyC8JdjE9O4huTwy3WtHhawyc8Gw4DeJ4,11646
+ ingestify/application/dataset_store.py,sha256=c10EIxzOfO4ksKwPOI9jcOn33j54QWu_qXOMLwe-Y-A,11617
  ingestify/application/ingestion_engine.py,sha256=PtMjKMpvfqB802G5zfKLzyamdH7qFOXl3x6_97y8w60,2288
  ingestify/application/loader.py,sha256=v8ZcpMDEml9k_uFPFqT4WaCjXED_OIpAr7g0Pz5Hp6Y,7153
  ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
  ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
  ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
- ingestify/domain/models/base.py,sha256=6jzzIqSkH1mPsXZ2OTXMj09S_IlvMOrOBHBJyWAKEjE,555
+ ingestify/domain/models/base.py,sha256=4gKbREajxJHlS-VwKoosNtHVupZ4eDLKMqnJ4ib0aS8,184
  ingestify/domain/models/data_spec_version_collection.py,sha256=CAXlO4W2AOOWAPdPAuymqBHnJpiYtkr2z7fYFJ3HSCk,1372
  ingestify/domain/models/fetch_policy.py,sha256=d7K1TzliNJXxqaqzqEOQWLhvgIvmmqhUQEliXvSUcTs,1405
  ingestify/domain/models/sink.py,sha256=OBVfFMpB7puJmHg4q2KYx4qgoAnlmX8xKWYnPi8a9pc,178
@@ -21,15 +21,15 @@ ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0T
  ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
  ingestify/domain/models/dataset/collection.py,sha256=E2utQ6oyaFFrfQFMiwP9J_I7Wm21z0sRvE4Zc3QEs20,1310
  ingestify/domain/models/dataset/collection_metadata.py,sha256=gI5cb9M0QRsheIr2jA71wOyWfI5lGx5ES2Qw7rbDIoA,371
- ingestify/domain/models/dataset/dataset.py,sha256=ReL50BXNaJVU29OB5_9CQEI7BekWsgi1t3AR7e5jENc,2743
+ ingestify/domain/models/dataset/dataset.py,sha256=6iQgBApRK08GhxArnJjjE9SuJMMOsKx_gI6JDHy5nZc,2970
  ingestify/domain/models/dataset/dataset_repository.py,sha256=kUjiqW58kOUOli1gZCLR5xw4dBX0bqI1UJsf16hgNsQ,812
  ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4FVk_T-ZNUDezkvt7VzY,220
  ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
- ingestify/domain/models/dataset/file.py,sha256=1oj03zKdkO_9F85LuDcihbB0Kr3suf12KZNGHpVo3w0,4144
+ ingestify/domain/models/dataset/file.py,sha256=1Thdv6A1YmC1UfutaRf2q3FGHQYO0SWEptCxur6Ahfs,4144
  ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
  ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
  ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
- ingestify/domain/models/dataset/revision.py,sha256=O_1HG2S2EmYdWqI2K282S_D-d6IhRh_f4Q3wV8MEhkk,1311
+ ingestify/domain/models/dataset/revision.py,sha256=HPOZpVmQSwdcsr90RNVlOQ7c1_W7grzi5E1NOiEK92g,1331
  ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
  ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
  ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
@@ -39,15 +39,15 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
  ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
  ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
  ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ingestify/domain/models/ingestion/ingestion_job.py,sha256=U6B62c7NGeHBAjmKhgOa4uHeul34xyR66WtWaPSRNTU,12276
- ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=e8wULXsKAGNGrOV4dyiRcFlAfYCWcLa2iqJKNMwirlk,4270
+ ingestify/domain/models/ingestion/ingestion_job.py,sha256=H3vnEUS3izuNJfmD7ZGbznemX9r2JZ1po7D7D9ArzwM,13230
+ ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=cgm8kLoX3eK9SkBYe5HhwA7kg5FAyN4kfTCJrVHaRlc,4702
  ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
  ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
  ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
  ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
  ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
  ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
- ingestify/domain/models/task/task_summary.py,sha256=ovzqKPstngRVzVA_JboQMluq5uQjKVJDsWNNcfcadhU,3774
+ ingestify/domain/models/task/task_summary.py,sha256=Ncf6ij_aLkElZOsBgep-kd82FyzHjr5xjhAbAXNRJUs,3757
  ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/domain/services/identifier_key_transformer.py,sha256=y4GS9u9Ej1MO2jUhAxWbifp0mrE_MqTHvVVcoQzSKb4,4034
  ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -55,7 +55,7 @@ ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-
  ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
  ingestify/infra/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/infra/fetch/http.py,sha256=ldaXy6alBbI9z63H97lXfYZNT0ZCBkTac1W6-acNjjY,4127
- ingestify/infra/serialization/__init__.py,sha256=LwfmRoO4qykZkJZXxVPSKpwoVIkg9qzXa7Egut9JjL4,1772
+ ingestify/infra/serialization/__init__.py,sha256=-i8XLJDI2hwlX65JITcIzuOaGLJaNekgG9OfA6L7Enc,1035
  ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
  ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -64,8 +64,8 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
  ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
  ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
- ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=pyqxy7LAyRK2Mt6knaKYozXN07oNPYztU9x4DGDQD0U,9451
- ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=-eSR_F9tS9Hd3JNEpoJoDAb5RY38rFaKLMI3eBedjx8,7068
+ ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=86BqLhj5pB45iNSfYWbuMNwo-9KnGbbSYtdD8WJw_qo,16003
+ ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=b73jqpW-_QubtZpFJv7BTKdTsKbufESP0O1uJCmFfBE,10106
  ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
  ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
  ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
@@ -80,8 +80,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
  ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
  ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
  ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
- ingestify-0.3.3.dist-info/METADATA,sha256=ln_MGBlqm4wgPBldNv_VofJ4snw981jv667X4JOylmY,18832
- ingestify-0.3.3.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- ingestify-0.3.3.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
- ingestify-0.3.3.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
- ingestify-0.3.3.dist-info/RECORD,,
+ ingestify-0.3.4.dist-info/METADATA,sha256=v5rEF3343auBHwK8K5Zu0C8tTYfm0WjGtyZs0SmY3xg,18854
+ ingestify-0.3.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ ingestify-0.3.4.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ ingestify-0.3.4.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ ingestify-0.3.4.dist-info/RECORD,,