ingestify 0.3.2.tar.gz → 0.3.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. {ingestify-0.3.2 → ingestify-0.3.4}/PKG-INFO +16 -16
  2. {ingestify-0.3.2 → ingestify-0.3.4}/README.md +15 -15
  3. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/__init__.py +1 -1
  4. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/dataset_store.py +3 -4
  5. ingestify-0.3.4/ingestify/domain/models/base.py +5 -0
  6. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/dataset.py +8 -1
  7. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/file.py +5 -5
  8. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/revision.py +2 -2
  9. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/ingestion/ingestion_job.py +70 -44
  10. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/ingestion/ingestion_job_summary.py +29 -11
  11. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/task/task_summary.py +8 -8
  12. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/exceptions.py +4 -0
  13. ingestify-0.3.4/ingestify/infra/serialization/__init__.py +33 -0
  14. ingestify-0.3.4/ingestify/infra/store/dataset/sqlalchemy/repository.py +484 -0
  15. ingestify-0.3.2/ingestify/infra/store/dataset/sqlalchemy/mapping.py → ingestify-0.3.4/ingestify/infra/store/dataset/sqlalchemy/tables.py +90 -73
  16. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/file/s3_file_repository.py +5 -1
  17. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/PKG-INFO +16 -16
  18. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/SOURCES.txt +1 -1
  19. ingestify-0.3.2/ingestify/domain/models/base.py +0 -22
  20. ingestify-0.3.2/ingestify/infra/serialization/__init__.py +0 -50
  21. ingestify-0.3.2/ingestify/infra/store/dataset/sqlalchemy/repository.py +0 -239
  22. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/__init__.py +0 -0
  23. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/ingestion_engine.py +0 -0
  24. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/loader.py +0 -0
  25. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/application/secrets_manager.py +0 -0
  26. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/cmdline.py +0 -0
  27. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/__init__.py +0 -0
  28. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/__init__.py +0 -0
  29. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  30. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/__init__.py +0 -0
  31. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/collection.py +0 -0
  32. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  33. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  34. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  35. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/events.py +0 -0
  36. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/file_collection.py +0 -0
  37. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/file_repository.py +0 -0
  38. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/identifier.py +0 -0
  39. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/dataset/selector.py +0 -0
  40. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/__init__.py +0 -0
  41. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/_old_event.py +0 -0
  42. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/dispatcher.py +0 -0
  43. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/domain_event.py +0 -0
  44. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/event_bus.py +0 -0
  45. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/publisher.py +0 -0
  46. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/event/subscriber.py +0 -0
  47. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/fetch_policy.py +0 -0
  48. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/ingestion/__init__.py +0 -0
  49. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  50. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/resources/__init__.py +0 -0
  51. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  52. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/sink.py +0 -0
  53. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/source.py +0 -0
  54. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/task/__init__.py +0 -0
  55. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/task/set.py +0 -0
  56. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/task/task.py +0 -0
  57. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/models/timing.py +0 -0
  58. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/services/__init__.py +0 -0
  59. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  60. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/services/transformers/__init__.py +0 -0
  61. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  62. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/__init__.py +0 -0
  63. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/fetch/__init__.py +0 -0
  64. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/fetch/http.py +0 -0
  65. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/sink/__init__.py +0 -0
  66. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/sink/postgresql.py +0 -0
  67. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/source/__init__.py +0 -0
  68. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/source/statsbomb_github.py +0 -0
  69. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/source/wyscout.py +0 -0
  70. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/__init__.py +0 -0
  71. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/dataset/__init__.py +0 -0
  72. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  73. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/file/__init__.py +0 -0
  74. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  75. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/infra/store/file/local_file_repository.py +0 -0
  76. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/main.py +0 -0
  77. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/server.py +0 -0
  78. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/source_base.py +0 -0
  79. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/statsbomb_github/README.md +0 -0
  80. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
  81. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
  82. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/statsbomb_github/query.py +0 -0
  83. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/.env +0 -0
  84. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/.gitignore +0 -0
  85. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/README.md +0 -0
  86. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
  87. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/database/README.md +0 -0
  88. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/static/templates/wyscout/query.py +0 -0
  89. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify/utils.py +0 -0
  90. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/dependency_links.txt +0 -0
  91. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/entry_points.txt +0 -0
  92. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/requires.txt +0 -0
  93. {ingestify-0.3.2 → ingestify-0.3.4}/ingestify.egg-info/top_level.txt +0 -0
  94. {ingestify-0.3.2 → ingestify-0.3.4}/setup.cfg +0 -0
  95. {ingestify-0.3.2 → ingestify-0.3.4}/setup.py +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.3.2
+Version: 0.3.4
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -215,23 +215,23 @@ dataset_collection = store.get_dataset_collection(
 store.map(
     lambda dataset: (
         store
-
-        # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
-        .load_with_kloppy(dataset)
-
-        # Convert it into a polars dataframe using all columns in the original data and some more additional ones
-        .to_df(
-            "*",
-            match_id=dataset.identifier.match_id,
-            competition_id=dataset.identifier.competition_id,
-            season_id=dataset.identifier.season_id,
-
+
+        # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
+        .load_with_kloppy(dataset)
+
+        # Convert it into a polars dataframe using all columns in the original data and some more additional ones
+        .to_df(
+            "*",
+            match_id=dataset.dataset_resource_id.match_id,
+            competition_id=dataset.dataset_resource_id.competition_id,
+            season_id=dataset.dataset_resource_id.season_id,
+
             engine="polars"
         )
-
-        # Write to parquet format
-        .write_parquet(
-            f"/tmp/files/blaat/{dataset.identifier.match_id}.parquet"
+
+        # Write to parquet format
+        .write_parquet(
+            f"/tmp/files/blaat/{dataset.dataset_resource_id.match_id}.parquet"
         )
     ),
     dataset_collection,
@@ -205,23 +205,23 @@ dataset_collection = store.get_dataset_collection(
 store.map(
     lambda dataset: (
         store
-
-        # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
-        .load_with_kloppy(dataset)
-
-        # Convert it into a polars dataframe using all columns in the original data and some more additional ones
-        .to_df(
-            "*",
-            match_id=dataset.identifier.match_id,
-            competition_id=dataset.identifier.competition_id,
-            season_id=dataset.identifier.season_id,
-
+
+        # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
+        .load_with_kloppy(dataset)
+
+        # Convert it into a polars dataframe using all columns in the original data and some more additional ones
+        .to_df(
+            "*",
+            match_id=dataset.dataset_resource_id.match_id,
+            competition_id=dataset.dataset_resource_id.competition_id,
+            season_id=dataset.dataset_resource_id.season_id,
+
             engine="polars"
         )
-
-        # Write to parquet format
-        .write_parquet(
-            f"/tmp/files/blaat/{dataset.identifier.match_id}.parquet"
+
+        # Write to parquet format
+        .write_parquet(
+            f"/tmp/files/blaat/{dataset.dataset_resource_id.match_id}.parquet"
        )
    ),
    dataset_collection,
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
     from .infra import retrieve_http
     from .source_base import Source, DatasetResource

-    __version__ = "0.3.2"
+    __version__ = "0.3.4"
@@ -58,8 +58,7 @@ class DatasetStore:
         self.event_bus.dispatch(event)

     def save_ingestion_job_summary(self, ingestion_job_summary):
-        self.dataset_repository.session.add(ingestion_job_summary)
-        self.dataset_repository.session.commit()
+        self.dataset_repository.save_ingestion_job_summary(ingestion_job_summary)

     def get_dataset_collection(
         self,
@@ -298,8 +297,8 @@ class DatasetStore:
             )

             loaded_file = LoadedFile(
-                _stream=get_stream if lazy else get_stream(file),
-                **asdict(file),
+                stream_=get_stream if lazy else get_stream(file),
+                **file.model_dump(),
             )
             files[file.file_id] = loaded_file
         return FileCollection(files, auto_rewind=auto_rewind)
@@ -0,0 +1,5 @@
+from pydantic import BaseModel as PydanticBaseModel, ConfigDict
+
+
+class BaseModel(PydanticBaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True, from_attributes=True)
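The new shared BaseModel switches the domain models to Pydantic v2 configuration: from_attributes=True lets models be validated directly from attribute-style objects (such as rows returned by the new SQLAlchemy repository), and arbitrary_types_allowed permits field types Pydantic has no schema for (for example raw binary streams). A minimal, self-contained sketch of what from_attributes enables; the Row and DatasetLike classes below are illustrative only, not ingestify code:

    from pydantic import BaseModel as PydanticBaseModel, ConfigDict

    class BaseModel(PydanticBaseModel):
        model_config = ConfigDict(arbitrary_types_allowed=True, from_attributes=True)

    class Row:                                   # stand-in for an ORM result object
        name = "statsbomb-match-1234"

    class DatasetLike(BaseModel):
        name: str

    print(DatasetLike.model_validate(Row()))     # name='statsbomb-match-1234'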
@@ -1,7 +1,7 @@
 from datetime import datetime
 from enum import Enum
 from typing import List, Optional
-from pydantic import Field
+from pydantic import Field, field_validator

 from ingestify.utils import utcnow
 from .dataset_state import DatasetState
@@ -24,6 +24,13 @@ class Dataset(BaseModel):
     updated_at: datetime
     revisions: List[Revision] = Field(default_factory=list)

+    @field_validator("identifier", mode="before")
+    @classmethod
+    def parse_identifier(cls, value):
+        if not isinstance(value, Identifier):
+            return Identifier(value)
+        return value
+
     @property
     def is_complete(self):
         return self.state.is_complete
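A mode="before" validator receives the raw value before field parsing, so identifiers that come back from storage as plain mappings are wrapped into an Identifier transparently. A generic sketch of the same pattern, using placeholder types rather than ingestify's own classes:

    from pydantic import BaseModel, ConfigDict, field_validator

    class Identifier:                            # placeholder for ingestify's identifier type
        def __init__(self, value: dict):
            self.value = dict(value)
        def __repr__(self):
            return f"Identifier({self.value!r})"

    class DatasetLike(BaseModel):
        model_config = ConfigDict(arbitrary_types_allowed=True)
        identifier: Identifier

        @field_validator("identifier", mode="before")
        @classmethod
        def parse_identifier(cls, value):
            # accept either an Identifier instance or a raw mapping
            if not isinstance(value, Identifier):
                return Identifier(value)
            return value

    print(DatasetLike(identifier={"match_id": 1234}).identifier)   # Identifier({'match_id': 1234})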
@@ -116,18 +116,18 @@ class LoadedFile(BaseModel):
     data_serialization_format: Optional[str]  # Example: 'json'
     storage_compression_method: Optional[str]  # Example: 'gzip'
     storage_path: Path
-    _stream: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
+    stream_: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
     revision_id: Optional[int] = None  # This can be used when a Revision is squashed

     def load_stream(self):
-        if callable(self._stream):
-            self._stream = self._stream(self)
+        if callable(self.stream_):
+            self.stream_ = self.stream_(self)

     @property
     def stream(self):
-        if callable(self._stream):
+        if callable(self.stream_):
             raise Exception("You should load the stream first using `load_stream`")
-        return self._stream
+        return self.stream_


__all__ = ["File", "DraftFile", "LoadedFile"]
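Pydantic treats leading-underscore names as private attributes rather than model fields, which is presumably why the lazy stream field was renamed from _stream to stream_ now that LoadedFile is a Pydantic model; the dataset_store.py change above constructs it with stream_=... and **file.model_dump() accordingly. A small usage sketch based only on the methods shown in this hunk (given some LoadedFile instance bound to loaded_file):

    loaded_file.load_stream()            # resolves the lazy callable into a real stream, if needed
    data = loaded_file.stream.read()     # .stream raises until load_stream() has run on a lazy file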
@@ -1,6 +1,6 @@
 from datetime import datetime
 from enum import Enum
-from typing import Dict, List
+from typing import Dict, List, Optional

 from typing_extensions import TypedDict

@@ -32,7 +32,7 @@ class Revision(BaseModel):
     created_at: datetime
     description: str
     modified_files: List[File]
-    source: RevisionSource
+    source: Optional[RevisionSource]
     is_squashed: bool = False
     state: RevisionState = RevisionState.PENDING_VALIDATION

@@ -2,6 +2,7 @@ import itertools
 import json
 import logging
 import uuid
+from enum import Enum
 from typing import Optional, Iterator

 from ingestify import retrieve_http
@@ -17,6 +18,7 @@ from ingestify.domain.models.resources.dataset_resource import (
     DatasetResource,
 )
 from ingestify.domain.models.task.task_summary import TaskSummary
+from ingestify.exceptions import SaveError
 from ingestify.utils import TaskExecutor, chunker

 logger = logging.getLogger(__name__)
@@ -120,21 +122,27 @@ class UpdateDatasetTask(Task):
         with TaskSummary.update(
             self.task_id, dataset_identifier=dataset_identifier
         ) as task_summary:
-            revision = self.store.update_dataset(
-                dataset=self.dataset,
-                name=self.dataset_resource.name,
-                state=self.dataset_resource.state,
-                metadata=self.dataset_resource.metadata,
-                files={
-                    file_id: task_summary.record_load_file(
-                        lambda: load_file(file_resource, dataset=self.dataset),
-                        metadata={"file_id": file_id},
-                    )
-                    for file_id, file_resource in self.dataset_resource.files.items()
-                },
-                revision_source=revision_source,
-            )
-            task_summary.set_stats_from_revision(revision)
+
+            files = {
+                file_id: task_summary.record_load_file(
+                    lambda: load_file(file_resource, dataset=self.dataset),
+                    metadata={"file_id": file_id},
+                )
+                for file_id, file_resource in self.dataset_resource.files.items()
+            }
+
+            try:
+                revision = self.store.update_dataset(
+                    dataset=self.dataset,
+                    name=self.dataset_resource.name,
+                    state=self.dataset_resource.state,
+                    metadata=self.dataset_resource.metadata,
+                    files=files,
+                    revision_source=revision_source,
+                )
+                task_summary.set_stats_from_revision(revision)
+            except Exception as e:
+                raise SaveError("Could not update dataset") from e

         return task_summary

@@ -159,24 +167,28 @@ class CreateDatasetTask(Task):
         )

         with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
-            revision = self.store.create_dataset(
-                dataset_type=self.dataset_resource.dataset_type,
-                provider=self.dataset_resource.provider,
-                dataset_identifier=dataset_identifier,
-                name=self.dataset_resource.name,
-                state=self.dataset_resource.state,
-                metadata=self.dataset_resource.metadata,
-                files={
-                    file_id: task_summary.record_load_file(
-                        lambda: load_file(file_resource, dataset=None),
-                        metadata={"file_id": file_id},
-                    )
-                    for file_id, file_resource in self.dataset_resource.files.items()
-                },
-                revision_source=revision_source,
-            )
+            files = {
+                file_id: task_summary.record_load_file(
+                    lambda: load_file(file_resource, dataset=None),
+                    metadata={"file_id": file_id},
+                )
+                for file_id, file_resource in self.dataset_resource.files.items()
+            }
+            try:
+                revision = self.store.create_dataset(
+                    dataset_type=self.dataset_resource.dataset_type,
+                    provider=self.dataset_resource.provider,
+                    dataset_identifier=dataset_identifier,
+                    name=self.dataset_resource.name,
+                    state=self.dataset_resource.state,
+                    metadata=self.dataset_resource.metadata,
+                    files=files,
+                    revision_source=revision_source,
+                )

-            task_summary.set_stats_from_revision(revision)
+                task_summary.set_stats_from_revision(revision)
+            except Exception as e:
+                raise SaveError("Could not create dataset") from e

         return task_summary

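In both tasks the files are now loaded before the store call, and only the store write is wrapped in try/except, re-raised as the new SaveError with the original exception chained. A hedged sketch of how a caller could react; the run() call is a hypothetical entry point named here only for illustration:

    from ingestify.exceptions import SaveError

    try:
        task_summary = task.run()                 # hypothetical call site for a Create/UpdateDatasetTask
    except SaveError as e:
        # persistence failures are wrapped; the original error stays reachable via the cause chain
        print("saving the dataset failed:", e.__cause__)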
@@ -202,6 +214,9 @@ class IngestionJob:
         self, store: DatasetStore, task_executor: TaskExecutor
     ) -> Iterator[IngestionJobSummary]:
         is_first_chunk = True
+        ingestion_job_exception = (
+            None  # Indicate if there was an exception during the IngestionJob itself
+        )
         ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
         # Process all items in batches. Yield a IngestionJobSummary per batch

@@ -219,26 +234,37 @@
             # 1. The discover_datasets returns a list, and the entire list can be processed at once
             # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
             with ingestion_job_summary.record_timing("find_datasets"):
-                # Timing might be incorrect as it is an iterator
-                dataset_resources = self.ingestion_plan.source.find_datasets(
-                    dataset_type=self.ingestion_plan.dataset_type,
-                    data_spec_versions=self.selector.data_spec_versions,
-                    dataset_collection_metadata=dataset_collection_metadata,
-                    **self.selector.custom_attributes,
-                )
+                try:
+                    dataset_resources = self.ingestion_plan.source.find_datasets(
+                        dataset_type=self.ingestion_plan.dataset_type,
+                        data_spec_versions=self.selector.data_spec_versions,
+                        dataset_collection_metadata=dataset_collection_metadata,
+                        **self.selector.custom_attributes,
+                    )

-            finish_task_timer = ingestion_job_summary.start_timing("tasks")
+                    # We need to include the to_batches as that will start the generator
+                    batches = to_batches(dataset_resources)
+                except Exception as e:
+                    logger.exception("Failed to find datasets")

-            batches = to_batches(dataset_resources)
+                    ingestion_job_summary.set_exception(e)
+                    yield ingestion_job_summary
+                    return
+
+            finish_task_timer = ingestion_job_summary.start_timing("tasks")

             while True:
                 try:
                     batch = next(batches)
                 except StopIteration:
                     break
-                except Exception:
-                    # TODO: handle exception on IngestionJob level
-                    raise
+                except Exception as e:
+                    logger.exception("Failed to fetch next batch")
+
+                    finish_task_timer()
+                    ingestion_job_summary.set_exception(e)
+                    yield ingestion_job_summary
+                    return

             dataset_identifiers = [
                 Identifier.create_from_selector(
@@ -1,12 +1,13 @@
 import uuid
 from contextlib import contextmanager
 from datetime import datetime, timedelta
+from enum import Enum
 from typing import Optional, List, TYPE_CHECKING
 from pydantic import Field

 from ingestify.domain import Selector, DataSpecVersionCollection
 from ingestify.domain.models.base import BaseModel
-from ingestify.domain.models.task.task_summary import TaskSummary, TaskStatus
+from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
 from ingestify.domain.models.timing import Timing
 from ingestify.utils import utcnow

@@ -14,6 +15,12 @@ if TYPE_CHECKING:
     from ingestify.domain.models.ingestion.ingestion_job import IngestionJob


+class IngestionJobState(str, Enum):
+    RUNNING = "RUNNING"
+    FINISHED = "FINISHED"
+    FAILED = "FAILED"
+
+
 def format_duration(duration: timedelta):
     return f"{duration.total_seconds():.2f}sec"

@@ -30,7 +37,8 @@ class IngestionJobSummary(BaseModel):
     selector: Selector

     started_at: datetime = Field(default_factory=utcnow)
-    finished_at: Optional[datetime] = None
+    ended_at: Optional[datetime] = None
+    state: IngestionJobState = IngestionJobState.RUNNING
     timings: List[Timing] = Field(default_factory=list)
     task_summaries: List[TaskSummary] = Field(default_factory=list)

@@ -55,8 +63,10 @@ class IngestionJobSummary(BaseModel):
     @contextmanager
     def record_timing(self, name: str):
         start = utcnow()
-        yield
-        self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+        try:
+            yield
+        finally:
+            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

     def start_timing(self, name):
         start = utcnow()
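Wrapping the yield in try/finally means a timing entry is recorded even when the timed block raises, which the new error handling in IngestionJob relies on (find_datasets failures now propagate out of the record_timing block). A self-contained illustration of the difference, using generic code rather than the package's own classes:

    from contextlib import contextmanager
    from datetime import datetime, timezone

    timings = []

    @contextmanager
    def record_timing(name):
        start = datetime.now(timezone.utc)
        try:
            yield
        finally:
            # runs even if the body raises; the old bare-yield version would skip this line
            timings.append((name, start, datetime.now(timezone.utc)))

    try:
        with record_timing("find_datasets"):
            raise RuntimeError("source unavailable")
    except RuntimeError:
        pass

    assert timings and timings[0][0] == "find_datasets"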
@@ -75,28 +85,36 @@ class IngestionJobSummary(BaseModel):
     def task_count(self):
         return len(self.task_summaries)

-    def set_finished(self):
+    def _set_ended(self):
         self.failed_tasks = len(
-            [task for task in self.task_summaries if task.status == TaskStatus.FAILED]
+            [task for task in self.task_summaries if task.state == TaskState.FAILED]
         )
         self.successful_tasks = len(
-            [task for task in self.task_summaries if task.status == TaskStatus.FINISHED]
+            [task for task in self.task_summaries if task.state == TaskState.FINISHED]
         )
         self.ignored_successful_tasks = len(
             [
                 task
                 for task in self.task_summaries
-                if task.status == TaskStatus.FINISHED_IGNORED
+                if task.state == TaskState.FINISHED_IGNORED
             ]
         )
-        self.finished_at = utcnow()
+        self.ended_at = utcnow()
+
+    def set_finished(self):
+        self.state = IngestionJobState.FINISHED
+        self._set_ended()
+
+    def set_exception(self, e: Exception):
+        self.state = IngestionJobState.FAILED
+        self._set_ended()

     @property
     def duration(self) -> timedelta:
-        return self.finished_at - self.started_at
+        return self.ended_at - self.started_at

     def output_report(self):
-        print(f"\nIngestionJobSummary finished in {format_duration(self.duration)}")
+        print(f"\nIngestionJobSummary {self.state} in {format_duration(self.duration)}")
         print("--------------------")
         print(f" - IngestionPlan:")
         print(f" Source: {self.source_name}")
@@ -16,7 +16,7 @@ from ingestify.utils import utcnow
 logger = logging.getLogger(__name__)


-class TaskStatus(str, Enum):
+class TaskState(str, Enum):
     RUNNING = "RUNNING"
     FINISHED = "FINISHED"
     FINISHED_IGNORED = "FINISHED_IGNORED"  # Finished, but didn't produce any new data
@@ -37,7 +37,7 @@ class TaskSummary(BaseModel):
     persisted_file_count: int = 0
     bytes_retrieved: int = 0
     last_modified: Optional[datetime] = None
-    status: TaskStatus = TaskStatus.RUNNING
+    state: TaskState = TaskState.RUNNING
     timings: List[Timing] = Field(default_factory=list)

     @field_validator("dataset_identifier", mode="before")
@@ -83,10 +83,10 @@ class TaskSummary(BaseModel):
         try:
             yield task_summary

-            task_summary.set_status(TaskStatus.FINISHED)
+            task_summary.set_state(TaskState.FINISHED)
         except Exception as e:
             logger.exception(f"Failed to execute task.")
-            task_summary.set_status(TaskStatus.FAILED)
+            task_summary.set_state(TaskState.FAILED)

             # When the error comes from our own code, make sure it will be raised to the highest level
             # raise
@@ -111,8 +111,8 @@ class TaskSummary(BaseModel):
                 file.modified_at for file in revision.modified_files
             )
         else:
-            self.status = TaskStatus.FINISHED_IGNORED
+            self.state = TaskState.FINISHED_IGNORED

-    def set_status(self, status: TaskStatus):
-        if self.status == TaskStatus.RUNNING:
-            self.status = status
+    def set_state(self, state: TaskState):
+        if self.state == TaskState.RUNNING:
+            self.state = state
@@ -8,3 +8,7 @@ class ConfigurationError(IngestifyError):

 class DuplicateFile(IngestifyError):
     pass
+
+
+class SaveError(IngestifyError):
+    pass
@@ -0,0 +1,33 @@
+import json
+from datetime import datetime
+from typing import Type, Any, TypeVar
+
+from dataclass_factory import Schema, Factory, NameStyle
+from dataclass_factory.schema_helpers import type_checker
+
+from ingestify.domain import DatasetCreated, Identifier
+from ingestify.domain.models.dataset.events import MetadataUpdated, RevisionAdded
+from ingestify.domain.models.event import DomainEvent
+
+
+event_types = {
+    DatasetCreated.event_type: DatasetCreated,
+    RevisionAdded.event_type: RevisionAdded,
+}
+
+
+def deserialize(event_dict: dict) -> DomainEvent:
+    event_cls = event_types[event_dict["event_type"]]
+    event_dict["dataset"]["identifier"] = Identifier(
+        **event_dict["dataset"]["identifier"]
+    )
+
+    return event_cls.model_validate(event_dict)
+
+
+def serialize(event: DomainEvent) -> dict:
+    event_dict = event.model_dump(mode="json")
+
+    # Make sure event_type is always part of the event_dict. Pydantic might skip it when the type is ClassVar
+    event_dict["event_type"] = event.event_type
+    return event_dict
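The rewritten serialization module round-trips domain events through Pydantic, keyed on event_type: serialize() produces a JSON-compatible dict (forcing event_type back in, since ClassVar fields are skipped by model_dump), and deserialize() looks up the concrete event class and rebuilds the Identifier. A hedged round-trip sketch, assuming an existing DatasetCreated instance bound to event (any event handled here is expected to carry a dataset with an identifier, as the deserialize code above indexes those keys):

    from ingestify.infra.serialization import serialize, deserialize

    payload = serialize(event)                    # plain dict, safe to pass to json.dumps()
    assert payload["event_type"] == event.event_type
    restored = deserialize(payload)               # dispatched via the event_types mapping
    assert type(restored) is type(event)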