ingestify 0.3.3__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. {ingestify-0.3.3 → ingestify-0.4.0}/PKG-INFO +16 -16
  2. {ingestify-0.3.3 → ingestify-0.4.0}/README.md +15 -15
  3. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/__init__.py +1 -1
  4. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/dataset_store.py +4 -4
  5. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/ingestion_engine.py +7 -2
  6. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/loader.py +14 -1
  7. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/cmdline.py +20 -2
  8. ingestify-0.4.0/ingestify/domain/models/base.py +5 -0
  9. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/collection_metadata.py +2 -1
  10. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset.py +18 -1
  11. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/file.py +5 -5
  12. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/revision.py +6 -2
  13. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_job.py +69 -45
  14. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +48 -40
  15. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/task/task_summary.py +11 -32
  16. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/exceptions.py +4 -0
  17. ingestify-0.4.0/ingestify/infra/serialization/__init__.py +22 -0
  18. ingestify-0.4.0/ingestify/infra/store/dataset/sqlalchemy/repository.py +483 -0
  19. ingestify-0.3.3/ingestify/infra/store/dataset/sqlalchemy/mapping.py → ingestify-0.4.0/ingestify/infra/store/dataset/sqlalchemy/tables.py +103 -79
  20. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/utils.py +48 -16
  21. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/PKG-INFO +16 -16
  22. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/SOURCES.txt +1 -1
  23. ingestify-0.3.3/ingestify/domain/models/base.py +0 -22
  24. ingestify-0.3.3/ingestify/infra/serialization/__init__.py +0 -50
  25. ingestify-0.3.3/ingestify/infra/store/dataset/sqlalchemy/repository.py +0 -239
  26. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/__init__.py +0 -0
  27. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/application/secrets_manager.py +0 -0
  28. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/__init__.py +0 -0
  29. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/__init__.py +0 -0
  30. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  31. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/__init__.py +0 -0
  32. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/collection.py +0 -0
  33. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  34. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  35. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/events.py +0 -0
  36. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
  37. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
  38. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  39. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/dataset/selector.py +0 -0
  40. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/__init__.py +0 -0
  41. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/_old_event.py +0 -0
  42. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  43. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/domain_event.py +0 -0
  44. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/event_bus.py +0 -0
  45. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/publisher.py +0 -0
  46. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/event/subscriber.py +0 -0
  47. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/fetch_policy.py +0 -0
  48. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
  49. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  50. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/resources/__init__.py +0 -0
  51. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  52. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/sink.py +0 -0
  53. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/source.py +0 -0
  54. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/task/__init__.py +0 -0
  55. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/task/set.py +0 -0
  56. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/task/task.py +0 -0
  57. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/models/timing.py +0 -0
  58. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/services/__init__.py +0 -0
  59. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  60. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/services/transformers/__init__.py +0 -0
  61. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  62. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/__init__.py +0 -0
  63. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/fetch/__init__.py +0 -0
  64. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/fetch/http.py +0 -0
  65. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/sink/__init__.py +0 -0
  66. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/sink/postgresql.py +0 -0
  67. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/source/__init__.py +0 -0
  68. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/source/statsbomb_github.py +0 -0
  69. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/source/wyscout.py +0 -0
  70. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/__init__.py +0 -0
  71. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/dataset/__init__.py +0 -0
  72. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  73. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/file/__init__.py +0 -0
  74. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  75. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  76. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  77. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/main.py +0 -0
  78. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/server.py +0 -0
  79. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/source_base.py +0 -0
  80. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/README.md +0 -0
  81. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
  82. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
  83. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/statsbomb_github/query.py +0 -0
  84. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/.env +0 -0
  85. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/.gitignore +0 -0
  86. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/README.md +0 -0
  87. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
  88. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/database/README.md +0 -0
  89. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify/static/templates/wyscout/query.py +0 -0
  90. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/dependency_links.txt +0 -0
  91. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/entry_points.txt +0 -0
  92. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/requires.txt +0 -0
  93. {ingestify-0.3.3 → ingestify-0.4.0}/ingestify.egg-info/top_level.txt +0 -0
  94. {ingestify-0.3.3 → ingestify-0.4.0}/setup.cfg +0 -0
  95. {ingestify-0.3.3 → ingestify-0.4.0}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ingestify
- Version: 0.3.3
+ Version: 0.4.0
  Summary: Data Ingestion Framework
  Author: Koen Vossen
  Author-email: info@koenvossen.nl
@@ -215,23 +215,23 @@ dataset_collection = store.get_dataset_collection(
  store.map(
  lambda dataset: (
  store
-
- # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
- .load_with_kloppy(dataset)
-
- # Convert it into a polars dataframe using all columns in the original data and some more additional ones
- .to_df(
- "*",
- match_id=dataset.identifier.match_id,
- competition_id=dataset.identifier.competition_id,
- season_id=dataset.identifier.season_id,
-
+
+ # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
+ .load_with_kloppy(dataset)
+
+ # Convert it into a polars dataframe using all columns in the original data and some more additional ones
+ .to_df(
+ "*",
+ match_id=dataset.dataset_resource_id.match_id,
+ competition_id=dataset.dataset_resource_id.competition_id,
+ season_id=dataset.dataset_resource_id.season_id,
+
  engine="polars"
  )
-
- # Write to parquet format
- .write_parquet(
- f"/tmp/files/blaat/{dataset.identifier.match_id}.parquet"
+
+ # Write to parquet format
+ .write_parquet(
+ f"/tmp/files/blaat/{dataset.dataset_resource_id.match_id}.parquet"
  )
  ),
  dataset_collection,
@@ -205,23 +205,23 @@ dataset_collection = store.get_dataset_collection(
  store.map(
  lambda dataset: (
  store
-
- # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
- .load_with_kloppy(dataset)
-
- # Convert it into a polars dataframe using all columns in the original data and some more additional ones
- .to_df(
- "*",
- match_id=dataset.identifier.match_id,
- competition_id=dataset.identifier.competition_id,
- season_id=dataset.identifier.season_id,
-
+
+ # As it's related to https://github.com/PySport/kloppy the store can load files using kloppy
+ .load_with_kloppy(dataset)
+
+ # Convert it into a polars dataframe using all columns in the original data and some more additional ones
+ .to_df(
+ "*",
+ match_id=dataset.dataset_resource_id.match_id,
+ competition_id=dataset.dataset_resource_id.competition_id,
+ season_id=dataset.dataset_resource_id.season_id,
+
  engine="polars"
  )
-
- # Write to parquet format
- .write_parquet(
- f"/tmp/files/blaat/{dataset.identifier.match_id}.parquet"
+
+ # Write to parquet format
+ .write_parquet(
+ f"/tmp/files/blaat/{dataset.dataset_resource_id.match_id}.parquet"
  )
  ),
  dataset_collection,
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
  from .infra import retrieve_http
  from .source_base import Source, DatasetResource

- __version__ = "0.3.3"
+ __version__ = "0.4.0"
@@ -58,8 +58,7 @@ class DatasetStore:
  self.event_bus.dispatch(event)

  def save_ingestion_job_summary(self, ingestion_job_summary):
- self.dataset_repository.session.add(ingestion_job_summary)
- self.dataset_repository.session.commit()
+ self.dataset_repository.save_ingestion_job_summary(ingestion_job_summary)

  def get_dataset_collection(
  self,
@@ -271,6 +270,7 @@ class DatasetStore:
  metadata=metadata,
  created_at=now,
  updated_at=now,
+ last_modified_at=None, # Not known at this moment
  )
  revision = self.add_revision(dataset, files, revision_source, description)

@@ -298,8 +298,8 @@
  )

  loaded_file = LoadedFile(
- _stream=get_stream if lazy else get_stream(file),
- **asdict(file),
+ stream_=get_stream if lazy else get_stream(file),
+ **file.model_dump(),
  )
  files[file.file_id] = loaded_file
  return FileCollection(files, auto_rewind=auto_rewind)
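Note: the `_stream` → `stream_` rename and the switch from `asdict(file)` to `file.model_dump()` line up with the file models now being Pydantic models: Pydantic treats a leading-underscore name as a private attribute rather than a field (so it cannot be set through the constructor), and `model_dump()` is the Pydantic counterpart of `dataclasses.asdict()`. A minimal sketch of that behaviour; the class below is an illustrative stand-in, not ingestify's real LoadedFile.

    from io import BytesIO
    from typing import Callable, Union

    from pydantic import BaseModel, ConfigDict

    class LoadedFileSketch(BaseModel):
        # arbitrary_types_allowed mirrors the project-wide BaseModel config added in 0.4.0
        model_config = ConfigDict(arbitrary_types_allowed=True)

        file_id: str
        # A field named "_stream" would be a private attribute and could not be
        # passed to the constructor, hence the trailing-underscore spelling.
        stream_: Union[BytesIO, Callable[[], BytesIO]]

    loaded = LoadedFileSketch(file_id="f1", stream_=BytesIO(b"{}"))
    print(loaded.model_dump(exclude={"stream_"}))  # model_dump() replaces dataclasses.asdict()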
@@ -21,8 +21,13 @@ class IngestionEngine:
  def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
  self.loader.add_ingestion_plan(ingestion_plan)

- def load(self, dry_run: bool = False, provider: Optional[str] = None):
- self.loader.collect_and_run(dry_run=dry_run, provider=provider)
+ def load(
+ self,
+ dry_run: bool = False,
+ provider: Optional[str] = None,
+ source: Optional[str] = None,
+ ):
+ self.loader.collect_and_run(dry_run=dry_run, provider=provider, source=source)

  def list_datasets(self, as_count: bool = False):
  """Consider moving this to DataStore"""
@@ -29,7 +29,12 @@ class Loader:
  def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
  self.ingestion_plans.append(ingestion_plan)

- def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
+ def collect_and_run(
+ self,
+ dry_run: bool = False,
+ provider: Optional[str] = None,
+ source: Optional[str] = None,
+ ):
  # First collect all selectors, before discovering datasets
  selectors = {}
  for ingestion_plan in self.ingestion_plans:
@@ -42,6 +47,13 @@
  )
  continue

+ if source is not None:
+ if ingestion_plan.source.name != source:
+ logger.info(
+ f"Skipping {ingestion_plan} because source doesn't match '{source}'"
+ )
+ continue
+
  static_selectors = [
  selector
  for selector in ingestion_plan.selectors
@@ -60,6 +72,7 @@

  # TODO: consider making this lazy and fetch once per Source instead of
  # once per IngestionPlan
+ # TODO: Log exception when `discover_selectors` fails
  all_selectors = ingestion_plan.source.discover_selectors(
  ingestion_plan.dataset_type
  )
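Note: together with the engine and CLI changes, a run can now be narrowed by two independent filters: `provider` (already present in 0.3.3) and the new `source`, which matches on the configured source name. A self-contained sketch of those selection semantics, using illustrative stub classes rather than ingestify's real IngestionPlan/Source:

    from dataclasses import dataclass
    from typing import List, Optional

    @dataclass
    class SourceStub:
        name: str
        provider: str

    @dataclass
    class PlanStub:
        source: SourceStub

    def select_plans(
        plans: List[PlanStub],
        provider: Optional[str] = None,
        source: Optional[str] = None,
    ) -> List[PlanStub]:
        selected = []
        for plan in plans:
            if provider is not None and plan.source.provider != provider:
                continue  # skipped: provider doesn't match
            if source is not None and plan.source.name != source:
                continue  # skipped: source name doesn't match
            selected.append(plan)
        return selected

    plans = [
        PlanStub(SourceStub(name="statsbomb_github", provider="statsbomb")),
        PlanStub(SourceStub(name="wyscout", provider="wyscout")),
    ]
    print([p.source.name for p in select_plans(plans, source="wyscout")])  # ['wyscout']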
@@ -58,7 +58,14 @@ def cli():
  help="bucket",
  type=str,
  )
- @click.option("--debug", "debug", required=False, help="Debugging enabled", type=bool)
+ @click.option(
+ "--debug",
+ "debug",
+ required=False,
+ help="Debugging enabled",
+ is_flag=True,
+ type=bool,
+ )
  @click.option(
  "--dry-run",
  "dry_run",
@@ -74,11 +81,19 @@ def cli():
  help="Provider - only run tasks for a single provider",
  type=str,
  )
+ @click.option(
+ "--source",
+ "source",
+ required=False,
+ help="Source - only run tasks for a single source",
+ type=str,
+ )
  def run(
  config_file: str,
  bucket: Optional[str],
  dry_run: Optional[bool],
  provider: Optional[str],
+ source: Optional[str],
  debug: Optional[bool],
  ):
  try:
@@ -90,7 +105,10 @@ def run(
  logger.exception(f"Failed due a configuration error: {e}")
  sys.exit(1)

- engine.load(dry_run=dry_run, provider=provider)
+ if debug:
+ logging.getLogger("root").setLevel(logging.DEBUG)
+
+ engine.load(dry_run=dry_run, provider=provider, source=source)

  logger.info("Done")

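Note: two CLI changes stand out here: `--debug` gains `is_flag=True` so the bare switch works without an explicit value, and a new `--source` option is threaded through to `Loader.collect_and_run`. A small, standalone click example (not ingestify's actual command) showing the flag behaviour:

    import click

    @click.command()
    # With a plain `type=bool` option the user has to pass a value (e.g. `--debug true`);
    # declaring it with `is_flag=True` makes the bare `--debug` switch sufficient.
    @click.option("--debug", "debug", is_flag=True, help="Debugging enabled")
    def demo(debug: bool):
        click.echo(f"debug={debug}")

    if __name__ == "__main__":
        demo()  # `python demo.py --debug` prints "debug=True"; omitting it prints "debug=False"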
@@ -0,0 +1,5 @@
+ from pydantic import BaseModel as PydanticBaseModel, ConfigDict
+
+
+ class BaseModel(PydanticBaseModel):
+ model_config = ConfigDict(arbitrary_types_allowed=True, from_attributes=True)
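Note: the new project-wide BaseModel turns on `from_attributes=True`, which lets Pydantic models be built directly from attribute-bearing objects such as SQLAlchemy rows (consistent with the rewritten SQLAlchemy repository in this release). A minimal illustration with made-up classes, not ingestify's real models:

    from pydantic import BaseModel, ConfigDict

    class DatasetRow:
        """Stand-in for an ORM-mapped row object."""
        def __init__(self) -> None:
            self.dataset_id = "abc123"
            self.provider = "statsbomb"

    class DatasetModel(BaseModel):
        model_config = ConfigDict(from_attributes=True)

        dataset_id: str
        provider: str

    # from_attributes lets model_validate read plain attributes instead of requiring a dict
    print(DatasetModel.model_validate(DatasetRow()))  # dataset_id='abc123' provider='statsbomb'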
@@ -6,7 +6,8 @@ from typing import Optional
  @dataclass
  class DatasetCollectionMetadata:
  # This can be useful to figure out if a backfill is required
- first_modified: Optional[datetime]
+ # TODO - Note: not stored at Dataset level and requires joined query to retrieve
+ # first_modified: Optional[datetime]

  # Use the last modified to only retrieve datasets that are changed
  last_modified: Optional[datetime]
@@ -1,7 +1,7 @@
  from datetime import datetime
  from enum import Enum
  from typing import List, Optional
- from pydantic import Field
+ from pydantic import Field, field_validator

  from ingestify.utils import utcnow
  from .dataset_state import DatasetState
@@ -22,7 +22,17 @@ class Dataset(BaseModel):
  metadata: dict
  created_at: datetime
  updated_at: datetime
+
  revisions: List[Revision] = Field(default_factory=list)
+ # The last_modified_at is equal to the max modified_at of all files in all revisions
+ last_modified_at: Optional[datetime]
+
+ @field_validator("identifier", mode="before")
+ @classmethod
+ def parse_identifier(cls, value):
+ if not isinstance(value, Identifier):
+ return Identifier(value)
+ return value

  @property
  def is_complete(self):
@@ -35,6 +45,13 @@ class Dataset(BaseModel):
  self.revisions.append(revision)
  self.updated_at = utcnow()

+ if self.last_modified_at:
+ self.last_modified_at = max(
+ self.last_modified_at, revision.last_modified_at
+ )
+ else:
+ self.last_modified_at = revision.last_modified_at
+
  def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
  changed = False
  if self.name != name:
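Note: the `mode="before"` validator lets a raw value coming from storage be coerced into an Identifier before normal field validation runs. A compact, self-contained illustration of the pattern; the Identifier below is a simplified stand-in for ingestify's real class:

    from pydantic import BaseModel, ConfigDict, field_validator

    class Identifier:
        """Simplified stand-in: wraps the key/value attributes identifying a dataset."""
        def __init__(self, attributes: dict):
            self.attributes = dict(attributes)

    class DatasetSketch(BaseModel):
        model_config = ConfigDict(arbitrary_types_allowed=True)

        identifier: Identifier

        @field_validator("identifier", mode="before")
        @classmethod
        def parse_identifier(cls, value):
            # Runs before type validation, so a plain mapping loaded from the
            # database is upgraded to an Identifier instance transparently.
            if not isinstance(value, Identifier):
                return Identifier(value)
            return value

    d = DatasetSketch(identifier={"match_id": 1234, "season_id": 42})
    print(type(d.identifier).__name__)  # Identifier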
@@ -116,18 +116,18 @@ class LoadedFile(BaseModel):
  data_serialization_format: Optional[str] # Example: 'json'
  storage_compression_method: Optional[str] # Example: 'gzip'
  storage_path: Path
- _stream: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
+ stream_: Union[BinaryIO, BytesIO, Callable[[], Awaitable[Union[BinaryIO, BytesIO]]]]
  revision_id: Optional[int] = None # This can be used when a Revision is squashed

  def load_stream(self):
- if callable(self._stream):
- self._stream = self._stream(self)
+ if callable(self.stream_):
+ self.stream_ = self.stream_(self)

  @property
  def stream(self):
- if callable(self._stream):
+ if callable(self.stream_):
  raise Exception("You should load the stream first using `load_stream`")
- return self._stream
+ return self.stream_


  __all__ = ["File", "DraftFile", "LoadedFile"]
@@ -1,6 +1,6 @@
  from datetime import datetime
  from enum import Enum
- from typing import Dict, List
+ from typing import Dict, List, Optional

  from typing_extensions import TypedDict

@@ -32,10 +32,14 @@ class Revision(BaseModel):
  created_at: datetime
  description: str
  modified_files: List[File]
- source: RevisionSource
+ source: Optional[RevisionSource]
  is_squashed: bool = False
  state: RevisionState = RevisionState.PENDING_VALIDATION

+ @property
+ def last_modified_at(self):
+ return max(file.modified_at for file in self.modified_files)
+
  @property
  def modified_files_map(self) -> Dict[str, File]:
  return {file.file_id: file for file in self.modified_files}
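Note: `Revision.last_modified_at` and `Dataset.add_revision` together keep `Dataset.last_modified_at` equal to the newest `modified_at` across all files in all revisions, without rescanning history. A toy illustration of that bookkeeping (stub classes, not ingestify's models):

    from dataclasses import dataclass, field
    from datetime import datetime, timezone
    from typing import List, Optional

    @dataclass
    class FileStub:
        modified_at: datetime

    @dataclass
    class RevisionStub:
        modified_files: List[FileStub]

        @property
        def last_modified_at(self) -> datetime:
            return max(f.modified_at for f in self.modified_files)

    @dataclass
    class DatasetStub:
        last_modified_at: Optional[datetime] = None
        revisions: List[RevisionStub] = field(default_factory=list)

        def add_revision(self, revision: RevisionStub) -> None:
            self.revisions.append(revision)
            if self.last_modified_at:
                self.last_modified_at = max(self.last_modified_at, revision.last_modified_at)
            else:
                self.last_modified_at = revision.last_modified_at

    d = DatasetStub()
    d.add_revision(RevisionStub([FileStub(datetime(2024, 1, 1, tzinfo=timezone.utc))]))
    d.add_revision(RevisionStub([FileStub(datetime(2024, 3, 1, tzinfo=timezone.utc))]))
    print(d.last_modified_at)  # 2024-03-01 00:00:00+00:00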
@@ -2,6 +2,7 @@ import itertools
  import json
  import logging
  import uuid
+ from enum import Enum
  from typing import Optional, Iterator

  from ingestify import retrieve_http
@@ -17,6 +18,7 @@ from ingestify.domain.models.resources.dataset_resource import (
  DatasetResource,
  )
  from ingestify.domain.models.task.task_summary import TaskSummary
+ from ingestify.exceptions import SaveError
  from ingestify.utils import TaskExecutor, chunker

  logger = logging.getLogger(__name__)
@@ -120,21 +122,27 @@ class UpdateDatasetTask(Task):
  with TaskSummary.update(
  self.task_id, dataset_identifier=dataset_identifier
  ) as task_summary:
- revision = self.store.update_dataset(
- dataset=self.dataset,
- name=self.dataset_resource.name,
- state=self.dataset_resource.state,
- metadata=self.dataset_resource.metadata,
- files={
- file_id: task_summary.record_load_file(
- lambda: load_file(file_resource, dataset=self.dataset),
- metadata={"file_id": file_id},
- )
- for file_id, file_resource in self.dataset_resource.files.items()
- },
- revision_source=revision_source,
- )
- task_summary.set_stats_from_revision(revision)
+
+ files = {
+ file_id: task_summary.record_load_file(
+ lambda: load_file(file_resource, dataset=self.dataset),
+ metadata={"file_id": file_id},
+ )
+ for file_id, file_resource in self.dataset_resource.files.items()
+ }
+
+ try:
+ revision = self.store.update_dataset(
+ dataset=self.dataset,
+ name=self.dataset_resource.name,
+ state=self.dataset_resource.state,
+ metadata=self.dataset_resource.metadata,
+ files=files,
+ revision_source=revision_source,
+ )
+ task_summary.set_stats_from_revision(revision)
+ except Exception as e:
+ raise SaveError("Could not update dataset") from e

  return task_summary

@@ -159,24 +167,28 @@ class CreateDatasetTask(Task):
  )

  with TaskSummary.create(self.task_id, dataset_identifier) as task_summary:
- revision = self.store.create_dataset(
- dataset_type=self.dataset_resource.dataset_type,
- provider=self.dataset_resource.provider,
- dataset_identifier=dataset_identifier,
- name=self.dataset_resource.name,
- state=self.dataset_resource.state,
- metadata=self.dataset_resource.metadata,
- files={
- file_id: task_summary.record_load_file(
- lambda: load_file(file_resource, dataset=None),
- metadata={"file_id": file_id},
- )
- for file_id, file_resource in self.dataset_resource.files.items()
- },
- revision_source=revision_source,
- )
+ files = {
+ file_id: task_summary.record_load_file(
+ lambda: load_file(file_resource, dataset=None),
+ metadata={"file_id": file_id},
+ )
+ for file_id, file_resource in self.dataset_resource.files.items()
+ }
+ try:
+ revision = self.store.create_dataset(
+ dataset_type=self.dataset_resource.dataset_type,
+ provider=self.dataset_resource.provider,
+ dataset_identifier=dataset_identifier,
+ name=self.dataset_resource.name,
+ state=self.dataset_resource.state,
+ metadata=self.dataset_resource.metadata,
+ files=files,
+ revision_source=revision_source,
+ )

- task_summary.set_stats_from_revision(revision)
+ task_summary.set_stats_from_revision(revision)
+ except Exception as e:
+ raise SaveError("Could not create dataset") from e

  return task_summary

@@ -209,6 +221,7 @@ class IngestionJob:
  with ingestion_job_summary.record_timing("get_dataset_collection"):
  dataset_collection_metadata = store.get_dataset_collection(
  dataset_type=self.ingestion_plan.dataset_type,
+ provider=self.ingestion_plan.source.provider,
  data_spec_versions=self.selector.data_spec_versions,
  selector=self.selector,
  metadata_only=True,
@@ -218,27 +231,38 @@
  # There are two different, but similar flows here:
  # 1. The discover_datasets returns a list, and the entire list can be processed at once
  # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
- with ingestion_job_summary.record_timing("find_datasets"):
- # Timing might be incorrect as it is an iterator
- dataset_resources = self.ingestion_plan.source.find_datasets(
- dataset_type=self.ingestion_plan.dataset_type,
- data_spec_versions=self.selector.data_spec_versions,
- dataset_collection_metadata=dataset_collection_metadata,
- **self.selector.custom_attributes,
- )
+ try:
+ with ingestion_job_summary.record_timing("find_datasets"):
+ dataset_resources = self.ingestion_plan.source.find_datasets(
+ dataset_type=self.ingestion_plan.dataset_type,
+ data_spec_versions=self.selector.data_spec_versions,
+ dataset_collection_metadata=dataset_collection_metadata,
+ **self.selector.custom_attributes,
+ )

- finish_task_timer = ingestion_job_summary.start_timing("tasks")
+ # We need to include the to_batches as that will start the generator
+ batches = to_batches(dataset_resources)
+ except Exception as e:
+ logger.exception("Failed to find datasets")

- batches = to_batches(dataset_resources)
+ ingestion_job_summary.set_exception(e)
+ yield ingestion_job_summary
+ return
+
+ finish_task_timer = ingestion_job_summary.start_timing("tasks")

  while True:
  try:
  batch = next(batches)
  except StopIteration:
  break
- except Exception:
- # TODO: handle exception on IngestionJob level
- raise
+ except Exception as e:
+ logger.exception("Failed to fetch next batch")
+
+ finish_task_timer()
+ ingestion_job_summary.set_exception(e)
+ yield ingestion_job_summary
+ return

  dataset_identifiers = [
  Identifier.create_from_selector(
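Note: the job's run method is a generator that yields IngestionJobSummary objects, so instead of letting a source failure propagate, the job now records the exception on the summary, yields it (so it can still be persisted via save_ingestion_job_summary), and stops. A stripped-down sketch of that control flow using stub types, not ingestify's real classes:

    from typing import Callable, Iterable, Iterator, Optional

    class SummaryStub:
        """Stand-in for IngestionJobSummary: tracks state and the captured exception."""
        def __init__(self) -> None:
            self.state = "RUNNING"
            self.exception: Optional[Exception] = None

        def set_exception(self, e: Exception) -> None:
            self.state = "FAILED"
            self.exception = e

    def run_job(find_datasets: Callable[[], Iterable]) -> Iterator[SummaryStub]:
        summary = SummaryStub()
        try:
            batches = iter(find_datasets())  # starting the iterator inside the try matters
        except Exception as e:
            summary.set_exception(e)  # the failed summary is still yielded so it can be saved
            yield summary
            return
        for _batch in batches:
            pass  # process tasks per batch, collect TaskSummary-like results
        summary.state = "FINISHED"
        yield summary

    def broken_source():
        raise RuntimeError("boom")

    for s in run_job(broken_source):
        print(s.state, s.exception)  # FAILED boom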
@@ -1,24 +1,31 @@
  import uuid
  from contextlib import contextmanager
  from datetime import datetime, timedelta
+ from enum import Enum
  from typing import Optional, List, TYPE_CHECKING
  from pydantic import Field

  from ingestify.domain import Selector, DataSpecVersionCollection
  from ingestify.domain.models.base import BaseModel
- from ingestify.domain.models.task.task_summary import TaskSummary, TaskStatus
+ from ingestify.domain.models.task.task_summary import TaskSummary, TaskState
  from ingestify.domain.models.timing import Timing
- from ingestify.utils import utcnow
+ from ingestify.utils import utcnow, HasTiming

  if TYPE_CHECKING:
  from ingestify.domain.models.ingestion.ingestion_job import IngestionJob


+ class IngestionJobState(str, Enum):
+ RUNNING = "RUNNING"
+ FINISHED = "FINISHED"
+ FAILED = "FAILED"
+
+
  def format_duration(duration: timedelta):
  return f"{duration.total_seconds():.2f}sec"


- class IngestionJobSummary(BaseModel):
+ class IngestionJobSummary(BaseModel, HasTiming):
  ingestion_job_summary_id: str
  ingestion_job_id: str

@@ -30,8 +37,8 @@ class IngestionJobSummary(BaseModel):
  selector: Selector

  started_at: datetime = Field(default_factory=utcnow)
- finished_at: Optional[datetime] = None
- timings: List[Timing] = Field(default_factory=list)
+ ended_at: Optional[datetime] = None
+ state: IngestionJobState = IngestionJobState.RUNNING
  task_summaries: List[TaskSummary] = Field(default_factory=list)

  skipped_datasets: int = 0
@@ -52,20 +59,6 @@ class IngestionJobSummary(BaseModel):
  )
  return cls(**args)

- @contextmanager
- def record_timing(self, name: str):
- start = utcnow()
- yield
- self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
- def start_timing(self, name):
- start = utcnow()
-
- def finish():
- self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
-
- return finish
-
  def add_task_summaries(self, task_summaries: List[TaskSummary]):
  self.task_summaries.extend(task_summaries)

@@ -75,46 +68,61 @@ class IngestionJobSummary(BaseModel):
  def task_count(self):
  return len(self.task_summaries)

- def set_finished(self):
+ def _set_ended(self):
  self.failed_tasks = len(
- [task for task in self.task_summaries if task.status == TaskStatus.FAILED]
+ [task for task in self.task_summaries if task.state == TaskState.FAILED]
  )
  self.successful_tasks = len(
- [task for task in self.task_summaries if task.status == TaskStatus.FINISHED]
+ [task for task in self.task_summaries if task.state == TaskState.FINISHED]
  )
  self.ignored_successful_tasks = len(
  [
  task
  for task in self.task_summaries
- if task.status == TaskStatus.FINISHED_IGNORED
+ if task.state == TaskState.FINISHED_IGNORED
  ]
  )
- self.finished_at = utcnow()
+ self.ended_at = utcnow()
+
+ # Only keep failed tasks. Rest isn't interesting
+ self.task_summaries = [
+ task for task in self.task_summaries if task.state == TaskState.FAILED
+ ]
+
+ def set_finished(self):
+ self.state = IngestionJobState.FINISHED
+ self._set_ended()
+
+ def set_exception(self, e: Exception):
+ self.state = IngestionJobState.FAILED
+ self._set_ended()

  @property
  def duration(self) -> timedelta:
- return self.finished_at - self.started_at
+ return self.ended_at - self.started_at

  def output_report(self):
- print(f"\nIngestionJobSummary finished in {format_duration(self.duration)}")
- print("--------------------")
- print(f" - IngestionPlan:")
- print(f" Source: {self.source_name}")
- print(f" Provider: {self.provider}")
- print(f" DatasetType: {self.dataset_type}")
- print(f" - Selector: {self.selector}")
- print(f" - Timings: ")
+ print(
+ f"\nIngestionJobSummary {self.state.value} in {format_duration(self.duration)}"
+ )
+ print("********************************")
+ print(f"* - IngestionPlan:")
+ print(f"* Source: {self.source_name}")
+ print(f"* Provider: {self.provider}")
+ print(f"* DatasetType: {self.dataset_type}")
+ print(f"* - Selector: {self.selector}")
+ print(f"* - Timings: ")
  for timing in self.timings:
- print(f" - {timing.name}: {format_duration(timing.duration)}")
+ print(f"* - {timing.name}: {format_duration(timing.duration)}")
  print(
- f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
+ f"* - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
  )

- print(f" - Failed tasks: {self.failed_tasks}")
- print(f" - Successful tasks: {self.successful_tasks}")
- print(f" - Successful ignored tasks: {self.ignored_successful_tasks}")
- print(f" - Skipped datasets: {self.skipped_datasets}")
- print("--------------------")
+ print(f"* - Failed tasks: {self.failed_tasks}")
+ print(f"* - Successful tasks: {self.successful_tasks}")
+ print(f"* - Successful ignored tasks: {self.ignored_successful_tasks}")
+ print(f"* - Skipped datasets: {self.skipped_datasets}")
+ print("********************************")

  def __enter__(self):
  return self
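Note: the record_timing / start_timing helpers removed from IngestionJobSummary appear to have moved into the new HasTiming mixin imported from ingestify.utils, whose diff is not part of this excerpt. Purely as an assumption about its shape, a plausible sketch reconstructed from the removed methods, with local stand-ins for utcnow and Timing so it runs on its own:

    from contextlib import contextmanager
    from datetime import datetime, timezone
    from typing import Callable, List

    from pydantic import BaseModel, Field

    # Local stand-ins; the real utcnow() and Timing model live in ingestify.
    def utcnow() -> datetime:
        return datetime.now(timezone.utc)

    class Timing(BaseModel):
        name: str
        started_at: datetime
        ended_at: datetime

    class HasTiming(BaseModel):
        """Assumed shape of the mixin: owns the `timings` field plus the two removed helpers."""
        timings: List[Timing] = Field(default_factory=list)

        @contextmanager
        def record_timing(self, name: str):
            start = utcnow()
            yield
            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

        def start_timing(self, name: str) -> Callable[[], None]:
            start = utcnow()

            def finish() -> None:
                self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

            return finish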