ingestify 0.1.3__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. {ingestify-0.1.3 → ingestify-0.3.0}/PKG-INFO +1 -1
  2. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/__init__.py +1 -1
  3. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/application/dataset_store.py +47 -36
  4. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/application/ingestion_engine.py +3 -3
  5. ingestify-0.3.0/ingestify/application/loader.py +165 -0
  6. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/__init__.py +1 -6
  7. ingestify-0.3.0/ingestify/domain/models/base.py +22 -0
  8. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/data_spec_version_collection.py +6 -0
  9. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/__init__.py +3 -5
  10. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/dataset.py +15 -32
  11. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/dataset_repository.py +1 -15
  12. ingestify-0.3.0/ingestify/domain/models/dataset/dataset_state.py +11 -0
  13. ingestify-0.3.0/ingestify/domain/models/dataset/events.py +21 -0
  14. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/file.py +21 -34
  15. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/file_collection.py +3 -1
  16. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/file_repository.py +29 -28
  17. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/revision.py +26 -3
  18. ingestify-0.3.0/ingestify/domain/models/event/domain_event.py +14 -0
  19. ingestify-0.3.0/ingestify/domain/models/ingestion/ingestion_job.py +325 -0
  20. ingestify-0.3.0/ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
  21. ingestify-0.1.3/ingestify/domain/models/extract_job.py → ingestify-0.3.0/ingestify/domain/models/ingestion/ingestion_plan.py +4 -4
  22. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/resources/dataset_resource.py +29 -37
  23. ingestify-0.3.0/ingestify/domain/models/sink.py +9 -0
  24. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/task/task.py +3 -1
  25. ingestify-0.3.0/ingestify/domain/models/task/task_summary.py +118 -0
  26. ingestify-0.3.0/ingestify/domain/models/timing.py +16 -0
  27. ingestify-0.3.0/ingestify/domain/services/identifier_key_transformer.py +111 -0
  28. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/fetch/http.py +5 -0
  29. ingestify-0.3.0/ingestify/infra/source/statsbomb_github.py +105 -0
  30. ingestify-0.3.0/ingestify/infra/store/dataset/sqlalchemy/mapping.py +336 -0
  31. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
  32. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/file/local_file_repository.py +3 -5
  33. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/file/s3_file_repository.py +4 -9
  34. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/main.py +64 -25
  35. ingestify-0.3.0/ingestify/static/templates/statsbomb_github/README.md +0 -0
  36. ingestify-0.3.0/ingestify/static/templates/wyscout/README.md +0 -0
  37. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/utils.py +15 -78
  38. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/PKG-INFO +1 -1
  39. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/SOURCES.txt +9 -2
  40. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/requires.txt +1 -0
  41. {ingestify-0.1.3 → ingestify-0.3.0}/setup.py +1 -0
  42. ingestify-0.1.3/ingestify/application/loader.py +0 -335
  43. ingestify-0.1.3/ingestify/domain/models/dataset/events.py +0 -31
  44. ingestify-0.1.3/ingestify/domain/models/event/domain_event.py +0 -10
  45. ingestify-0.1.3/ingestify/domain/models/sink.py +0 -16
  46. ingestify-0.1.3/ingestify/infra/source/statsbomb_github.py +0 -92
  47. ingestify-0.1.3/ingestify/infra/store/dataset/__init__.py +0 -2
  48. ingestify-0.1.3/ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
  49. ingestify-0.1.3/ingestify/infra/store/dataset/sqlalchemy/mapping.py +0 -153
  50. {ingestify-0.1.3 → ingestify-0.3.0}/README.md +0 -0
  51. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/application/__init__.py +0 -0
  52. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/application/secrets_manager.py +0 -0
  53. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/cmdline.py +0 -0
  54. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/__init__.py +0 -0
  55. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/collection.py +0 -0
  56. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  57. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  58. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/selector.py +0 -0
  59. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/__init__.py +0 -0
  60. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/_old_event.py +0 -0
  61. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  62. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/event_bus.py +0 -0
  63. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/publisher.py +0 -0
  64. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/event/subscriber.py +0 -0
  65. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/fetch_policy.py +0 -0
  66. {ingestify-0.1.3/ingestify/domain/services → ingestify-0.3.0/ingestify/domain/models/ingestion}/__init__.py +0 -0
  67. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/resources/__init__.py +0 -0
  68. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/source.py +0 -0
  69. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/task/__init__.py +0 -0
  70. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/task/set.py +0 -0
  71. {ingestify-0.1.3/ingestify/domain/services/transformers → ingestify-0.3.0/ingestify/domain/services}/__init__.py +0 -0
  72. {ingestify-0.1.3/ingestify/infra/fetch → ingestify-0.3.0/ingestify/domain/services/transformers}/__init__.py +0 -0
  73. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  74. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/exceptions.py +0 -0
  75. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/__init__.py +0 -0
  76. {ingestify-0.1.3/ingestify/infra/sink → ingestify-0.3.0/ingestify/infra/fetch}/__init__.py +0 -0
  77. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/serialization/__init__.py +0 -0
  78. {ingestify-0.1.3/ingestify/infra/source → ingestify-0.3.0/ingestify/infra/sink}/__init__.py +0 -0
  79. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/sink/postgresql.py +0 -0
  80. /ingestify-0.1.3/ingestify/static/templates/statsbomb_github/README.md → /ingestify-0.3.0/ingestify/infra/source/__init__.py +0 -0
  81. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/source/wyscout.py +0 -0
  82. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/__init__.py +0 -0
  83. /ingestify-0.1.3/ingestify/static/templates/wyscout/README.md → /ingestify-0.3.0/ingestify/infra/store/dataset/__init__.py +0 -0
  84. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  85. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/infra/store/file/__init__.py +0 -0
  86. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/server.py +0 -0
  87. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/source_base.py +0 -0
  88. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
  89. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
  90. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/statsbomb_github/query.py +0 -0
  91. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/wyscout/.env +0 -0
  92. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/wyscout/.gitignore +0 -0
  93. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
  94. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/wyscout/database/README.md +0 -0
  95. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify/static/templates/wyscout/query.py +0 -0
  96. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/dependency_links.txt +0 -0
  97. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/entry_points.txt +0 -0
  98. {ingestify-0.1.3 → ingestify-0.3.0}/ingestify.egg-info/top_level.txt +0 -0
  99. {ingestify-0.1.3 → ingestify-0.3.0}/setup.cfg +0 -0
{ingestify-0.1.3 → ingestify-0.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ingestify
- Version: 0.1.3
+ Version: 0.3.0
  Summary: Standardizing soccer tracking- and event data
  Author: Koen Vossen
  Author-email: info@koenvossen.nl
{ingestify-0.1.3 → ingestify-0.3.0}/ingestify/__init__.py
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
      from .infra import retrieve_http
      from .source_base import Source, DatasetResource

- __version__ = "0.1.3"
+ __version__ = "0.3.0"
{ingestify-0.1.3 → ingestify-0.3.0}/ingestify/application/dataset_store.py
@@ -5,13 +5,14 @@ import mimetypes
  import os
  import shutil
  from dataclasses import asdict
- from io import BytesIO, StringIO
+ from io import BytesIO

- from typing import Dict, List, Optional, Union, Callable, BinaryIO
+ from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable

  from ingestify.domain.models.dataset.dataset import DatasetState
  from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
  from ingestify.domain.models.dataset.file_collection import FileCollection
+ from ingestify.domain.models.dataset.revision import RevisionSource
  from ingestify.domain.models.event import EventBus
  from ingestify.domain.models import (
      Dataset,
@@ -27,7 +28,7 @@ from ingestify.domain.models import (
      Revision,
      DatasetCreated,
  )
- from ingestify.utils import utcnow, map_in_pool
+ from ingestify.utils import utcnow


  logger = logging.getLogger(__name__)
@@ -56,11 +57,16 @@ class DatasetStore:
          if self.event_bus:
              self.event_bus.dispatch(event)

+     def save_ingestion_job_summary(self, ingestion_job_summary):
+         self.dataset_repository.session.add(ingestion_job_summary)
+         self.dataset_repository.session.commit()
+
      def get_dataset_collection(
          self,
          dataset_type: Optional[str] = None,
          provider: Optional[str] = None,
          dataset_id: Optional[str] = None,
+         metadata_only: Optional[bool] = False,
          **selector,
      ) -> DatasetCollection:
          if "selector" in selector:
@@ -81,6 +87,7 @@ class DatasetStore:
              dataset_type=dataset_type,
              dataset_id=dataset_id,
              provider=provider,
+             metadata_only=metadata_only,
              selector=selector,
          )
          return dataset_collection
@@ -107,7 +114,9 @@ class DatasetStore:

          return stream, storage_size, suffix

-     def _prepare_read_stream(self) -> tuple[Callable[[BinaryIO], BytesIO], str]:
+     def _prepare_read_stream(
+         self,
+     ) -> tuple[Callable[[BinaryIO], Awaitable[BytesIO]], str]:
          if self.storage_compression_method == "gzip":

              def reader(fh: BinaryIO) -> BytesIO:
@@ -168,7 +177,11 @@ class DatasetStore:
          return modified_files_

      def add_revision(
-         self, dataset: Dataset, files: Dict[str, DraftFile], description: str = "Update"
+         self,
+         dataset: Dataset,
+         files: Dict[str, DraftFile],
+         revision_source: RevisionSource,
+         description: str = "Update",
      ):
          """
          Create new revision first, so FileRepository can use
@@ -182,46 +195,53 @@ class DatasetStore:
              # It can happen an API tells us data is changed, but it was not changed. In this case
              # we decide to ignore it.
              # Make sure there are files changed before creating a new revision
-             dataset.add_revision(
-                 Revision(
-                     revision_id=revision_id,
-                     created_at=created_at,
-                     description=description,
-                     modified_files=persisted_files_,
-                 )
+             revision = Revision(
+                 revision_id=revision_id,
+                 created_at=created_at,
+                 description=description,
+                 modified_files=persisted_files_,
+                 source=revision_source,
              )

+             dataset.add_revision(revision)
+
              self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
              self.dispatch(RevisionAdded(dataset=dataset))
              logger.info(
                  f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
              )
-             return True
          else:
              logger.info(
                  f"Ignoring a new revision without changed files -> {dataset.identifier}"
              )
-             return False
+             revision = None
+
+         return revision

      def update_dataset(
          self,
          dataset: Dataset,
-         dataset_resource: DatasetResource,
+         name: str,
+         state: DatasetState,
+         metadata: dict,
          files: Dict[str, DraftFile],
+         revision_source: RevisionSource,
      ):
          """The add_revision will also save the dataset."""
          metadata_changed = False
-         if dataset.update_from_resource(dataset_resource):
+         if dataset.update_metadata(name, metadata, state):
              self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
              metadata_changed = True

-         self.add_revision(dataset, files)
+         revision = self.add_revision(dataset, files, revision_source)

          if metadata_changed:
              # Dispatch after revision added. Otherwise, the downstream handlers are not able to see
              # the new revision
              self.dispatch(MetadataUpdated(dataset=dataset))

+         return revision
+
      def destroy_dataset(self, dataset: Dataset):
          # TODO: remove files. Now we leave some orphaned files around
          self.dataset_repository.destroy(dataset)
@@ -235,6 +255,7 @@ class DatasetStore:
          state: DatasetState,
          metadata: dict,
          files: Dict[str, DraftFile],
+         revision_source: RevisionSource,
          description: str = "Create",
      ):
          now = utcnow()
@@ -251,9 +272,10 @@ class DatasetStore:
              created_at=now,
              updated_at=now,
          )
-         self.add_revision(dataset, files, description)
+         revision = self.add_revision(dataset, files, revision_source, description)

          self.dispatch(DatasetCreated(dataset=dataset))
+         return revision

      def load_files(
          self,
@@ -271,20 +293,9 @@ class DatasetStore:
                  continue

              def get_stream(file_):
-                 revision_id = file_.revision_id
-                 if revision_id is None:
-                     revision_id = current_revision.revision_id
-
                  return reader(
                      self.file_repository.load_content(
-                         bucket=self.bucket,
-                         dataset=dataset,
-                         # When file.revision_id is set we must use it.
-                         revision_id=revision_id,
-                         filename=file_.file_id
-                         + "."
-                         + file_.data_serialization_format
-                         + suffix,
+                         bucket=self.bucket, storage_path=file_.storage_path
                      )
                  )

@@ -302,8 +313,8 @@ class DatasetStore:

          try:
              return statsbomb.load(
-                 event_data=files.get_file("events").stream,
-                 lineup_data=files.get_file("lineups").stream,
+                 event_data=(files.get_file("events")).stream,
+                 lineup_data=(files.get_file("lineups")).stream,
                  **kwargs,
              )
          except Exception as e:
@@ -333,7 +344,7 @@ class DatasetStore:
      # filename=filename,
      # )

-     def map(
-         self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
-     ):
-         return map_in_pool(fn, dataset_collection, processes)
+     # def map(
+     #     self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
+     # ):
+     #     return map_in_pool(fn, dataset_collection, processes)
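For callers of DatasetStore the practical change in this file is the signature and return value: add_revision, update_dataset and the dataset-creation path now take a RevisionSource and return the new Revision (or None when no files actually changed) instead of a bool. A minimal sketch of the new call shape, assuming `store`, `dataset`, `draft_file` and `revision_source` already exist; variable names here are illustrative, and the exact RevisionSource constructor lives in revision.py, which is only partially visible in this diff:

# Hedged sketch, not code from the package.
revision = store.add_revision(
    dataset,
    files={"events__v4": draft_file},
    revision_source=revision_source,
    description="Update",
)
if revision is None:
    # The source reported a change, but file contents were identical,
    # so no new revision was created.
    pass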
{ingestify-0.1.3 → ingestify-0.3.0}/ingestify/application/ingestion_engine.py
@@ -5,7 +5,7 @@ from typing import Optional, List

  from .loader import Loader
  from .dataset_store import DatasetStore
- from ..domain.models.extract_job import ExtractJob
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan

  logger = logging.getLogger(__name__)

@@ -18,8 +18,8 @@ class IngestionEngine:
          self.store = store
          self.loader = Loader(self.store)

-     def add_extract_job(self, extract_job: ExtractJob):
-         self.loader.add_extract_job(extract_job)
+     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+         self.loader.add_ingestion_plan(ingestion_plan)

      def load(self, dry_run: bool = False, provider: Optional[str] = None):
          self.loader.collect_and_run(dry_run=dry_run, provider=provider)
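At the API surface this is a straight rename: what 0.1.3 registered as an ExtractJob is registered as an IngestionPlan in 0.3.0. A hedged sketch of the new engine calls (the `engine` and `ingestion_plan` variables and the provider value are illustrative; IngestionPlan's constructor arguments are not shown in this diff):

# 0.1.3: engine.add_extract_job(extract_job)
engine.add_ingestion_plan(ingestion_plan)          # ingestion_plan: IngestionPlan
engine.load(dry_run=False, provider="statsbomb")   # the provider filter is optional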
ingestify-0.3.0/ingestify/application/loader.py
@@ -0,0 +1,165 @@
+ import logging
+ import platform
+ import uuid
+ from multiprocessing import set_start_method
+ from typing import List, Optional
+
+ from ingestify.domain.models import Selector
+ from ingestify.utils import TaskExecutor
+
+ from .dataset_store import DatasetStore
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+ from ..domain.models.ingestion.ingestion_job import IngestionJob
+ from ..exceptions import ConfigurationError
+
+ if platform.system() == "Darwin":
+     set_start_method("fork", force=True)
+ else:
+     set_start_method("spawn", force=True)
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Loader:
+     def __init__(self, store: DatasetStore):
+         self.store = store
+         self.ingestion_plans: List[IngestionPlan] = []
+
+     def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+         self.ingestion_plans.append(ingestion_plan)
+
+     def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
+         # First collect all selectors, before discovering datasets
+         selectors = {}
+         for ingestion_plan in self.ingestion_plans:
+             logger.info(f"Determining selectors for {ingestion_plan}")
+
+             if provider is not None:
+                 if ingestion_plan.source.provider != provider:
+                     logger.info(
+                         f"Skipping {ingestion_plan} because provider doesn't match '{provider}'"
+                     )
+                     continue
+
+             static_selectors = [
+                 selector
+                 for selector in ingestion_plan.selectors
+                 if not selector.is_dynamic
+             ]
+             dynamic_selectors = [
+                 selector for selector in ingestion_plan.selectors if selector.is_dynamic
+             ]
+
+             no_selectors = len(static_selectors) == 1 and not bool(static_selectors[0])
+             if dynamic_selectors or no_selectors:
+                 if hasattr(ingestion_plan.source, "discover_selectors"):
+                     logger.debug(
+                         f"Discovering selectors from {ingestion_plan.source.__class__.__name__}"
+                     )
+
+                     # TODO: consider making this lazy and fetch once per Source instead of
+                     # once per IngestionPlan
+                     all_selectors = ingestion_plan.source.discover_selectors(
+                         ingestion_plan.dataset_type
+                     )
+                     if no_selectors:
+                         # When there were no selectors specified, just use all of them
+                         extra_static_selectors = [
+                             Selector.build(
+                                 job_selector,
+                                 data_spec_versions=ingestion_plan.data_spec_versions,
+                             )
+                             for job_selector in all_selectors
+                         ]
+                         static_selectors = []
+                     else:
+                         extra_static_selectors = []
+                         for dynamic_selector in dynamic_selectors:
+                             dynamic_job_selectors = [
+                                 Selector.build(
+                                     job_selector,
+                                     data_spec_versions=ingestion_plan.data_spec_versions,
+                                 )
+                                 for job_selector in all_selectors
+                                 if dynamic_selector.is_match(job_selector)
+                             ]
+                             extra_static_selectors.extend(dynamic_job_selectors)
+                             logger.info(f"Added {len(dynamic_job_selectors)} selectors")
+
+                     static_selectors.extend(extra_static_selectors)
+
+                     logger.info(
+                         f"Discovered {len(extra_static_selectors)} selectors from {ingestion_plan.source.__class__.__name__}"
+                     )
+                 else:
+                     if not no_selectors:
+                         # When there are no selectors and no discover_selectors, just pass it through. It might break
+                         # later on
+                         raise ConfigurationError(
+                             f"Dynamic selectors cannot be used for "
+                             f"{ingestion_plan.source.__class__.__name__} because it doesn't support"
+                             f" selector discovery"
+                         )
+
+             # Merge selectors when source, dataset_type and actual selector is the same. This makes
+             # sure there will be only 1 dataset for this combination
+             for selector in static_selectors:
+                 key = (
+                     ingestion_plan.source.name,
+                     ingestion_plan.dataset_type,
+                     selector.key,
+                 )
+                 if existing_selector := selectors.get(key):
+                     existing_selector[1].data_spec_versions.merge(
+                         selector.data_spec_versions
+                     )
+                 else:
+                     selectors[key] = (ingestion_plan, selector)
+
+         """
+         Data is denormalized:
+
+         It actually looks like:
+         - IngestionPlan #1
+             - Selector 1.1
+             - Selector 1.2
+             - Selector 1.3
+         - IngestionPlan #2
+             - Selector 2.1
+             - Selector 2.2
+
+         We process this as:
+         - IngestionPlan #1, Selector 1.1
+         - IngestionPlan #1, Selector 1.2
+         - IngestionPlan #1, Selector 1.3
+         - IngestionPlan #2, Selector 2.1
+         - IngestionPlan #2, Selector 2.2
+
+         IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
+         """
+         for ingestion_plan, selector in selectors.values():
+             logger.info(
+                 f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
+             )
+
+             ingestion_job = IngestionJob(
+                 ingestion_job_id=str(uuid.uuid1()),
+                 ingestion_plan=ingestion_plan,
+                 selector=selector,
+             )
+
+             with TaskExecutor(dry_run=dry_run) as task_executor:
+                 for ingestion_job_summary in ingestion_job.execute(
+                     self.store, task_executor=task_executor
+                 ):
+                     # TODO: handle task_summaries
+                     # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
+                     # next run to determine where to resume.
+                     # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
+                     # extra information to determine how/where to resume
+                     ingestion_job_summary.output_report()
+                     logger.info(f"Storing IngestionJobSummary")
+                     self.store.save_ingestion_job_summary(ingestion_job_summary)
+
+         logger.info("Done")
{ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/__init__.py
@@ -11,10 +11,8 @@ from .dataset import (
      LoadedFile,
      Selector,
      Revision,
-     dataset_repository_factory,
-     file_repository_factory,
  )
- from .sink import Sink, sink_factory
+ from .sink import Sink
  from .source import Source
  from .task import Task, TaskSet
  from .data_spec_version_collection import DataSpecVersionCollection
@@ -35,11 +33,8 @@ __all__ = [
      "FileRepository",
      "FileCollection",
      "DatasetRepository",
-     "dataset_repository_factory",
-     "file_repository_factory",
      "TaskSet",
      "Task",
      "Sink",
-     "sink_factory",
      "DataSpecVersionCollection",
  ]
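The component factories are no longer exported from ingestify.domain.models, so imports of them need updating, while the repository and sink classes themselves remain available:

# Worked on 0.1.3, raises ImportError on 0.3.0:
#   from ingestify.domain.models import dataset_repository_factory, file_repository_factory, sink_factory
# Still available on 0.3.0:
from ingestify.domain.models import DatasetRepository, FileRepository, Sink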
ingestify-0.3.0/ingestify/domain/models/base.py
@@ -0,0 +1,22 @@
+ from functools import partial
+ from typing import ClassVar, Any, Optional
+
+ import pydantic
+ from pydantic import BaseModel as PydanticBaseModel, ConfigDict
+
+
+ # class BaseModel(PydanticBaseModel):
+ #     model_config = ConfigDict(arbitrary_types_allowed=True)
+ #
+ #     _sa_instance_state: Optional[dict] = None
+ from sqlalchemy.orm import MappedAsDataclass
+
+
+ class BaseModel(
+     MappedAsDataclass,
+     # DeclarativeBase,
+     dataclass_callable=partial(
+         pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
+     ),
+ ):
+     pass
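The new base class appears to combine SQLAlchemy 2.0's MappedAsDataclass machinery with pydantic's dataclass decorator (passed as dataclass_callable), so domain models such as Dataset behave like validated dataclasses while staying mappable via the imperative mapping in the new sqlalchemy/mapping.py. A minimal sketch of what a subclass gets; the Example class is hypothetical and not part of ingestify:

# Hedged sketch only.
from ingestify.domain.models.base import BaseModel

class Example(BaseModel):
    name: str
    attempts: int = 0

example = Example(name="demo")   # generated __init__; pydantic validates field types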
{ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/data_spec_version_collection.py
@@ -16,6 +16,12 @@ class DataSpecVersionCollection(dict):

          return cls(items_)

+     def to_dict(self):
+         return {
+             data_feed_key: list(data_spec_versions)
+             for data_feed_key, data_spec_versions in self.items()
+         }
+
      def copy(self):
          return DataSpecVersionCollection(copy.deepcopy(self))
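The new to_dict converts the sets of spec versions into plain lists, which keeps the structure JSON- and ORM-friendly. A small sketch, assuming the collection maps a data feed key to a set of versions (DataSpecVersionCollection subclasses dict, per the hunk header above); the feed names are illustrative:

versions = DataSpecVersionCollection({"events": {"v4", "v8"}, "lineups": {"v2"}})
versions.to_dict()   # e.g. {"events": ["v4", "v8"], "lineups": ["v2"]} (list order follows set iteration)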
{ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/__init__.py
@@ -1,8 +1,8 @@
+ from .file import DraftFile, File, LoadedFile
  from .collection import DatasetCollection
  from .dataset import Dataset
- from .dataset_repository import DatasetRepository, dataset_repository_factory
- from .file import DraftFile, File, LoadedFile
- from .file_repository import FileRepository, file_repository_factory
+ from .dataset_repository import DatasetRepository
+ from .file_repository import FileRepository
  from .file_collection import FileCollection
  from .identifier import Identifier
  from .selector import Selector
@@ -16,12 +16,10 @@ __all__ = [
      "Identifier",
      "DatasetCollection",
      "DatasetCreated",
-     "dataset_repository_factory",
      "File",
      "DraftFile",
      "LoadedFile",
      "DatasetRepository",
      "FileRepository",
-     "file_repository_factory",
      "FileCollection",
  ]
{ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/dataset.py
@@ -1,70 +1,52 @@
- from dataclasses import dataclass, field
  from datetime import datetime
  from enum import Enum
  from typing import List, Optional
+ from pydantic import Field

  from ingestify.utils import utcnow
-
+ from .dataset_state import DatasetState
  from .file import DraftFile
  from .identifier import Identifier
- from .revision import Revision
-
-
- class DatasetState(Enum):
-     SCHEDULED = "SCHEDULED"
-     PARTIAL = "PARTIAL"
-     COMPLETE = "COMPLETE"
-
-     @property
-     def is_complete(self):
-         return self == DatasetState.COMPLETE
+ from .revision import Revision, RevisionSource, SourceType
+ from ..base import BaseModel

-     def __str__(self):
-         return self.value

-
- @dataclass
- class Dataset:
+ class Dataset(BaseModel):
      bucket: str  # This must be set by the DatasetRepository
-
      dataset_id: str
      name: str
      state: DatasetState
-
      dataset_type: str
      provider: str
-
      identifier: Identifier
      metadata: dict
-
      created_at: datetime
      updated_at: datetime
-
-     revisions: List[Revision] = field(default_factory=list)
+     revisions: List[Revision] = Field(default_factory=list)

      @property
      def is_complete(self):
          return self.state.is_complete

-     def next_revision_id(self):
+     def next_revision_id(self) -> int:
          return len(self.revisions)

      def add_revision(self, revision: Revision):
          self.revisions.append(revision)
          self.updated_at = utcnow()

-     def update_from_resource(self, dataset_resource) -> bool:
+     def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
          changed = False
-         if self.name != dataset_resource.name:
-             self.name = dataset_resource.name
+         if self.name != name:
+             self.name = name
              changed = True

-         if self.metadata != dataset_resource.metadata:
-             self.metadata = dataset_resource.metadata
+         if self.metadata != metadata:
+             self.metadata = metadata
              changed = True

-         if self.state != dataset_resource.state:
-             self.state = dataset_resource.state
+         if self.state != state:
+             self.state = state
              changed = True

          if changed:
@@ -101,4 +83,5 @@ class Dataset:
              description="Squashed revision",
              is_squashed=True,
              modified_files=list(files.values()),
+             source=RevisionSource(source_type=SourceType.SQUASHED, source_id=""),
          )
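The resource-specific update hook is replaced by an explicit update_metadata, which DatasetStore now calls with the individual fields instead of a DatasetResource. A hedged usage sketch; the argument values are examples, not taken from this diff:

changed = dataset.update_metadata(
    name="Match 12345",                 # example value
    metadata={"season": "2023/2024"},   # example value
    state=DatasetState.COMPLETE,
)
# Returns True when any of name/metadata/state actually differed.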
{ingestify-0.1.3 → ingestify-0.3.0}/ingestify/domain/models/dataset/dataset_repository.py
@@ -1,16 +1,12 @@
  from abc import ABC, abstractmethod
  from typing import Optional, List, Union

- from ingestify.utils import ComponentFactory, ComponentRegistry
-
  from .collection import DatasetCollection
  from .dataset import Dataset
  from .selector import Selector

- dataset_repository_registry = ComponentRegistry()
-

- class DatasetRepository(ABC, metaclass=dataset_repository_registry.metaclass):
+ class DatasetRepository(ABC):
      @abstractmethod
      def get_dataset_collection(
          self,
@@ -34,13 +30,3 @@ class DatasetRepository(ABC, metaclass=dataset_repository_registry.metaclass):
      @abstractmethod
      def next_identity(self):
          pass
-
-     @classmethod
-     @abstractmethod
-     def supports(cls, url: str) -> bool:
-         pass
-
-
- dataset_repository_factory = ComponentFactory.build_factory(
-     DatasetRepository, dataset_repository_registry
- )
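With the registry metaclass, the supports(url) hook and the module-level factory gone, a repository implementation is now a plain ABC subclass that has to be wired up explicitly (presumably in main.py, which also changed in this release). A rough sketch; only the two abstract methods visible in this hunk are spelled out, and DatasetStore also calls save and destroy, so a real implementation needs those as well:

import uuid

class MyDatasetRepository(DatasetRepository):
    def get_dataset_collection(self, *args, **kwargs):
        ...   # query the backing store; the full signature lives in this module

    def next_identity(self):
        return str(uuid.uuid4())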
ingestify-0.3.0/ingestify/domain/models/dataset/dataset_state.py
@@ -0,0 +1,11 @@
+ from enum import Enum
+
+
+ class DatasetState(str, Enum):
+     SCHEDULED = "SCHEDULED"
+     PARTIAL = "PARTIAL"
+     COMPLETE = "COMPLETE"
+
+     @property
+     def is_complete(self):
+         return self == DatasetState.COMPLETE
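DatasetState moves to its own module and now mixes in str, so values compare equal to their raw strings and can be parsed back from stored text:

state = DatasetState("COMPLETE")   # lookup by value, e.g. when reading from the database
assert state == "COMPLETE"         # str mixin: compares equal to the raw value
assert state.is_complete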
ingestify-0.3.0/ingestify/domain/models/dataset/events.py
@@ -0,0 +1,21 @@
+ from typing import ClassVar
+
+ from pydantic import BaseModel
+
+ from ingestify.domain.models.event.domain_event import DomainEvent
+ from .dataset import Dataset
+
+
+ class DatasetCreated(DomainEvent):
+     dataset: Dataset
+     event_type: ClassVar[str] = "dataset_created"
+
+
+ class RevisionAdded(DomainEvent):
+     dataset: Dataset
+     event_type: ClassVar[str] = "revision_added"
+
+
+ class MetadataUpdated(DomainEvent):
+     dataset: Dataset
+     event_type: ClassVar[str] = "metadata_updated"
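The dataset events are rebuilt on the new DomainEvent base (defined in domain_event.py, +14 lines, not shown here) and appear to be pydantic models; event_type is a ClassVar, so it is a per-class constant rather than an instance field. A small sketch, assuming `dataset` is an existing Dataset:

event = RevisionAdded(dataset=dataset)
event.event_type            # "revision_added"; class-level constant, not a model field
RevisionAdded.event_type    # same value, available without an instance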