ingestify 0.1.2.tar.gz → 0.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. {ingestify-0.1.2 → ingestify-0.2.0}/PKG-INFO +1 -1
  2. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/__init__.py +1 -1
  3. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/application/dataset_store.py +44 -24
  4. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/application/ingestion_engine.py +3 -3
  5. ingestify-0.2.0/ingestify/application/loader.py +165 -0
  6. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/cmdline.py +2 -1
  7. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/__init__.py +1 -6
  8. ingestify-0.2.0/ingestify/domain/models/base.py +22 -0
  9. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/data_spec_version_collection.py +6 -0
  10. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/__init__.py +3 -5
  11. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/dataset.py +15 -32
  12. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/dataset_repository.py +1 -15
  13. ingestify-0.2.0/ingestify/domain/models/dataset/dataset_state.py +11 -0
  14. ingestify-0.2.0/ingestify/domain/models/dataset/events.py +21 -0
  15. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/file.py +21 -34
  16. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/file_collection.py +3 -1
  17. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/file_repository.py +1 -10
  18. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/revision.py +26 -3
  19. ingestify-0.2.0/ingestify/domain/models/event/domain_event.py +14 -0
  20. ingestify-0.2.0/ingestify/domain/models/ingestion/ingestion_job.py +292 -0
  21. ingestify-0.2.0/ingestify/domain/models/ingestion/ingestion_job_summary.py +106 -0
  22. ingestify-0.1.2/ingestify/domain/models/extract_job.py → ingestify-0.2.0/ingestify/domain/models/ingestion/ingestion_plan.py +4 -4
  23. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/resources/dataset_resource.py +29 -37
  24. ingestify-0.2.0/ingestify/domain/models/sink.py +9 -0
  25. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/task/task.py +3 -1
  26. ingestify-0.2.0/ingestify/domain/models/task/task_summary.py +118 -0
  27. ingestify-0.2.0/ingestify/domain/models/timing.py +16 -0
  28. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/fetch/http.py +5 -0
  29. ingestify-0.2.0/ingestify/infra/source/statsbomb_github.py +105 -0
  30. ingestify-0.2.0/ingestify/infra/store/dataset/sqlalchemy/mapping.py +333 -0
  31. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -22
  32. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/main.py +42 -22
  33. ingestify-0.2.0/ingestify/static/templates/statsbomb_github/README.md +0 -0
  34. ingestify-0.2.0/ingestify/static/templates/wyscout/README.md +0 -0
  35. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/utils.py +25 -78
  36. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/PKG-INFO +1 -1
  37. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/SOURCES.txt +8 -2
  38. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/requires.txt +1 -0
  39. {ingestify-0.1.2 → ingestify-0.2.0}/setup.py +1 -0
  40. ingestify-0.1.2/ingestify/application/loader.py +0 -335
  41. ingestify-0.1.2/ingestify/domain/models/dataset/events.py +0 -31
  42. ingestify-0.1.2/ingestify/domain/models/event/domain_event.py +0 -10
  43. ingestify-0.1.2/ingestify/domain/models/sink.py +0 -16
  44. ingestify-0.1.2/ingestify/infra/source/statsbomb_github.py +0 -92
  45. ingestify-0.1.2/ingestify/infra/store/dataset/__init__.py +0 -2
  46. ingestify-0.1.2/ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
  47. ingestify-0.1.2/ingestify/infra/store/dataset/sqlalchemy/mapping.py +0 -153
  48. {ingestify-0.1.2 → ingestify-0.2.0}/README.md +0 -0
  49. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/application/__init__.py +0 -0
  50. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/application/secrets_manager.py +0 -0
  51. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/__init__.py +0 -0
  52. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/collection.py +0 -0
  53. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  54. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  55. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/selector.py +0 -0
  56. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/__init__.py +0 -0
  57. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/_old_event.py +0 -0
  58. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  59. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/event_bus.py +0 -0
  60. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/publisher.py +0 -0
  61. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/event/subscriber.py +0 -0
  62. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/fetch_policy.py +0 -0
  63. {ingestify-0.1.2/ingestify/domain/services → ingestify-0.2.0/ingestify/domain/models/ingestion}/__init__.py +0 -0
  64. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/resources/__init__.py +0 -0
  65. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/source.py +0 -0
  66. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/task/__init__.py +0 -0
  67. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/task/set.py +0 -0
  68. {ingestify-0.1.2/ingestify/domain/services/transformers → ingestify-0.2.0/ingestify/domain/services}/__init__.py +0 -0
  69. {ingestify-0.1.2/ingestify/infra/fetch → ingestify-0.2.0/ingestify/domain/services/transformers}/__init__.py +0 -0
  70. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  71. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/exceptions.py +0 -0
  72. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/__init__.py +0 -0
  73. {ingestify-0.1.2/ingestify/infra/sink → ingestify-0.2.0/ingestify/infra/fetch}/__init__.py +0 -0
  74. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/serialization/__init__.py +0 -0
  75. {ingestify-0.1.2/ingestify/infra/source → ingestify-0.2.0/ingestify/infra/sink}/__init__.py +0 -0
  76. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/sink/postgresql.py +0 -0
  77. /ingestify-0.1.2/ingestify/static/templates/statsbomb_github/README.md → /ingestify-0.2.0/ingestify/infra/source/__init__.py +0 -0
  78. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/source/wyscout.py +0 -0
  79. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/__init__.py +0 -0
  80. /ingestify-0.1.2/ingestify/static/templates/wyscout/README.md → /ingestify-0.2.0/ingestify/infra/store/dataset/__init__.py +0 -0
  81. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  82. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/file/__init__.py +0 -0
  83. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  84. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  85. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/server.py +0 -0
  86. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/source_base.py +0 -0
  87. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -0
  88. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/statsbomb_github/database/README.md +0 -0
  89. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/statsbomb_github/query.py +0 -0
  90. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/wyscout/.env +0 -0
  91. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/wyscout/.gitignore +0 -0
  92. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -0
  93. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/wyscout/database/README.md +0 -0
  94. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify/static/templates/wyscout/query.py +0 -0
  95. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/dependency_links.txt +0 -0
  96. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/entry_points.txt +0 -0
  97. {ingestify-0.1.2 → ingestify-0.2.0}/ingestify.egg-info/top_level.txt +0 -0
  98. {ingestify-0.1.2 → ingestify-0.2.0}/setup.cfg +0 -0
{ingestify-0.1.2 → ingestify-0.2.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ingestify
- Version: 0.1.2
+ Version: 0.2.0
  Summary: Standardizing soccer tracking- and event data
  Author: Koen Vossen
  Author-email: info@koenvossen.nl

{ingestify-0.1.2 → ingestify-0.2.0}/ingestify/__init__.py
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
  from .infra import retrieve_http
  from .source_base import Source, DatasetResource

- __version__ = "0.1.2"
+ __version__ = "0.2.0"

{ingestify-0.1.2 → ingestify-0.2.0}/ingestify/application/dataset_store.py
@@ -5,13 +5,14 @@ import mimetypes
  import os
  import shutil
  from dataclasses import asdict
- from io import BytesIO, StringIO
+ from io import BytesIO

- from typing import Dict, List, Optional, Union, Callable, BinaryIO
+ from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable

  from ingestify.domain.models.dataset.dataset import DatasetState
  from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
  from ingestify.domain.models.dataset.file_collection import FileCollection
+ from ingestify.domain.models.dataset.revision import RevisionSource
  from ingestify.domain.models.event import EventBus
  from ingestify.domain.models import (
  Dataset,
@@ -27,7 +28,7 @@ from ingestify.domain.models import (
  Revision,
  DatasetCreated,
  )
- from ingestify.utils import utcnow, map_in_pool
+ from ingestify.utils import utcnow


  logger = logging.getLogger(__name__)
@@ -56,6 +57,10 @@ class DatasetStore:
  if self.event_bus:
  self.event_bus.dispatch(event)

+ def save_ingestion_job_summary(self, ingestion_job_summary):
+ self.dataset_repository.session.add(ingestion_job_summary)
+ self.dataset_repository.session.commit()
+
  def get_dataset_collection(
  self,
  dataset_type: Optional[str] = None,
@@ -107,7 +112,9 @@ class DatasetStore:

  return stream, storage_size, suffix

- def _prepare_read_stream(self) -> tuple[Callable[[BinaryIO], BytesIO], str]:
+ def _prepare_read_stream(
+ self,
+ ) -> tuple[Callable[[BinaryIO], Awaitable[BytesIO]], str]:
  if self.storage_compression_method == "gzip":

  def reader(fh: BinaryIO) -> BytesIO:
@@ -168,7 +175,11 @@ class DatasetStore:
  return modified_files_

  def add_revision(
- self, dataset: Dataset, files: Dict[str, DraftFile], description: str = "Update"
+ self,
+ dataset: Dataset,
+ files: Dict[str, DraftFile],
+ revision_source: RevisionSource,
+ description: str = "Update",
  ):
  """
  Create new revision first, so FileRepository can use
@@ -182,46 +193,53 @@ class DatasetStore:
  # It can happen an API tells us data is changed, but it was not changed. In this case
  # we decide to ignore it.
  # Make sure there are files changed before creating a new revision
- dataset.add_revision(
- Revision(
- revision_id=revision_id,
- created_at=created_at,
- description=description,
- modified_files=persisted_files_,
- )
+ revision = Revision(
+ revision_id=revision_id,
+ created_at=created_at,
+ description=description,
+ modified_files=persisted_files_,
+ source=revision_source,
  )

+ dataset.add_revision(revision)
+
  self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
  self.dispatch(RevisionAdded(dataset=dataset))
  logger.info(
  f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
  )
- return True
  else:
  logger.info(
  f"Ignoring a new revision without changed files -> {dataset.identifier}"
  )
- return False
+ revision = None
+
+ return revision

  def update_dataset(
  self,
  dataset: Dataset,
- dataset_resource: DatasetResource,
+ name: str,
+ state: DatasetState,
+ metadata: dict,
  files: Dict[str, DraftFile],
+ revision_source: RevisionSource,
  ):
  """The add_revision will also save the dataset."""
  metadata_changed = False
- if dataset.update_from_resource(dataset_resource):
+ if dataset.update_metadata(name, metadata, state):
  self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
  metadata_changed = True

- self.add_revision(dataset, files)
+ revision = self.add_revision(dataset, files, revision_source)

  if metadata_changed:
  # Dispatch after revision added. Otherwise, the downstream handlers are not able to see
  # the new revision
  self.dispatch(MetadataUpdated(dataset=dataset))

+ return revision
+
  def destroy_dataset(self, dataset: Dataset):
  # TODO: remove files. Now we leave some orphaned files around
  self.dataset_repository.destroy(dataset)
@@ -235,6 +253,7 @@ class DatasetStore:
  state: DatasetState,
  metadata: dict,
  files: Dict[str, DraftFile],
+ revision_source: RevisionSource,
  description: str = "Create",
  ):
  now = utcnow()
@@ -251,9 +270,10 @@ class DatasetStore:
  created_at=now,
  updated_at=now,
  )
- self.add_revision(dataset, files, description)
+ revision = self.add_revision(dataset, files, revision_source, description)

  self.dispatch(DatasetCreated(dataset=dataset))
+ return revision

  def load_files(
  self,
@@ -302,8 +322,8 @@ class DatasetStore:

  try:
  return statsbomb.load(
- event_data=files.get_file("events").stream,
- lineup_data=files.get_file("lineups").stream,
+ event_data=(files.get_file("events")).stream,
+ lineup_data=(files.get_file("lineups")).stream,
  **kwargs,
  )
  except Exception as e:
@@ -333,7 +353,7 @@ class DatasetStore:
  # filename=filename,
  # )

- def map(
- self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
- ):
- return map_in_pool(fn, dataset_collection, processes)
+ # def map(
+ # self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
+ # ):
+ # return map_in_pool(fn, dataset_collection, processes)
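
Note on the add_revision/update_dataset changes above: callers must now pass a RevisionSource, and instead of a bool they get back the new Revision (or None when no files changed). A minimal, hedged sketch of the new call contract, assuming `store`, `dataset`, `files` and `revision_source` already exist; RevisionSource itself is defined in revision.py (entry 18 in the file list), whose diff is not shown here:

    # Sketch only, not taken from the package.
    revision = store.add_revision(dataset, files, revision_source, description="Update")
    if revision is None:
        # No files changed, so no revision was created (the old API returned False here).
        pass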

{ingestify-0.1.2 → ingestify-0.2.0}/ingestify/application/ingestion_engine.py
@@ -5,7 +5,7 @@ from typing import Optional, List

  from .loader import Loader
  from .dataset_store import DatasetStore
- from ..domain.models.extract_job import ExtractJob
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan

  logger = logging.getLogger(__name__)

@@ -18,8 +18,8 @@ class IngestionEngine:
  self.store = store
  self.loader = Loader(self.store)

- def add_extract_job(self, extract_job: ExtractJob):
- self.loader.add_extract_job(extract_job)
+ def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+ self.loader.add_ingestion_plan(ingestion_plan)

  def load(self, dry_run: bool = False, provider: Optional[str] = None):
  self.loader.collect_and_run(dry_run=dry_run, provider=provider)

ingestify-0.2.0/ingestify/application/loader.py
@@ -0,0 +1,165 @@
+ import logging
+ import platform
+ import uuid
+ from multiprocessing import set_start_method
+ from typing import List, Optional
+
+ from ingestify.domain.models import Selector
+ from ingestify.utils import TaskExecutor
+
+ from .dataset_store import DatasetStore
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+ from ..domain.models.ingestion.ingestion_job import IngestionJob
+ from ..exceptions import ConfigurationError
+
+ if platform.system() == "Darwin":
+ set_start_method("fork", force=True)
+ else:
+ set_start_method("spawn", force=True)
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Loader:
+ def __init__(self, store: DatasetStore):
+ self.store = store
+ self.ingestion_plans: List[IngestionPlan] = []
+
+ def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+ self.ingestion_plans.append(ingestion_plan)
+
+ def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
+ # First collect all selectors, before discovering datasets
+ selectors = {}
+ for ingestion_plan in self.ingestion_plans:
+ if provider is not None:
+ if ingestion_plan.source.provider != provider:
+ logger.info(
+ f"Skipping {ingestion_plan} because provider doesn't match '{provider}'"
+ )
+ continue
+
+ static_selectors = [
+ selector
+ for selector in ingestion_plan.selectors
+ if not selector.is_dynamic
+ ]
+ dynamic_selectors = [
+ selector for selector in ingestion_plan.selectors if selector.is_dynamic
+ ]
+
+ no_selectors = len(static_selectors) == 1 and not bool(static_selectors[0])
+ if dynamic_selectors or no_selectors:
+ if hasattr(ingestion_plan.source, "discover_selectors"):
+ logger.debug(
+ f"Discovering selectors from {ingestion_plan.source.__class__.__name__}"
+ )
+
+ # TODO: consider making this lazy and fetch once per Source instead of
+ # once per IngestionPlan
+ all_selectors = ingestion_plan.source.discover_selectors(
+ ingestion_plan.dataset_type
+ )
+ if no_selectors:
+ # When there were no selectors specified, just use all of them
+ extra_static_selectors = [
+ Selector.build(
+ job_selector,
+ data_spec_versions=ingestion_plan.data_spec_versions,
+ )
+ for job_selector in all_selectors
+ ]
+ static_selectors = []
+ else:
+ extra_static_selectors = []
+ for dynamic_selector in dynamic_selectors:
+ dynamic_job_selectors = [
+ Selector.build(
+ job_selector,
+ data_spec_versions=ingestion_plan.data_spec_versions,
+ )
+ for job_selector in all_selectors
+ if dynamic_selector.is_match(job_selector)
+ ]
+ extra_static_selectors.extend(dynamic_job_selectors)
+ logger.info(f"Added {len(dynamic_job_selectors)} selectors")
+
+ static_selectors.extend(extra_static_selectors)
+
+ logger.info(
+ f"Discovered {len(extra_static_selectors)} selectors from {ingestion_plan.source.__class__.__name__}"
+ )
+ else:
+ if not no_selectors:
+ # When there are no selectors and no discover_selectors, just pass it through. It might break
+ # later on
+ raise ConfigurationError(
+ f"Dynamic selectors cannot be used for "
+ f"{ingestion_plan.source.__class__.__name__} because it doesn't support"
+ f" selector discovery"
+ )
+
+ # Merge selectors when source, dataset_type and actual selector is the same. This makes
+ # sure there will be only 1 dataset for this combination
+ for selector in static_selectors:
+ key = (
+ ingestion_plan.source.name,
+ ingestion_plan.dataset_type,
+ selector.key,
+ )
+ if existing_selector := selectors.get(key):
+ existing_selector[1].data_spec_versions.merge(
+ selector.data_spec_versions
+ )
+ else:
+ selectors[key] = (ingestion_plan, selector)
+
+ """
+ Data is denormalized:
+
+ It actually looks like:
+ - IngestionPlan #1
+ - Selector 1.1
+ - Selector 1.2
+ - Selector 1.3
+ - IngestionPlan #2
+ - Selector 2.1
+ - Selector 2.2
+
+ We process this as:
+ - IngestionPlan #1, Selector 1.1
+ - IngestionPlan #1, Selector 1.2
+ - IngestionPlan #1, Selector 1.3
+ - IngestionPlan #2, Selector 2.1
+ - IngestionPlan #2, Selector 2.2
+
+ IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
+ """
+ for ingestion_plan, selector in selectors.values():
+ logger.debug(
+ f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
+ )
+
+ ingestion_job = IngestionJob(
+ ingestion_job_id=str(uuid.uuid1()),
+ ingestion_plan=ingestion_plan,
+ selector=selector,
+ )
+
+ with TaskExecutor(dry_run=dry_run) as task_executor:
+ ingestion_job_summary = ingestion_job.execute(
+ self.store, task_executor=task_executor
+ )
+
+ # TODO: handle task_summaries
+ # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
+ # next run to determine where to resume.
+ # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
+ # extra information to determine how/where to resume
+ ingestion_job_summary.set_finished()
+
+ ingestion_job_summary.output_report()
+ self.store.save_ingestion_job_summary(ingestion_job_summary)
+
+ logger.info("Done")
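
For orientation, a minimal usage sketch of the new Loader (not taken from the package), assuming a configured DatasetStore and an IngestionPlan built elsewhere; IngestionEngine wires it up the same way in ingestion_engine.py above:

    loader = Loader(store)                     # store: a configured DatasetStore
    loader.add_ingestion_plan(ingestion_plan)  # plan carries source, dataset_type, selectors
    loader.collect_and_run(dry_run=True)       # resolve selectors, run IngestionJobs, save summaries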

{ingestify-0.1.2 → ingestify-0.2.0}/ingestify/cmdline.py
@@ -12,6 +12,7 @@ from ingestify.exceptions import ConfigurationError
  from ingestify.main import get_engine

  from ingestify import __version__
+ from ingestify.utils import try_number

  logger = logging.getLogger(__name__)
  #
@@ -174,7 +175,7 @@ def delete_dataset(
  if "=" in dataset_id:
  selector = {
  # TODO: this `int` will might break stuff. Issue here is the int != str
- _[0]: int(_[1])
+ _[0]: try_number(_[1])
  for _ in [_.split("=") for _ in dataset_id.split("/")]
  }
  else:

{ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/__init__.py
@@ -11,10 +11,8 @@ from .dataset import (
  LoadedFile,
  Selector,
  Revision,
- dataset_repository_factory,
- file_repository_factory,
  )
- from .sink import Sink, sink_factory
+ from .sink import Sink
  from .source import Source
  from .task import Task, TaskSet
  from .data_spec_version_collection import DataSpecVersionCollection
@@ -35,11 +33,8 @@ __all__ = [
  "FileRepository",
  "FileCollection",
  "DatasetRepository",
- "dataset_repository_factory",
- "file_repository_factory",
  "TaskSet",
  "Task",
  "Sink",
- "sink_factory",
  "DataSpecVersionCollection",
  ]

ingestify-0.2.0/ingestify/domain/models/base.py
@@ -0,0 +1,22 @@
+ from functools import partial
+ from typing import ClassVar, Any, Optional
+
+ import pydantic
+ from pydantic import BaseModel as PydanticBaseModel, ConfigDict
+
+
+ # class BaseModel(PydanticBaseModel):
+ # model_config = ConfigDict(arbitrary_types_allowed=True)
+ #
+ # _sa_instance_state: Optional[dict] = None
+ from sqlalchemy.orm import MappedAsDataclass
+
+
+ class BaseModel(
+ MappedAsDataclass,
+ # DeclarativeBase,
+ dataclass_callable=partial(
+ pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
+ ),
+ ):
+ pass
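
The intent here appears to be a shared base that combines SQLAlchemy's MappedAsDataclass with pydantic dataclasses, so domain models get a generated, validating __init__ while remaining mappable (see the new sqlalchemy mapping.py in the file list). A hedged sketch of a subclass, assuming the same pattern the new Dataset class below relies on; the field names are illustrative only:

    # Sketch only, not taken from the package.
    class Example(BaseModel):
        example_id: str
        payload: dict

    example = Example(example_id="1", payload={"key": "value"})  # fields validated by pydantic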

{ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/data_spec_version_collection.py
@@ -16,6 +16,12 @@ class DataSpecVersionCollection(dict):

  return cls(items_)

+ def to_dict(self):
+ return {
+ data_feed_key: list(data_spec_versions)
+ for data_feed_key, data_spec_versions in self.items()
+ }
+
  def copy(self):
  return DataSpecVersionCollection(copy.deepcopy(self))


{ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/__init__.py
@@ -1,8 +1,8 @@
+ from .file import DraftFile, File, LoadedFile
  from .collection import DatasetCollection
  from .dataset import Dataset
- from .dataset_repository import DatasetRepository, dataset_repository_factory
- from .file import DraftFile, File, LoadedFile
- from .file_repository import FileRepository, file_repository_factory
+ from .dataset_repository import DatasetRepository
+ from .file_repository import FileRepository
  from .file_collection import FileCollection
  from .identifier import Identifier
  from .selector import Selector
@@ -16,12 +16,10 @@ __all__ = [
  "Identifier",
  "DatasetCollection",
  "DatasetCreated",
- "dataset_repository_factory",
  "File",
  "DraftFile",
  "LoadedFile",
  "DatasetRepository",
  "FileRepository",
- "file_repository_factory",
  "FileCollection",
  ]

{ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/dataset.py
@@ -1,70 +1,52 @@
- from dataclasses import dataclass, field
  from datetime import datetime
  from enum import Enum
  from typing import List, Optional
+ from pydantic import Field

  from ingestify.utils import utcnow
-
+ from .dataset_state import DatasetState
  from .file import DraftFile
  from .identifier import Identifier
- from .revision import Revision
-
-
- class DatasetState(Enum):
- SCHEDULED = "SCHEDULED"
- PARTIAL = "PARTIAL"
- COMPLETE = "COMPLETE"
-
- @property
- def is_complete(self):
- return self == DatasetState.COMPLETE
+ from .revision import Revision, RevisionSource, SourceType
+ from ..base import BaseModel

- def __str__(self):
- return self.value

-
- @dataclass
- class Dataset:
+ class Dataset(BaseModel):
  bucket: str # This must be set by the DatasetRepository
-
  dataset_id: str
  name: str
  state: DatasetState
-
  dataset_type: str
  provider: str
-
  identifier: Identifier
  metadata: dict
-
  created_at: datetime
  updated_at: datetime
-
- revisions: List[Revision] = field(default_factory=list)
+ revisions: List[Revision] = Field(default_factory=list)

  @property
  def is_complete(self):
  return self.state.is_complete

- def next_revision_id(self):
+ def next_revision_id(self) -> int:
  return len(self.revisions)

  def add_revision(self, revision: Revision):
  self.revisions.append(revision)
  self.updated_at = utcnow()

- def update_from_resource(self, dataset_resource) -> bool:
+ def update_metadata(self, name: str, metadata: dict, state: DatasetState) -> bool:
  changed = False
- if self.name != dataset_resource.name:
- self.name = dataset_resource.name
+ if self.name != name:
+ self.name = name
  changed = True

- if self.metadata != dataset_resource.metadata:
- self.metadata = dataset_resource.metadata
+ if self.metadata != metadata:
+ self.metadata = metadata
  changed = True

- if self.state != dataset_resource.state:
- self.state = dataset_resource.state
+ if self.state != state:
+ self.state = state
  changed = True

  if changed:
@@ -101,4 +83,5 @@ class Dataset:
  description="Squashed revision",
  is_squashed=True,
  modified_files=list(files.values()),
+ source=RevisionSource(source_type=SourceType.SQUASHED, source_id=""),
  )

{ingestify-0.1.2 → ingestify-0.2.0}/ingestify/domain/models/dataset/dataset_repository.py
@@ -1,16 +1,12 @@
  from abc import ABC, abstractmethod
  from typing import Optional, List, Union

- from ingestify.utils import ComponentFactory, ComponentRegistry
-
  from .collection import DatasetCollection
  from .dataset import Dataset
  from .selector import Selector

- dataset_repository_registry = ComponentRegistry()
-

- class DatasetRepository(ABC, metaclass=dataset_repository_registry.metaclass):
+ class DatasetRepository(ABC):
  @abstractmethod
  def get_dataset_collection(
  self,
@@ -34,13 +30,3 @@ class DatasetRepository(ABC, metaclass=dataset_repository_registry.metaclass):
  @abstractmethod
  def next_identity(self):
  pass
-
- @classmethod
- @abstractmethod
- def supports(cls, url: str) -> bool:
- pass
-
-
- dataset_repository_factory = ComponentFactory.build_factory(
- DatasetRepository, dataset_repository_registry
- )

ingestify-0.2.0/ingestify/domain/models/dataset/dataset_state.py
@@ -0,0 +1,11 @@
+ from enum import Enum
+
+
+ class DatasetState(str, Enum):
+ SCHEDULED = "SCHEDULED"
+ PARTIAL = "PARTIAL"
+ COMPLETE = "COMPLETE"
+
+ @property
+ def is_complete(self):
+ return self == DatasetState.COMPLETE
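
DatasetState now mixes in str, so members compare and serialize as plain strings. A small illustration (not from the package), using only what the new module defines:

    assert DatasetState.COMPLETE == "COMPLETE"        # str mixin: equal to its value
    assert DatasetState("PARTIAL") is DatasetState.PARTIAL
    assert DatasetState.COMPLETE.is_complete and not DatasetState.SCHEDULED.is_complete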

ingestify-0.2.0/ingestify/domain/models/dataset/events.py
@@ -0,0 +1,21 @@
+ from typing import ClassVar
+
+ from pydantic import BaseModel
+
+ from ingestify.domain.models.event.domain_event import DomainEvent
+ from .dataset import Dataset
+
+
+ class DatasetCreated(DomainEvent):
+ dataset: Dataset
+ event_type: ClassVar[str] = "dataset_created"
+
+
+ class RevisionAdded(DomainEvent):
+ dataset: Dataset
+ event_type: ClassVar[str] = "revision_added"
+
+
+ class MetadataUpdated(DomainEvent):
+ dataset: Dataset
+ event_type: ClassVar[str] = "metadata_updated"