ingestify 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. ingestify/__init__.py +1 -1
  2. ingestify/application/dataset_store.py +44 -24
  3. ingestify/application/ingestion_engine.py +3 -3
  4. ingestify/application/loader.py +67 -237
  5. ingestify/domain/models/__init__.py +1 -6
  6. ingestify/domain/models/base.py +22 -0
  7. ingestify/domain/models/data_spec_version_collection.py +6 -0
  8. ingestify/domain/models/dataset/__init__.py +3 -5
  9. ingestify/domain/models/dataset/dataset.py +15 -32
  10. ingestify/domain/models/dataset/dataset_repository.py +1 -15
  11. ingestify/domain/models/dataset/dataset_state.py +11 -0
  12. ingestify/domain/models/dataset/events.py +6 -16
  13. ingestify/domain/models/dataset/file.py +21 -34
  14. ingestify/domain/models/dataset/file_collection.py +3 -1
  15. ingestify/domain/models/dataset/file_repository.py +1 -10
  16. ingestify/domain/models/dataset/revision.py +26 -3
  17. ingestify/domain/models/event/domain_event.py +8 -4
  18. ingestify/domain/models/ingestion/__init__.py +0 -0
  19. ingestify/domain/models/ingestion/ingestion_job.py +292 -0
  20. ingestify/domain/models/ingestion/ingestion_job_summary.py +106 -0
  21. ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
  22. ingestify/domain/models/resources/dataset_resource.py +29 -37
  23. ingestify/domain/models/sink.py +1 -8
  24. ingestify/domain/models/task/task.py +3 -1
  25. ingestify/domain/models/task/task_summary.py +118 -0
  26. ingestify/domain/models/timing.py +16 -0
  27. ingestify/infra/fetch/http.py +5 -0
  28. ingestify/infra/source/statsbomb_github.py +67 -54
  29. ingestify/infra/store/dataset/__init__.py +0 -2
  30. ingestify/infra/store/dataset/sqlalchemy/mapping.py +184 -4
  31. ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -22
  32. ingestify/main.py +42 -22
  33. ingestify/utils.py +15 -78
  34. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/METADATA +2 -1
  35. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/RECORD +38 -32
  36. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/WHEEL +1 -1
  37. ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
  38. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/entry_points.txt +0 -0
  39. {ingestify-0.1.3.dist-info → ingestify-0.2.0.dist-info}/top_level.txt +0 -0
ingestify/__init__.py CHANGED
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
  from .infra import retrieve_http
  from .source_base import Source, DatasetResource
 
- __version__ = "0.1.3"
+ __version__ = "0.2.0"
ingestify/application/dataset_store.py CHANGED
@@ -5,13 +5,14 @@ import mimetypes
  import os
  import shutil
  from dataclasses import asdict
- from io import BytesIO, StringIO
+ from io import BytesIO
 
- from typing import Dict, List, Optional, Union, Callable, BinaryIO
+ from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable
 
  from ingestify.domain.models.dataset.dataset import DatasetState
  from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
  from ingestify.domain.models.dataset.file_collection import FileCollection
+ from ingestify.domain.models.dataset.revision import RevisionSource
  from ingestify.domain.models.event import EventBus
  from ingestify.domain.models import (
  Dataset,
@@ -27,7 +28,7 @@ from ingestify.domain.models import (
  Revision,
  DatasetCreated,
  )
- from ingestify.utils import utcnow, map_in_pool
+ from ingestify.utils import utcnow
 
 
  logger = logging.getLogger(__name__)
@@ -56,6 +57,10 @@ class DatasetStore:
  if self.event_bus:
  self.event_bus.dispatch(event)
 
+ def save_ingestion_job_summary(self, ingestion_job_summary):
+ self.dataset_repository.session.add(ingestion_job_summary)
+ self.dataset_repository.session.commit()
+
  def get_dataset_collection(
  self,
  dataset_type: Optional[str] = None,
@@ -107,7 +112,9 @@ class DatasetStore:
 
  return stream, storage_size, suffix
 
- def _prepare_read_stream(self) -> tuple[Callable[[BinaryIO], BytesIO], str]:
+ def _prepare_read_stream(
+ self,
+ ) -> tuple[Callable[[BinaryIO], Awaitable[BytesIO]], str]:
  if self.storage_compression_method == "gzip":
 
  def reader(fh: BinaryIO) -> BytesIO:
@@ -168,7 +175,11 @@ class DatasetStore:
  return modified_files_
 
  def add_revision(
- self, dataset: Dataset, files: Dict[str, DraftFile], description: str = "Update"
+ self,
+ dataset: Dataset,
+ files: Dict[str, DraftFile],
+ revision_source: RevisionSource,
+ description: str = "Update",
  ):
  """
  Create new revision first, so FileRepository can use
@@ -182,46 +193,53 @@ class DatasetStore:
  # It can happen an API tells us data is changed, but it was not changed. In this case
  # we decide to ignore it.
  # Make sure there are files changed before creating a new revision
- dataset.add_revision(
- Revision(
- revision_id=revision_id,
- created_at=created_at,
- description=description,
- modified_files=persisted_files_,
- )
+ revision = Revision(
+ revision_id=revision_id,
+ created_at=created_at,
+ description=description,
+ modified_files=persisted_files_,
+ source=revision_source,
  )
 
+ dataset.add_revision(revision)
+
  self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
  self.dispatch(RevisionAdded(dataset=dataset))
  logger.info(
  f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
  )
- return True
  else:
  logger.info(
  f"Ignoring a new revision without changed files -> {dataset.identifier}"
  )
- return False
+ revision = None
+
+ return revision
 
  def update_dataset(
  self,
  dataset: Dataset,
- dataset_resource: DatasetResource,
+ name: str,
+ state: DatasetState,
+ metadata: dict,
  files: Dict[str, DraftFile],
+ revision_source: RevisionSource,
  ):
  """The add_revision will also save the dataset."""
  metadata_changed = False
- if dataset.update_from_resource(dataset_resource):
+ if dataset.update_metadata(name, metadata, state):
  self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
  metadata_changed = True
 
- self.add_revision(dataset, files)
+ revision = self.add_revision(dataset, files, revision_source)
 
  if metadata_changed:
  # Dispatch after revision added. Otherwise, the downstream handlers are not able to see
  # the new revision
  self.dispatch(MetadataUpdated(dataset=dataset))
 
+ return revision
+
  def destroy_dataset(self, dataset: Dataset):
  # TODO: remove files. Now we leave some orphaned files around
  self.dataset_repository.destroy(dataset)
@@ -235,6 +253,7 @@
  state: DatasetState,
  metadata: dict,
  files: Dict[str, DraftFile],
+ revision_source: RevisionSource,
  description: str = "Create",
  ):
  now = utcnow()
@@ -251,9 +270,10 @@
  created_at=now,
  updated_at=now,
  )
- self.add_revision(dataset, files, description)
+ revision = self.add_revision(dataset, files, revision_source, description)
 
  self.dispatch(DatasetCreated(dataset=dataset))
+ return revision
 
  def load_files(
  self,
@@ -302,8 +322,8 @@
 
  try:
  return statsbomb.load(
- event_data=files.get_file("events").stream,
- lineup_data=files.get_file("lineups").stream,
+ event_data=(files.get_file("events")).stream,
+ lineup_data=(files.get_file("lineups")).stream,
  **kwargs,
  )
  except Exception as e:
@@ -333,7 +353,7 @@
  # filename=filename,
  # )
 
- def map(
- self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
- ):
- return map_in_pool(fn, dataset_collection, processes)
+ # def map(
+ # self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
+ # ):
+ # return map_in_pool(fn, dataset_collection, processes)
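The net effect for callers: add_revision and update_dataset now require a RevisionSource and return the created Revision (or None) instead of a bool. A minimal calling sketch, assuming store, dataset, draft_files and revision_source already exist (the RevisionSource constructor lives in ingestify/domain/models/dataset/revision.py and is not shown in this section):

# Hedged usage sketch of the 0.2.0 calling convention; `store`, `dataset`,
# `draft_files` and `revision_source` are assumed to exist already.
revision = store.add_revision(
    dataset=dataset,
    files=draft_files,                # Dict[str, DraftFile]
    revision_source=revision_source,  # new required argument in 0.2.0
    description="Update",
)
if revision is None:
    # 0.1.3 returned False here; 0.2.0 returns None when no files changed
    ...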
ingestify/application/ingestion_engine.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, List
 
  from .loader import Loader
  from .dataset_store import DatasetStore
- from ..domain.models.extract_job import ExtractJob
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
 
  logger = logging.getLogger(__name__)
 
@@ -18,8 +18,8 @@ class IngestionEngine:
  self.store = store
  self.loader = Loader(self.store)
 
- def add_extract_job(self, extract_job: ExtractJob):
- self.loader.add_extract_job(extract_job)
+ def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+ self.loader.add_ingestion_plan(ingestion_plan)
 
  def load(self, dry_run: bool = False, provider: Optional[str] = None):
  self.loader.collect_and_run(dry_run=dry_run, provider=provider)
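For code driving the engine directly, the change is a rename plus a type swap; a sketch of the before/after, assuming an `engine: IngestionEngine` instance (IngestionPlan's constructor is assumed to mirror the old ExtractJob, per the file rename in this diff):

from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan

# 0.1.3:
# engine.add_extract_job(extract_job)        # extract_job: ExtractJob

# 0.2.0:
engine.add_ingestion_plan(ingestion_plan)    # ingestion_plan: IngestionPlan
engine.load(dry_run=False, provider=None)    # unchanged entry point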
ingestify/application/loader.py CHANGED
@@ -1,19 +1,15 @@
- import itertools
- import json
  import logging
  import platform
- from multiprocessing import set_start_method, cpu_count
+ import uuid
+ from multiprocessing import set_start_method
  from typing import List, Optional
 
- from ingestify.domain.models import Dataset, Identifier, Selector, Source, Task, TaskSet
- from ingestify.utils import map_in_pool, TaskExecutor, chunker
+ from ingestify.domain.models import Selector
+ from ingestify.utils import TaskExecutor
 
  from .dataset_store import DatasetStore
- from .. import DatasetResource, retrieve_http
- from ..domain import DraftFile
- from ..domain.models.data_spec_version_collection import DataSpecVersionCollection
- from ..domain.models.extract_job import ExtractJob
- from ..domain.models.resources.dataset_resource import FileResource
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+ from ..domain.models.ingestion.ingestion_job import IngestionJob
  from ..exceptions import ConfigurationError
 
  if platform.system() == "Darwin":
@@ -25,176 +21,52 @@ else:
  logger = logging.getLogger(__name__)
 
 
- DEFAULT_CHUNK_SIZE = 1000
-
-
- def to_batches(input_):
- if isinstance(input_, list):
- batches = [input_]
- else:
- # Assume it's an iterator. Peek what's inside, and put it back
- try:
- peek = next(input_)
- except StopIteration:
- # Nothing to batch
- return []
-
- input_ = itertools.chain([peek], input_)
-
- if not isinstance(peek, list):
- batches = chunker(input_, DEFAULT_CHUNK_SIZE)
- else:
- batches = input_
- return batches
-
-
- def load_file(
- file_resource: FileResource, dataset: Optional[Dataset] = None
- ) -> Optional[DraftFile]:
- current_file = None
- if dataset:
- current_file = dataset.current_revision.modified_files_map.get(
- file_resource.file_id
- )
-
- if file_resource.json_content is not None:
- # Empty dictionary is allowed
- file = DraftFile.from_input(
- file_=json.dumps(file_resource.json_content, indent=4),
- data_serialization_format="json",
- data_feed_key=file_resource.data_feed_key,
- data_spec_version=file_resource.data_spec_version,
- modified_at=file_resource.last_modified,
- )
- if current_file and current_file.tag == file.tag:
- # Nothing changed
- return None
- return file
- elif file_resource.url:
- http_options = {}
- if file_resource.http_options:
- for k, v in file_resource.http_options.items():
- http_options[f"http_{k}"] = v
-
- return retrieve_http(
- url=file_resource.url,
- current_file=current_file,
- file_data_feed_key=file_resource.data_feed_key,
- file_data_spec_version=file_resource.data_spec_version,
- file_data_serialization_format=file_resource.data_serialization_format
- or "txt",
- **http_options,
- **file_resource.loader_kwargs,
- )
- else:
- return file_resource.file_loader(
- file_resource,
- current_file,
- # TODO: check how to fix this with typehints
- **file_resource.loader_kwargs,
- )
-
-
- class UpdateDatasetTask(Task):
- def __init__(
- self,
- dataset: Dataset,
- dataset_resource: DatasetResource,
- store: DatasetStore,
- ):
- self.dataset = dataset
- self.dataset_resource = dataset_resource
- self.store = store
-
- def run(self):
- self.store.update_dataset(
- dataset=self.dataset,
- dataset_resource=self.dataset_resource,
- files={
- file_id: load_file(file_resource, dataset=self.dataset)
- for file_id, file_resource in self.dataset_resource.files.items()
- },
- )
-
- def __repr__(self):
- return f"UpdateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
-
-
- class CreateDatasetTask(Task):
- def __init__(
- self,
- dataset_resource: DatasetResource,
- store: DatasetStore,
- ):
- self.dataset_resource = dataset_resource
- self.store = store
-
- def run(self):
- self.store.create_dataset(
- dataset_type=self.dataset_resource.dataset_type,
- provider=self.dataset_resource.provider,
- dataset_identifier=Identifier(**self.dataset_resource.dataset_resource_id),
- name=self.dataset_resource.name,
- state=self.dataset_resource.state,
- metadata=self.dataset_resource.metadata,
- files={
- file_id: load_file(file_resource)
- for file_id, file_resource in self.dataset_resource.files.items()
- },
- )
-
- def __repr__(self):
- return f"CreateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
-
-
  class Loader:
  def __init__(self, store: DatasetStore):
  self.store = store
- self.extract_jobs: List[ExtractJob] = []
+ self.ingestion_plans: List[IngestionPlan] = []
 
- def add_extract_job(self, extract_job: ExtractJob):
- self.extract_jobs.append(extract_job)
+ def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+ self.ingestion_plans.append(ingestion_plan)
 
  def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
- total_dataset_count = 0
-
  # First collect all selectors, before discovering datasets
  selectors = {}
- for extract_job in self.extract_jobs:
+ for ingestion_plan in self.ingestion_plans:
  if provider is not None:
- if extract_job.source.provider != provider:
+ if ingestion_plan.source.provider != provider:
  logger.info(
- f"Skipping {extract_job } because provider doesn't match '{provider}'"
+ f"Skipping {ingestion_plan} because provider doesn't match '{provider}'"
  )
  continue
 
  static_selectors = [
  selector
- for selector in extract_job.selectors
+ for selector in ingestion_plan.selectors
  if not selector.is_dynamic
  ]
  dynamic_selectors = [
- selector for selector in extract_job.selectors if selector.is_dynamic
+ selector for selector in ingestion_plan.selectors if selector.is_dynamic
  ]
 
  no_selectors = len(static_selectors) == 1 and not bool(static_selectors[0])
  if dynamic_selectors or no_selectors:
- if hasattr(extract_job.source, "discover_selectors"):
+ if hasattr(ingestion_plan.source, "discover_selectors"):
  logger.debug(
- f"Discovering selectors from {extract_job.source.__class__.__name__}"
+ f"Discovering selectors from {ingestion_plan.source.__class__.__name__}"
  )
 
  # TODO: consider making this lazy and fetch once per Source instead of
- # once per ExtractJob
- all_selectors = extract_job.source.discover_selectors(
- extract_job.dataset_type
+ # once per IngestionPlan
+ all_selectors = ingestion_plan.source.discover_selectors(
+ ingestion_plan.dataset_type
  )
  if no_selectors:
  # When there were no selectors specified, just use all of them
  extra_static_selectors = [
  Selector.build(
  job_selector,
- data_spec_versions=extract_job.data_spec_versions,
+ data_spec_versions=ingestion_plan.data_spec_versions,
  )
  for job_selector in all_selectors
  ]
@@ -205,7 +77,7 @@ class Loader:
  dynamic_job_selectors = [
  Selector.build(
  job_selector,
- data_spec_versions=extract_job.data_spec_versions,
+ data_spec_versions=ingestion_plan.data_spec_versions,
  )
  for job_selector in all_selectors
  if dynamic_selector.is_match(job_selector)
@@ -216,7 +88,7 @@ class Loader:
  static_selectors.extend(extra_static_selectors)
 
  logger.info(
- f"Discovered {len(extra_static_selectors)} selectors from {extract_job.source.__class__.__name__}"
+ f"Discovered {len(extra_static_selectors)} selectors from {ingestion_plan.source.__class__.__name__}"
  )
  else:
  if not no_selectors:
@@ -224,112 +96,70 @@ class Loader:
  # later on
  raise ConfigurationError(
  f"Dynamic selectors cannot be used for "
- f"{extract_job.source.__class__.__name__} because it doesn't support"
+ f"{ingestion_plan.source.__class__.__name__} because it doesn't support"
  f" selector discovery"
  )
 
  # Merge selectors when source, dataset_type and actual selector is the same. This makes
  # sure there will be only 1 dataset for this combination
  for selector in static_selectors:
- key = (extract_job.source.name, extract_job.dataset_type, selector.key)
+ key = (
+ ingestion_plan.source.name,
+ ingestion_plan.dataset_type,
+ selector.key,
+ )
  if existing_selector := selectors.get(key):
  existing_selector[1].data_spec_versions.merge(
  selector.data_spec_versions
  )
  else:
- selectors[key] = (extract_job, selector)
-
- def run_task(task):
- logger.info(f"Running task {task}")
- task.run()
-
- for extract_job, selector in selectors.values():
+ selectors[key] = (ingestion_plan, selector)
+
+ """
+ Data is denormalized:
+
+ It actually looks like:
+ - IngestionPlan #1
+ - Selector 1.1
+ - Selector 1.2
+ - Selector 1.3
+ - IngestionPlan #2
+ - Selector 2.1
+ - Selector 2.2
+
+ We process this as:
+ - IngestionPlan #1, Selector 1.1
+ - IngestionPlan #1, Selector 1.2
+ - IngestionPlan #1, Selector 1.3
+ - IngestionPlan #2, Selector 2.1
+ - IngestionPlan #2, Selector 2.2
+
+ IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
+ """
+ for ingestion_plan, selector in selectors.values():
  logger.debug(
- f"Discovering datasets from {extract_job.source.__class__.__name__} using selector {selector}"
+ f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
  )
 
- dataset_collection_metadata = self.store.get_dataset_collection(
- dataset_type=extract_job.dataset_type,
- data_spec_versions=selector.data_spec_versions,
+ ingestion_job = IngestionJob(
+ ingestion_job_id=str(uuid.uuid1()),
+ ingestion_plan=ingestion_plan,
  selector=selector,
- metadata_only=True,
- ).metadata
-
- # There are two different, but similar flows here:
- # 1. The discover_datasets returns a list, and the entire list can be processed at once
- # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
- datasets = extract_job.source.find_datasets(
- dataset_type=extract_job.dataset_type,
- data_spec_versions=selector.data_spec_versions,
- dataset_collection_metadata=dataset_collection_metadata,
- **selector.custom_attributes,
  )
 
- batches = to_batches(datasets)
-
- for batch in batches:
- dataset_identifiers = [
- Identifier.create_from_selector(
- selector, **dataset_resource.dataset_resource_id
- )
- # We have to pass the data_spec_versions here as a Source can add some
- # extra data to the identifier which is retrieved in a certain data format
- for dataset_resource in batch
- ]
-
- # Load all available datasets based on the discovered dataset identifiers
- dataset_collection = self.store.get_dataset_collection(
- dataset_type=extract_job.dataset_type,
- # Assume all DatasetResources share the same provider
- provider=batch[0].provider,
- selector=dataset_identifiers,
+ with TaskExecutor(dry_run=dry_run) as task_executor:
+ ingestion_job_summary = ingestion_job.execute(
+ self.store, task_executor=task_executor
  )
 
- skip_count = 0
- total_dataset_count += len(dataset_identifiers)
-
- task_set = TaskSet()
- for dataset_resource in batch:
- dataset_identifier = Identifier.create_from_selector(
- selector, **dataset_resource.dataset_resource_id
- )
-
- if dataset := dataset_collection.get(dataset_identifier):
- if extract_job.fetch_policy.should_refetch(
- dataset, dataset_resource
- ):
- task_set.add(
- UpdateDatasetTask(
- dataset=dataset, # Current dataset from the database
- dataset_resource=dataset_resource, # Most recent dataset_resource
- store=self.store,
- )
- )
- else:
- skip_count += 1
- else:
- if extract_job.fetch_policy.should_fetch(dataset_resource):
- task_set.add(
- CreateDatasetTask(
- dataset_resource=dataset_resource,
- store=self.store,
- )
- )
- else:
- skip_count += 1
+ # TODO: handle task_summaries
+ # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
+ # next run to determine where to resume.
+ # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
+ # extra information to determine how/where to resume
+ ingestion_job_summary.set_finished()
 
- if task_set:
- logger.info(
- f"Discovered {len(dataset_identifiers)} datasets from {extract_job.source.__class__.__name__} "
- f"using selector {selector} => {len(task_set)} tasks. {skip_count} skipped."
- )
- logger.info(f"Running {len(task_set)} tasks")
- with TaskExecutor(dry_run=dry_run) as task_executor:
- task_executor.run(run_task, task_set)
- else:
- logger.info(
- f"Discovered {len(dataset_identifiers)} datasets from {extract_job.source.__class__.__name__} "
- f"using selector {selector} => nothing to do"
- )
+ ingestion_job_summary.output_report()
+ self.store.save_ingestion_job_summary(ingestion_job_summary)
 
  logger.info("Done")
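The docstring added above describes the denormalized (plan, selector) pairs; a small illustrative snippet of that expansion, with made-up names, showing why each pair ends up with its own IngestionJob and, after execution, its own IngestionJobSummary:

# Illustration only; the plan/selector names are placeholders.
plans = {
    "IngestionPlan #1": ["Selector 1.1", "Selector 1.2", "Selector 1.3"],
    "IngestionPlan #2": ["Selector 2.1", "Selector 2.2"],
}

pairs = [
    (plan, selector)
    for plan, selectors in plans.items()
    for selector in selectors
]
# Five pairs -> five IngestionJob executions -> five IngestionJobSummary records.
for plan, selector in pairs:
    print(plan, selector)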
ingestify/domain/models/__init__.py CHANGED
@@ -11,10 +11,8 @@ from .dataset import (
  LoadedFile,
  Selector,
  Revision,
- dataset_repository_factory,
- file_repository_factory,
  )
- from .sink import Sink, sink_factory
+ from .sink import Sink
  from .source import Source
  from .task import Task, TaskSet
  from .data_spec_version_collection import DataSpecVersionCollection
@@ -35,11 +33,8 @@ __all__ = [
  "FileRepository",
  "FileCollection",
  "DatasetRepository",
- "dataset_repository_factory",
- "file_repository_factory",
  "TaskSet",
  "Task",
  "Sink",
- "sink_factory",
  "DataSpecVersionCollection",
  ]
ingestify/domain/models/base.py ADDED
@@ -0,0 +1,22 @@
+ from functools import partial
+ from typing import ClassVar, Any, Optional
+
+ import pydantic
+ from pydantic import BaseModel as PydanticBaseModel, ConfigDict
+
+
+ # class BaseModel(PydanticBaseModel):
+ # model_config = ConfigDict(arbitrary_types_allowed=True)
+ #
+ # _sa_instance_state: Optional[dict] = None
+ from sqlalchemy.orm import MappedAsDataclass
+
+
+ class BaseModel(
+ MappedAsDataclass,
+ # DeclarativeBase,
+ dataclass_callable=partial(
+ pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
+ ),
+ ):
+ pass
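The new base class combines SQLAlchemy's MappedAsDataclass with pydantic's dataclass decorator. A self-contained sketch of the same technique, following SQLAlchemy 2.0's documented pattern for alternate dataclass providers; the Base and Example classes below are illustrative, not ingestify's own models:

from functools import partial

import pydantic
from pydantic import ConfigDict
from sqlalchemy import String
from sqlalchemy.orm import DeclarativeBase, Mapped, MappedAsDataclass, mapped_column


class Base(
    MappedAsDataclass,
    DeclarativeBase,
    # Build the dataclass via pydantic instead of the stdlib, so constructor
    # arguments are validated; arbitrary_types_allowed lets pydantic accept
    # SQLAlchemy-specific types on the model.
    dataclass_callable=partial(
        pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
    ),
):
    pass


class Example(Base):
    __tablename__ = "example"

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(50))


Example(id=1, name="ok")  # constructed like a dataclass, validated by pydantic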
ingestify/domain/models/data_spec_version_collection.py CHANGED
@@ -16,6 +16,12 @@ class DataSpecVersionCollection(dict):
 
  return cls(items_)
 
+ def to_dict(self):
+ return {
+ data_feed_key: list(data_spec_versions)
+ for data_feed_key, data_spec_versions in self.items()
+ }
+
  def copy(self):
  return DataSpecVersionCollection(copy.deepcopy(self))
 
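A usage sketch of the new helper; the feed keys and versions below are made up. The collection is a dict subclass, and to_dict materializes each value (a set of spec versions in practice, matching the list() call above) as a list:

collection = DataSpecVersionCollection({"events": {"v4", "v5"}, "lineups": {"v2"}})

collection.to_dict()
# e.g. {"events": ["v4", "v5"], "lineups": ["v2"]} -- list values, so the
# structure is JSON-serializable (set ordering is not guaranteed).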
ingestify/domain/models/dataset/__init__.py CHANGED
@@ -1,8 +1,8 @@
+ from .file import DraftFile, File, LoadedFile
  from .collection import DatasetCollection
  from .dataset import Dataset
- from .dataset_repository import DatasetRepository, dataset_repository_factory
- from .file import DraftFile, File, LoadedFile
- from .file_repository import FileRepository, file_repository_factory
+ from .dataset_repository import DatasetRepository
+ from .file_repository import FileRepository
  from .file_collection import FileCollection
  from .identifier import Identifier
  from .selector import Selector
@@ -16,12 +16,10 @@ __all__ = [
  "Identifier",
  "DatasetCollection",
  "DatasetCreated",
- "dataset_repository_factory",
  "File",
  "DraftFile",
  "LoadedFile",
  "DatasetRepository",
  "FileRepository",
- "file_repository_factory",
  "FileCollection",
  ]
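Downstream imports of the removed factory helpers need updating; a sketch of the import-surface change, limited to the names visible in this diff:

# 0.1.3 -- these imports worked:
# from ingestify.domain.models.dataset import dataset_repository_factory, file_repository_factory
# from ingestify.domain.models import sink_factory

# 0.2.0 -- the factory helpers are no longer exported; the types still are:
from ingestify.domain.models.dataset import DatasetRepository, FileRepository
from ingestify.domain.models import Sink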