ingestify 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. ingestify/__init__.py +1 -1
  2. ingestify/application/dataset_store.py +47 -36
  3. ingestify/application/ingestion_engine.py +3 -3
  4. ingestify/application/loader.py +71 -241
  5. ingestify/domain/models/__init__.py +1 -6
  6. ingestify/domain/models/base.py +22 -0
  7. ingestify/domain/models/data_spec_version_collection.py +6 -0
  8. ingestify/domain/models/dataset/__init__.py +3 -5
  9. ingestify/domain/models/dataset/dataset.py +15 -32
  10. ingestify/domain/models/dataset/dataset_repository.py +1 -15
  11. ingestify/domain/models/dataset/dataset_state.py +11 -0
  12. ingestify/domain/models/dataset/events.py +6 -16
  13. ingestify/domain/models/dataset/file.py +21 -34
  14. ingestify/domain/models/dataset/file_collection.py +3 -1
  15. ingestify/domain/models/dataset/file_repository.py +29 -28
  16. ingestify/domain/models/dataset/revision.py +26 -3
  17. ingestify/domain/models/event/domain_event.py +8 -4
  18. ingestify/domain/models/ingestion/__init__.py +0 -0
  19. ingestify/domain/models/ingestion/ingestion_job.py +325 -0
  20. ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
  21. ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
  22. ingestify/domain/models/resources/dataset_resource.py +29 -37
  23. ingestify/domain/models/sink.py +1 -8
  24. ingestify/domain/models/task/task.py +3 -1
  25. ingestify/domain/models/task/task_summary.py +118 -0
  26. ingestify/domain/models/timing.py +16 -0
  27. ingestify/domain/services/identifier_key_transformer.py +111 -0
  28. ingestify/infra/fetch/http.py +5 -0
  29. ingestify/infra/source/statsbomb_github.py +67 -54
  30. ingestify/infra/store/dataset/__init__.py +0 -2
  31. ingestify/infra/store/dataset/sqlalchemy/mapping.py +187 -4
  32. ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
  33. ingestify/infra/store/file/local_file_repository.py +3 -5
  34. ingestify/infra/store/file/s3_file_repository.py +4 -9
  35. ingestify/main.py +64 -25
  36. ingestify/utils.py +15 -78
  37. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA +2 -1
  38. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD +41 -34
  39. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL +1 -1
  40. ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
  41. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
  42. {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0
ingestify/__init__.py CHANGED
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
  from .infra import retrieve_http
  from .source_base import Source, DatasetResource
 
- __version__ = "0.1.3"
+ __version__ = "0.3.0"
ingestify/application/dataset_store.py CHANGED
@@ -5,13 +5,14 @@ import mimetypes
  import os
  import shutil
  from dataclasses import asdict
- from io import BytesIO, StringIO
+ from io import BytesIO
 
- from typing import Dict, List, Optional, Union, Callable, BinaryIO
+ from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable
 
  from ingestify.domain.models.dataset.dataset import DatasetState
  from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
  from ingestify.domain.models.dataset.file_collection import FileCollection
+ from ingestify.domain.models.dataset.revision import RevisionSource
  from ingestify.domain.models.event import EventBus
  from ingestify.domain.models import (
  Dataset,
@@ -27,7 +28,7 @@ from ingestify.domain.models import (
  Revision,
  DatasetCreated,
  )
- from ingestify.utils import utcnow, map_in_pool
+ from ingestify.utils import utcnow
 
 
  logger = logging.getLogger(__name__)
@@ -56,11 +57,16 @@ class DatasetStore:
  if self.event_bus:
  self.event_bus.dispatch(event)
 
+ def save_ingestion_job_summary(self, ingestion_job_summary):
+ self.dataset_repository.session.add(ingestion_job_summary)
+ self.dataset_repository.session.commit()
+
  def get_dataset_collection(
  self,
  dataset_type: Optional[str] = None,
  provider: Optional[str] = None,
  dataset_id: Optional[str] = None,
+ metadata_only: Optional[bool] = False,
  **selector,
  ) -> DatasetCollection:
  if "selector" in selector:
@@ -81,6 +87,7 @@ class DatasetStore:
  dataset_type=dataset_type,
  dataset_id=dataset_id,
  provider=provider,
+ metadata_only=metadata_only,
  selector=selector,
  )
  return dataset_collection
@@ -107,7 +114,9 @@ class DatasetStore:
 
  return stream, storage_size, suffix
 
- def _prepare_read_stream(self) -> tuple[Callable[[BinaryIO], BytesIO], str]:
+ def _prepare_read_stream(
+ self,
+ ) -> tuple[Callable[[BinaryIO], Awaitable[BytesIO]], str]:
  if self.storage_compression_method == "gzip":
 
  def reader(fh: BinaryIO) -> BytesIO:
@@ -168,7 +177,11 @@ class DatasetStore:
  return modified_files_
 
  def add_revision(
- self, dataset: Dataset, files: Dict[str, DraftFile], description: str = "Update"
+ self,
+ dataset: Dataset,
+ files: Dict[str, DraftFile],
+ revision_source: RevisionSource,
+ description: str = "Update",
  ):
  """
  Create new revision first, so FileRepository can use
@@ -182,46 +195,53 @@ class DatasetStore:
  # It can happen an API tells us data is changed, but it was not changed. In this case
  # we decide to ignore it.
  # Make sure there are files changed before creating a new revision
- dataset.add_revision(
- Revision(
- revision_id=revision_id,
- created_at=created_at,
- description=description,
- modified_files=persisted_files_,
- )
+ revision = Revision(
+ revision_id=revision_id,
+ created_at=created_at,
+ description=description,
+ modified_files=persisted_files_,
+ source=revision_source,
  )
 
+ dataset.add_revision(revision)
+
  self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
  self.dispatch(RevisionAdded(dataset=dataset))
  logger.info(
  f"Added a new revision to {dataset.identifier} -> {', '.join([file.file_id for file in persisted_files_])}"
  )
- return True
  else:
  logger.info(
  f"Ignoring a new revision without changed files -> {dataset.identifier}"
  )
- return False
+ revision = None
+
+ return revision
 
  def update_dataset(
  self,
  dataset: Dataset,
- dataset_resource: DatasetResource,
+ name: str,
+ state: DatasetState,
+ metadata: dict,
  files: Dict[str, DraftFile],
+ revision_source: RevisionSource,
  ):
  """The add_revision will also save the dataset."""
  metadata_changed = False
- if dataset.update_from_resource(dataset_resource):
+ if dataset.update_metadata(name, metadata, state):
  self.dataset_repository.save(bucket=self.bucket, dataset=dataset)
  metadata_changed = True
 
- self.add_revision(dataset, files)
+ revision = self.add_revision(dataset, files, revision_source)
 
  if metadata_changed:
  # Dispatch after revision added. Otherwise, the downstream handlers are not able to see
  # the new revision
  self.dispatch(MetadataUpdated(dataset=dataset))
 
+ return revision
+
  def destroy_dataset(self, dataset: Dataset):
  # TODO: remove files. Now we leave some orphaned files around
  self.dataset_repository.destroy(dataset)
@@ -235,6 +255,7 @@
  state: DatasetState,
  metadata: dict,
  files: Dict[str, DraftFile],
+ revision_source: RevisionSource,
  description: str = "Create",
  ):
  now = utcnow()
@@ -251,9 +272,10 @@
  created_at=now,
  updated_at=now,
  )
- self.add_revision(dataset, files, description)
+ revision = self.add_revision(dataset, files, revision_source, description)
 
  self.dispatch(DatasetCreated(dataset=dataset))
+ return revision
 
  def load_files(
  self,
@@ -271,20 +293,9 @@
  continue
 
  def get_stream(file_):
- revision_id = file_.revision_id
- if revision_id is None:
- revision_id = current_revision.revision_id
-
  return reader(
  self.file_repository.load_content(
- bucket=self.bucket,
- dataset=dataset,
- # When file.revision_id is set we must use it.
- revision_id=revision_id,
- filename=file_.file_id
- + "."
- + file_.data_serialization_format
- + suffix,
+ bucket=self.bucket, storage_path=file_.storage_path
  )
  )
 
@@ -302,8 +313,8 @@
 
  try:
  return statsbomb.load(
- event_data=files.get_file("events").stream,
- lineup_data=files.get_file("lineups").stream,
+ event_data=(files.get_file("events")).stream,
+ lineup_data=(files.get_file("lineups")).stream,
  **kwargs,
  )
  except Exception as e:
@@ -333,7 +344,7 @@
  # filename=filename,
  # )
 
- def map(
- self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
- ):
- return map_in_pool(fn, dataset_collection, processes)
+ # def map(
+ # self, fn, dataset_collection: DatasetCollection, processes: Optional[int] = None
+ # ):
+ # return map_in_pool(fn, dataset_collection, processes)
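
Note on the changed contract (a hedged sketch, not code from the package): add_revision() and update_dataset() now take a RevisionSource and return the created Revision, or None when no files changed, instead of True/False. Assuming a DatasetStore, a Dataset, a Dict[str, DraftFile] and a RevisionSource constructed elsewhere, caller code would look roughly like:

# store, dataset, draft_files and revision_source are assumed to exist already;
# only the call shape is taken from this diff, the surrounding code is illustrative.
revision = store.add_revision(
    dataset=dataset,
    files=draft_files,
    revision_source=revision_source,
    description="Update",
)
if revision is None:
    # nothing changed, so no new revision was created
    pass
else:
    print(f"created revision {revision.revision_id}")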
ingestify/application/ingestion_engine.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, List
 
  from .loader import Loader
  from .dataset_store import DatasetStore
- from ..domain.models.extract_job import ExtractJob
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
 
  logger = logging.getLogger(__name__)
 
@@ -18,8 +18,8 @@ class IngestionEngine:
  self.store = store
  self.loader = Loader(self.store)
 
- def add_extract_job(self, extract_job: ExtractJob):
- self.loader.add_extract_job(extract_job)
+ def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+ self.loader.add_ingestion_plan(ingestion_plan)
 
  def load(self, dry_run: bool = False, provider: Optional[str] = None):
  self.loader.collect_and_run(dry_run=dry_run, provider=provider)
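
Migration note (hedged): ExtractJob has been reworked into IngestionPlan, and IngestionEngine.add_extract_job() is now add_ingestion_plan(). A caller that previously registered extract jobs would now do something like the sketch below; engine and ingestion_plan are assumed to be built elsewhere (e.g. via ingestify.main), only the method names and the load() signature come from this diff.

from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan

engine.add_ingestion_plan(ingestion_plan)   # was: engine.add_extract_job(extract_job)
engine.load(dry_run=False, provider=None)   # unchanged in this diff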
ingestify/application/loader.py CHANGED
@@ -1,19 +1,15 @@
- import itertools
- import json
  import logging
  import platform
- from multiprocessing import set_start_method, cpu_count
+ import uuid
+ from multiprocessing import set_start_method
  from typing import List, Optional
 
- from ingestify.domain.models import Dataset, Identifier, Selector, Source, Task, TaskSet
- from ingestify.utils import map_in_pool, TaskExecutor, chunker
+ from ingestify.domain.models import Selector
+ from ingestify.utils import TaskExecutor
 
  from .dataset_store import DatasetStore
- from .. import DatasetResource, retrieve_http
- from ..domain import DraftFile
- from ..domain.models.data_spec_version_collection import DataSpecVersionCollection
- from ..domain.models.extract_job import ExtractJob
- from ..domain.models.resources.dataset_resource import FileResource
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+ from ..domain.models.ingestion.ingestion_job import IngestionJob
  from ..exceptions import ConfigurationError
 
  if platform.system() == "Darwin":
@@ -25,176 +21,54 @@ else:
  logger = logging.getLogger(__name__)
 
 
- DEFAULT_CHUNK_SIZE = 1000
-
-
- def to_batches(input_):
- if isinstance(input_, list):
- batches = [input_]
- else:
- # Assume it's an iterator. Peek what's inside, and put it back
- try:
- peek = next(input_)
- except StopIteration:
- # Nothing to batch
- return []
-
- input_ = itertools.chain([peek], input_)
-
- if not isinstance(peek, list):
- batches = chunker(input_, DEFAULT_CHUNK_SIZE)
- else:
- batches = input_
- return batches
-
-
- def load_file(
- file_resource: FileResource, dataset: Optional[Dataset] = None
- ) -> Optional[DraftFile]:
- current_file = None
- if dataset:
- current_file = dataset.current_revision.modified_files_map.get(
- file_resource.file_id
- )
-
- if file_resource.json_content is not None:
- # Empty dictionary is allowed
- file = DraftFile.from_input(
- file_=json.dumps(file_resource.json_content, indent=4),
- data_serialization_format="json",
- data_feed_key=file_resource.data_feed_key,
- data_spec_version=file_resource.data_spec_version,
- modified_at=file_resource.last_modified,
- )
- if current_file and current_file.tag == file.tag:
- # Nothing changed
- return None
- return file
- elif file_resource.url:
- http_options = {}
- if file_resource.http_options:
- for k, v in file_resource.http_options.items():
- http_options[f"http_{k}"] = v
-
- return retrieve_http(
- url=file_resource.url,
- current_file=current_file,
- file_data_feed_key=file_resource.data_feed_key,
- file_data_spec_version=file_resource.data_spec_version,
- file_data_serialization_format=file_resource.data_serialization_format
- or "txt",
- **http_options,
- **file_resource.loader_kwargs,
- )
- else:
- return file_resource.file_loader(
- file_resource,
- current_file,
- # TODO: check how to fix this with typehints
- **file_resource.loader_kwargs,
- )
-
-
- class UpdateDatasetTask(Task):
- def __init__(
- self,
- dataset: Dataset,
- dataset_resource: DatasetResource,
- store: DatasetStore,
- ):
- self.dataset = dataset
- self.dataset_resource = dataset_resource
- self.store = store
-
- def run(self):
- self.store.update_dataset(
- dataset=self.dataset,
- dataset_resource=self.dataset_resource,
- files={
- file_id: load_file(file_resource, dataset=self.dataset)
- for file_id, file_resource in self.dataset_resource.files.items()
- },
- )
-
- def __repr__(self):
- return f"UpdateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
-
-
- class CreateDatasetTask(Task):
- def __init__(
- self,
- dataset_resource: DatasetResource,
- store: DatasetStore,
- ):
- self.dataset_resource = dataset_resource
- self.store = store
-
- def run(self):
- self.store.create_dataset(
- dataset_type=self.dataset_resource.dataset_type,
- provider=self.dataset_resource.provider,
- dataset_identifier=Identifier(**self.dataset_resource.dataset_resource_id),
- name=self.dataset_resource.name,
- state=self.dataset_resource.state,
- metadata=self.dataset_resource.metadata,
- files={
- file_id: load_file(file_resource)
- for file_id, file_resource in self.dataset_resource.files.items()
- },
- )
-
- def __repr__(self):
- return f"CreateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"
-
-
  class Loader:
  def __init__(self, store: DatasetStore):
  self.store = store
- self.extract_jobs: List[ExtractJob] = []
+ self.ingestion_plans: List[IngestionPlan] = []
 
- def add_extract_job(self, extract_job: ExtractJob):
- self.extract_jobs.append(extract_job)
+ def add_ingestion_plan(self, ingestion_plan: IngestionPlan):
+ self.ingestion_plans.append(ingestion_plan)
 
  def collect_and_run(self, dry_run: bool = False, provider: Optional[str] = None):
- total_dataset_count = 0
-
  # First collect all selectors, before discovering datasets
  selectors = {}
- for extract_job in self.extract_jobs:
+ for ingestion_plan in self.ingestion_plans:
+ logger.info(f"Determining selectors for {ingestion_plan}")
+
  if provider is not None:
- if extract_job.source.provider != provider:
+ if ingestion_plan.source.provider != provider:
  logger.info(
- f"Skipping {extract_job } because provider doesn't match '{provider}'"
+ f"Skipping {ingestion_plan} because provider doesn't match '{provider}'"
  )
  continue
 
  static_selectors = [
  selector
- for selector in extract_job.selectors
+ for selector in ingestion_plan.selectors
  if not selector.is_dynamic
  ]
  dynamic_selectors = [
- selector for selector in extract_job.selectors if selector.is_dynamic
+ selector for selector in ingestion_plan.selectors if selector.is_dynamic
  ]
 
  no_selectors = len(static_selectors) == 1 and not bool(static_selectors[0])
  if dynamic_selectors or no_selectors:
- if hasattr(extract_job.source, "discover_selectors"):
+ if hasattr(ingestion_plan.source, "discover_selectors"):
  logger.debug(
- f"Discovering selectors from {extract_job.source.__class__.__name__}"
+ f"Discovering selectors from {ingestion_plan.source.__class__.__name__}"
  )
 
  # TODO: consider making this lazy and fetch once per Source instead of
- # once per ExtractJob
- all_selectors = extract_job.source.discover_selectors(
- extract_job.dataset_type
+ # once per IngestionPlan
+ all_selectors = ingestion_plan.source.discover_selectors(
+ ingestion_plan.dataset_type
  )
  if no_selectors:
  # When there were no selectors specified, just use all of them
  extra_static_selectors = [
  Selector.build(
  job_selector,
- data_spec_versions=extract_job.data_spec_versions,
+ data_spec_versions=ingestion_plan.data_spec_versions,
  )
  for job_selector in all_selectors
  ]
@@ -205,7 +79,7 @@ class Loader:
  dynamic_job_selectors = [
  Selector.build(
  job_selector,
- data_spec_versions=extract_job.data_spec_versions,
+ data_spec_versions=ingestion_plan.data_spec_versions,
  )
  for job_selector in all_selectors
  if dynamic_selector.is_match(job_selector)
@@ -216,7 +90,7 @@
  static_selectors.extend(extra_static_selectors)
 
  logger.info(
- f"Discovered {len(extra_static_selectors)} selectors from {extract_job.source.__class__.__name__}"
+ f"Discovered {len(extra_static_selectors)} selectors from {ingestion_plan.source.__class__.__name__}"
  )
  else:
  if not no_selectors:
@@ -224,112 +98,68 @@
  # later on
  raise ConfigurationError(
  f"Dynamic selectors cannot be used for "
- f"{extract_job.source.__class__.__name__} because it doesn't support"
+ f"{ingestion_plan.source.__class__.__name__} because it doesn't support"
  f" selector discovery"
  )
 
  # Merge selectors when source, dataset_type and actual selector is the same. This makes
  # sure there will be only 1 dataset for this combination
  for selector in static_selectors:
- key = (extract_job.source.name, extract_job.dataset_type, selector.key)
+ key = (
+ ingestion_plan.source.name,
+ ingestion_plan.dataset_type,
+ selector.key,
+ )
  if existing_selector := selectors.get(key):
  existing_selector[1].data_spec_versions.merge(
  selector.data_spec_versions
  )
  else:
- selectors[key] = (extract_job, selector)
-
- def run_task(task):
- logger.info(f"Running task {task}")
- task.run()
-
- for extract_job, selector in selectors.values():
- logger.debug(
- f"Discovering datasets from {extract_job.source.__class__.__name__} using selector {selector}"
+ selectors[key] = (ingestion_plan, selector)
+
+ """
+ Data is denormalized:
+
+ It actually looks like:
+ - IngestionPlan #1
+ - Selector 1.1
+ - Selector 1.2
+ - Selector 1.3
+ - IngestionPlan #2
+ - Selector 2.1
+ - Selector 2.2
+
+ We process this as:
+ - IngestionPlan #1, Selector 1.1
+ - IngestionPlan #1, Selector 1.2
+ - IngestionPlan #1, Selector 1.3
+ - IngestionPlan #2, Selector 2.1
+ - IngestionPlan #2, Selector 2.2
+
+ IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
+ """
+ for ingestion_plan, selector in selectors.values():
+ logger.info(
+ f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
  )
 
- dataset_collection_metadata = self.store.get_dataset_collection(
- dataset_type=extract_job.dataset_type,
- data_spec_versions=selector.data_spec_versions,
+ ingestion_job = IngestionJob(
+ ingestion_job_id=str(uuid.uuid1()),
+ ingestion_plan=ingestion_plan,
  selector=selector,
- metadata_only=True,
- ).metadata
-
- # There are two different, but similar flows here:
- # 1. The discover_datasets returns a list, and the entire list can be processed at once
- # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
- datasets = extract_job.source.find_datasets(
- dataset_type=extract_job.dataset_type,
- data_spec_versions=selector.data_spec_versions,
- dataset_collection_metadata=dataset_collection_metadata,
- **selector.custom_attributes,
  )
 
- batches = to_batches(datasets)
-
- for batch in batches:
- dataset_identifiers = [
- Identifier.create_from_selector(
- selector, **dataset_resource.dataset_resource_id
- )
- # We have to pass the data_spec_versions here as a Source can add some
- # extra data to the identifier which is retrieved in a certain data format
- for dataset_resource in batch
- ]
-
- # Load all available datasets based on the discovered dataset identifiers
- dataset_collection = self.store.get_dataset_collection(
- dataset_type=extract_job.dataset_type,
- # Assume all DatasetResources share the same provider
- provider=batch[0].provider,
- selector=dataset_identifiers,
- )
-
- skip_count = 0
- total_dataset_count += len(dataset_identifiers)
-
- task_set = TaskSet()
- for dataset_resource in batch:
- dataset_identifier = Identifier.create_from_selector(
- selector, **dataset_resource.dataset_resource_id
- )
-
- if dataset := dataset_collection.get(dataset_identifier):
- if extract_job.fetch_policy.should_refetch(
- dataset, dataset_resource
- ):
- task_set.add(
- UpdateDatasetTask(
- dataset=dataset, # Current dataset from the database
- dataset_resource=dataset_resource, # Most recent dataset_resource
- store=self.store,
- )
- )
- else:
- skip_count += 1
- else:
- if extract_job.fetch_policy.should_fetch(dataset_resource):
- task_set.add(
- CreateDatasetTask(
- dataset_resource=dataset_resource,
- store=self.store,
- )
- )
- else:
- skip_count += 1
-
- if task_set:
- logger.info(
- f"Discovered {len(dataset_identifiers)} datasets from {extract_job.source.__class__.__name__} "
- f"using selector {selector} => {len(task_set)} tasks. {skip_count} skipped."
- )
- logger.info(f"Running {len(task_set)} tasks")
- with TaskExecutor(dry_run=dry_run) as task_executor:
- task_executor.run(run_task, task_set)
- else:
- logger.info(
- f"Discovered {len(dataset_identifiers)} datasets from {extract_job.source.__class__.__name__} "
- f"using selector {selector} => nothing to do"
- )
+ with TaskExecutor(dry_run=dry_run) as task_executor:
+ for ingestion_job_summary in ingestion_job.execute(
+ self.store, task_executor=task_executor
+ ):
+ # TODO: handle task_summaries
+ # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
+ # next run to determine where to resume.
+ # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
+ # extra information to determine how/where to resume
+ ingestion_job_summary.output_report()
+ logger.info(f"Storing IngestionJobSummary")
+ self.store.save_ingestion_job_summary(ingestion_job_summary)
 
  logger.info("Done")
ingestify/domain/models/__init__.py CHANGED
@@ -11,10 +11,8 @@ from .dataset import (
  LoadedFile,
  Selector,
  Revision,
- dataset_repository_factory,
- file_repository_factory,
  )
- from .sink import Sink, sink_factory
+ from .sink import Sink
  from .source import Source
  from .task import Task, TaskSet
  from .data_spec_version_collection import DataSpecVersionCollection
@@ -35,11 +33,8 @@ __all__ = [
  "FileRepository",
  "FileCollection",
  "DatasetRepository",
- "dataset_repository_factory",
- "file_repository_factory",
  "TaskSet",
  "Task",
  "Sink",
- "sink_factory",
  "DataSpecVersionCollection",
  ]
ingestify/domain/models/base.py ADDED
@@ -0,0 +1,22 @@
+ from functools import partial
+ from typing import ClassVar, Any, Optional
+
+ import pydantic
+ from pydantic import BaseModel as PydanticBaseModel, ConfigDict
+
+
+ # class BaseModel(PydanticBaseModel):
+ # model_config = ConfigDict(arbitrary_types_allowed=True)
+ #
+ # _sa_instance_state: Optional[dict] = None
+ from sqlalchemy.orm import MappedAsDataclass
+
+
+ class BaseModel(
+ MappedAsDataclass,
+ # DeclarativeBase,
+ dataclass_callable=partial(
+ pydantic.dataclasses.dataclass, config=ConfigDict(arbitrary_types_allowed=True)
+ ),
+ ):
+ pass
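
The new BaseModel swaps SQLAlchemy's default dataclass machinery for pydantic dataclasses via dataclass_callable. A standalone sketch of what that swap buys, using only pydantic (the class name is illustrative, not from the package): fields are validated on construction, and arbitrary_types_allowed lets non-pydantic field types pass through.

import pydantic
from pydantic import ConfigDict


@pydantic.dataclasses.dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class ExampleSummary:
    ingestion_job_id: str
    finished: bool = False


ExampleSummary(ingestion_job_id="abc")        # ok, field types validated
# ExampleSummary(ingestion_job_id=object())   # would raise pydantic.ValidationError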