ingestify 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ingestify/__init__.py CHANGED
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
  from .infra import retrieve_http
  from .source_base import Source, DatasetResource

- __version__ = "0.2.0"
+ __version__ = "0.3.0"
ingestify/application/dataset_store.py CHANGED
@@ -66,6 +66,7 @@ class DatasetStore:
  dataset_type: Optional[str] = None,
  provider: Optional[str] = None,
  dataset_id: Optional[str] = None,
+ metadata_only: Optional[bool] = False,
  **selector,
  ) -> DatasetCollection:
  if "selector" in selector:
@@ -86,6 +87,7 @@ class DatasetStore:
  dataset_type=dataset_type,
  dataset_id=dataset_id,
  provider=provider,
+ metadata_only=metadata_only,
  selector=selector,
  )
  return dataset_collection
@@ -291,20 +293,9 @@ class DatasetStore:
  continue

  def get_stream(file_):
- revision_id = file_.revision_id
- if revision_id is None:
- revision_id = current_revision.revision_id
-
  return reader(
  self.file_repository.load_content(
- bucket=self.bucket,
- dataset=dataset,
- # When file.revision_id is set we must use it.
- revision_id=revision_id,
- filename=file_.file_id
- + "."
- + file_.data_serialization_format
- + suffix,
+ bucket=self.bucket, storage_path=file_.storage_path
  )
  )
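
Two behavioural changes stand out in dataset_store.py: get_dataset_collection() gains a metadata_only flag (when set, only collection-level metadata is loaded, no Dataset rows), and file contents are now read via the storage_path persisted on each File instead of being rebuilt from bucket, dataset, revision and filename. A minimal sketch of both, assuming an existing DatasetStore named store and hypothetical selector values:

collection_metadata = store.get_dataset_collection(
    dataset_type="match", provider="statsbomb", metadata_only=True
).metadata  # collection-level metadata only; no Dataset rows are loaded

stream = store.file_repository.load_content(
    storage_path=file.storage_path  # `file` is assumed to be a File whose storage_path was saved on write
)
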
ingestify/application/loader.py CHANGED
@@ -33,6 +33,8 @@ class Loader:
  # First collect all selectors, before discovering datasets
  selectors = {}
  for ingestion_plan in self.ingestion_plans:
+ logger.info(f"Determining selectors for {ingestion_plan}")
+
  if provider is not None:
  if ingestion_plan.source.provider != provider:
  logger.info(
@@ -137,7 +139,7 @@ class Loader:
  IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
  """
  for ingestion_plan, selector in selectors.values():
- logger.debug(
+ logger.info(
  f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
  )

@@ -148,18 +150,16 @@ class Loader:
  )

  with TaskExecutor(dry_run=dry_run) as task_executor:
- ingestion_job_summary = ingestion_job.execute(
+ for ingestion_job_summary in ingestion_job.execute(
  self.store, task_executor=task_executor
- )
-
- # TODO: handle task_summaries
- # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
- # next run to determine where to resume.
- # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
- # extra information to determine how/where to resume
- ingestion_job_summary.set_finished()
-
- ingestion_job_summary.output_report()
- self.store.save_ingestion_job_summary(ingestion_job_summary)
+ ):
+ # TODO: handle task_summaries
+ # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
+ # next run to determine where to resume.
+ # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
+ # extra information to determine how/where to resume
+ ingestion_job_summary.output_report()
+ logger.info(f"Storing IngestionJobSummary")
+ self.store.save_ingestion_job_summary(ingestion_job_summary)

  logger.info("Done")
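
IngestionJob.execute() is now a generator (see the ingestion_job.py hunks further down), so the Loader iterates it and reports and persists each yielded IngestionJobSummary as soon as its chunk of tasks finishes. The consuming side, sketched with assumed ingestion_job and store objects:

with TaskExecutor(dry_run=False) as task_executor:
    for ingestion_job_summary in ingestion_job.execute(store, task_executor=task_executor):
        ingestion_job_summary.output_report()
        store.save_ingestion_job_summary(ingestion_job_summary)
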
ingestify/domain/models/dataset/file_repository.py CHANGED
@@ -3,11 +3,34 @@ from pathlib import Path
  from typing import BinaryIO

  from .dataset import Dataset
+ from ...services.identifier_key_transformer import IdentifierTransformer


  class FileRepository(ABC):
- def __init__(self, url: str):
+ def __init__(self, url: str, identifier_transformer: IdentifierTransformer):
  self.base_dir = Path(url.split("://")[1])
+ self.identifier_transformer = identifier_transformer
+
+ def get_write_path(
+ self, bucket: str, dataset: Dataset, revision_id: int, filename: str
+ ) -> Path:
+ # TODO: use the IdentifierKeyTransformer
+ identifier_path = self.identifier_transformer.to_path(
+ provider=dataset.provider,
+ dataset_type=dataset.dataset_type,
+ identifier=dataset.identifier,
+ )
+
+ path = (
+ self.base_dir
+ / bucket
+ / f"provider={dataset.provider}"
+ / f"dataset_type={dataset.dataset_type}"
+ / identifier_path
+ / str(revision_id)
+ / filename
+ )
+ return path

  @abstractmethod
  def save_content(
@@ -20,10 +43,11 @@ class FileRepository(ABC):
  ) -> Path:
  pass

+ def get_read_path(self, storage_path: str) -> Path:
+ return self.base_dir / storage_path
+
  @abstractmethod
- def load_content(
- self, bucket: str, dataset: Dataset, revision_id: int, filename: str
- ) -> BinaryIO:
+ def load_content(self, storage_path: str) -> BinaryIO:
  pass

  @classmethod
@@ -31,20 +55,6 @@ class FileRepository(ABC):
  def supports(cls, url: str) -> bool:
  pass

- def get_path(
- self, bucket: str, dataset: Dataset, revision_id: int, filename: str
- ) -> Path:
- path = (
- self.base_dir
- / bucket
- / f"provider={dataset.provider}"
- / f"dataset_type={dataset.dataset_type}"
- / str(dataset.identifier)
- / str(revision_id)
- / filename
- )
- return path
-
  def get_relative_path(self, path: Path) -> Path:
  """Return the relative path to the base of the repository"""
  return path.relative_to(self.base_dir)
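
Writes now build their path through the injected IdentifierTransformer (get_write_path), while reads simply resolve a previously stored storage_path against base_dir (get_read_path). A runnable sketch of the resulting write layout; the base_dir, bucket, identifier and filename values are hypothetical:

from pathlib import Path
from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer

transformer = IdentifierTransformer()  # nothing registered, so every key uses the identity transformation
identifier_path = transformer.to_path(
    provider="statsbomb", dataset_type="match", identifier={"match_id": 3788741}
)

path = (
    Path("/data")              # base_dir, as parsed from a "file:///data" url
    / "main"                   # bucket
    / "provider=statsbomb"
    / "dataset_type=match"
    / identifier_path          # "match_id=3788741"
    / "1"                      # revision_id
    / "match.json"             # filename
)
print(path)  # /data/main/provider=statsbomb/dataset_type=match/match_id=3788741/1/match.json
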
ingestify/domain/models/ingestion/ingestion_job.py CHANGED
@@ -2,7 +2,7 @@ import itertools
  import json
  import logging
  import uuid
- from typing import Optional
+ from typing import Optional, Iterator

  from ingestify import retrieve_http
  from ingestify.application.dataset_store import DatasetStore
@@ -22,7 +22,7 @@ from ingestify.utils import TaskExecutor, chunker
  logger = logging.getLogger(__name__)


- DEFAULT_CHUNK_SIZE = 1000
+ DEFAULT_CHUNK_SIZE = 1_000


  def run_task(task):
@@ -32,14 +32,14 @@ def run_task(task):

  def to_batches(input_):
  if isinstance(input_, list):
- batches = [input_]
+ batches = iter(input_)
  else:
  # Assume it's an iterator. Peek what's inside, and put it back
  try:
  peek = next(input_)
  except StopIteration:
  # Nothing to batch
- return []
+ return iter([])

  input_ = itertools.chain([peek], input_)

@@ -184,6 +184,9 @@ class CreateDatasetTask(Task):
  return f"CreateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"


+ MAX_TASKS_PER_CHUNK = 10_000
+
+
  class IngestionJob:
  def __init__(
  self,
@@ -197,96 +200,126 @@ class IngestionJob:

  def execute(
  self, store: DatasetStore, task_executor: TaskExecutor
- ) -> IngestionJobSummary:
- with IngestionJobSummary.new(ingestion_job=self) as ingestion_job_summary:
- with ingestion_job_summary.record_timing("get_dataset_collection"):
- dataset_collection_metadata = store.get_dataset_collection(
- dataset_type=self.ingestion_plan.dataset_type,
- data_spec_versions=self.selector.data_spec_versions,
- selector=self.selector,
- metadata_only=True,
- ).metadata
-
- # There are two different, but similar flows here:
- # 1. The discover_datasets returns a list, and the entire list can be processed at once
- # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
- with ingestion_job_summary.record_timing("find_datasets"):
- # Timing might be incorrect as it is an iterator
- datasets = self.ingestion_plan.source.find_datasets(
- dataset_type=self.ingestion_plan.dataset_type,
- data_spec_versions=self.selector.data_spec_versions,
- dataset_collection_metadata=dataset_collection_metadata,
- **self.selector.custom_attributes,
- )
+ ) -> Iterator[IngestionJobSummary]:
+ is_first_chunk = True
+ ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
+ # Process all items in batches. Yield a IngestionJobSummary per batch
+
+ logger.info("Finding metadata")
+ with ingestion_job_summary.record_timing("get_dataset_collection"):
+ dataset_collection_metadata = store.get_dataset_collection(
+ dataset_type=self.ingestion_plan.dataset_type,
+ data_spec_versions=self.selector.data_spec_versions,
+ selector=self.selector,
+ metadata_only=True,
+ ).metadata
+ logger.info(f"Done: {dataset_collection_metadata}")
+
+ # There are two different, but similar flows here:
+ # 1. The discover_datasets returns a list, and the entire list can be processed at once
+ # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
+ with ingestion_job_summary.record_timing("find_datasets"):
+ # Timing might be incorrect as it is an iterator
+ dataset_resources = self.ingestion_plan.source.find_datasets(
+ dataset_type=self.ingestion_plan.dataset_type,
+ data_spec_versions=self.selector.data_spec_versions,
+ dataset_collection_metadata=dataset_collection_metadata,
+ **self.selector.custom_attributes,
+ )

- batches = to_batches(datasets)
+ finish_task_timer = ingestion_job_summary.start_timing("tasks")

- with ingestion_job_summary.record_timing("tasks"):
- for batch in batches:
- dataset_identifiers = [
- Identifier.create_from_selector(
- self.selector, **dataset_resource.dataset_resource_id
- )
- # We have to pass the data_spec_versions here as a Source can add some
- # extra data to the identifier which is retrieved in a certain data format
- for dataset_resource in batch
- ]
-
- # Load all available datasets based on the discovered dataset identifiers
- dataset_collection = store.get_dataset_collection(
- dataset_type=self.ingestion_plan.dataset_type,
- # Assume all DatasetResources share the same provider
- provider=batch[0].provider,
- selector=dataset_identifiers,
- )
+ batches = to_batches(dataset_resources)

- skip_count = 0
+ while True:
+ try:
+ batch = next(batches)
+ except StopIteration:
+ break
+ except Exception:
+ # TODO: handle exception on IngestionJob level
+ raise

- task_set = TaskSet()
- for dataset_resource in batch:
- dataset_identifier = Identifier.create_from_selector(
- self.selector, **dataset_resource.dataset_resource_id
- )
+ dataset_identifiers = [
+ Identifier.create_from_selector(
+ self.selector, **dataset_resource.dataset_resource_id
+ )
+ # We have to pass the data_spec_versions here as a Source can add some
+ # extra data to the identifier which is retrieved in a certain data format
+ for dataset_resource in batch
+ ]
+
+ # Load all available datasets based on the discovered dataset identifiers
+ dataset_collection = store.get_dataset_collection(
+ dataset_type=self.ingestion_plan.dataset_type,
+ # Assume all DatasetResources share the same provider
+ provider=batch[0].provider,
+ selector=dataset_identifiers,
+ )

- if dataset := dataset_collection.get(dataset_identifier):
- if self.ingestion_plan.fetch_policy.should_refetch(
- dataset, dataset_resource
- ):
- task_set.add(
- UpdateDatasetTask(
- dataset=dataset, # Current dataset from the database
- dataset_resource=dataset_resource, # Most recent dataset_resource
- store=store,
- )
- )
- else:
- skip_count += 1
- else:
- if self.ingestion_plan.fetch_policy.should_fetch(
- dataset_resource
- ):
- task_set.add(
- CreateDatasetTask(
- dataset_resource=dataset_resource,
- store=store,
- )
- )
- else:
- skip_count += 1
-
- if task_set:
- logger.info(
- f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
- f"using selector {self.selector} => {len(task_set)} tasks. {skip_count} skipped."
- )
- logger.info(f"Running {len(task_set)} tasks")
- ingestion_job_summary.add_task_summaries(
- task_executor.run(run_task, task_set)
+ skipped_datasets = 0
+
+ task_set = TaskSet()
+ for dataset_resource in batch:
+ dataset_identifier = Identifier.create_from_selector(
+ self.selector, **dataset_resource.dataset_resource_id
+ )
+
+ if dataset := dataset_collection.get(dataset_identifier):
+ if self.ingestion_plan.fetch_policy.should_refetch(
+ dataset, dataset_resource
+ ):
+ task_set.add(
+ UpdateDatasetTask(
+ dataset=dataset, # Current dataset from the database
+ dataset_resource=dataset_resource, # Most recent dataset_resource
+ store=store,
+ )
  )
  else:
- logger.info(
- f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
- f"using selector {self.selector} => nothing to do"
+ skipped_datasets += 1
+ else:
+ if self.ingestion_plan.fetch_policy.should_fetch(dataset_resource):
+ task_set.add(
+ CreateDatasetTask(
+ dataset_resource=dataset_resource,
+ store=store,
+ )
  )
+ else:
+ skipped_datasets += 1
+
+ if task_set:
+ logger.info(
+ f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+ f"using selector {self.selector} => {len(task_set)} tasks. {skipped_datasets} skipped."
+ )
+ logger.info(f"Running {len(task_set)} tasks")
+ ingestion_job_summary.add_task_summaries(
+ task_executor.run(run_task, task_set)
+ )
+ else:
+ logger.info(
+ f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+ f"using selector {self.selector} => nothing to do"
+ )
+
+ ingestion_job_summary.increase_skipped_datasets(skipped_datasets)
+
+ if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
+ finish_task_timer()
+ ingestion_job_summary.set_finished()
+ yield ingestion_job_summary
+
+ # Start a new one
+ is_first_chunk = False
+ ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
+
+ # We will resume tasks, start timer right away
+ finish_task_timer = ingestion_job_summary.start_timing("tasks")

- return ingestion_job_summary
+ if ingestion_job_summary.task_count() > 0 or is_first_chunk:
+ # When there is interesting information to store, or there was no data at all, store it
+ finish_task_timer()
+ ingestion_job_summary.set_finished()
+ yield ingestion_job_summary
ingestify/domain/models/ingestion/ingestion_job_summary.py CHANGED
@@ -1,3 +1,4 @@
+ import uuid
  from contextlib import contextmanager
  from datetime import datetime, timedelta
  from typing import Optional, List, TYPE_CHECKING
@@ -14,13 +15,15 @@ if TYPE_CHECKING:


  def format_duration(duration: timedelta):
- return f"{duration.total_seconds():.2}sec"
+ return f"{duration.total_seconds():.2f}sec"


  class IngestionJobSummary(BaseModel):
+ ingestion_job_summary_id: str
  ingestion_job_id: str

  # From the IngestionPlan
+ provider: str
  source_name: str
  dataset_type: str
  data_spec_versions: DataSpecVersionCollection
@@ -31,6 +34,7 @@ class IngestionJobSummary(BaseModel):
  timings: List[Timing] = Field(default_factory=list)
  task_summaries: List[TaskSummary] = Field(default_factory=list)

+ skipped_datasets: int = 0
  failed_tasks: int = 0
  successful_tasks: int = 0
  ignored_successful_tasks: int = 0
@@ -38,7 +42,9 @@ class IngestionJobSummary(BaseModel):
  @classmethod
  def new(cls, ingestion_job: "IngestionJob"):
  args = dict(
+ ingestion_job_summary_id=str(uuid.uuid1()),
  ingestion_job_id=ingestion_job.ingestion_job_id,
+ provider=ingestion_job.ingestion_plan.source.provider,
  source_name=ingestion_job.ingestion_plan.source.name,
  dataset_type=ingestion_job.ingestion_plan.dataset_type,
  data_spec_versions=ingestion_job.ingestion_plan.data_spec_versions,
@@ -52,9 +58,23 @@ class IngestionJobSummary(BaseModel):
  yield
  self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

+ def start_timing(self, name):
+ start = utcnow()
+
+ def finish():
+ self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+ return finish
+
  def add_task_summaries(self, task_summaries: List[TaskSummary]):
  self.task_summaries.extend(task_summaries)

+ def increase_skipped_datasets(self, skipped_datasets: int):
+ self.skipped_datasets += skipped_datasets
+
+ def task_count(self):
+ return len(self.task_summaries)
+
  def set_finished(self):
  self.failed_tasks = len(
  [task for task in self.task_summaries if task.status == TaskStatus.FAILED]
@@ -80,6 +100,7 @@ class IngestionJobSummary(BaseModel):
  print("--------------------")
  print(f" - IngestionPlan:")
  print(f" Source: {self.source_name}")
+ print(f" Provider: {self.provider}")
  print(f" DatasetType: {self.dataset_type}")
  print(f" - Selector: {self.selector}")
  print(f" - Timings: ")
@@ -89,14 +110,10 @@ class IngestionJobSummary(BaseModel):
  f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
  )

- for status in [
- TaskStatus.FAILED,
- TaskStatus.FINISHED,
- TaskStatus.FINISHED_IGNORED,
- ]:
- print(
- f" - {status.value.lower()}: {len([task for task in self.task_summaries if task.status == status])}"
- )
+ print(f" - Failed tasks: {self.failed_tasks}")
+ print(f" - Successful tasks: {self.successful_tasks}")
+ print(f" - Successful ignored tasks: {self.successful_tasks}")
+ print(f" - Skipped datasets: {self.skipped_datasets}")
  print("--------------------")

  def __enter__(self):
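
start_timing() complements the existing record_timing() context manager for regions that cannot be wrapped in a single with block, such as the chunked task loop in IngestionJob.execute(). A minimal sketch, assuming summary is an existing IngestionJobSummary and the work functions are placeholders:

with summary.record_timing("find_datasets"):  # context-manager style
    discover_datasets()                       # placeholder work

finish = summary.start_timing("tasks")        # callback style
run_tasks()                                   # placeholder work; may span several yields
finish()                                      # appends the "tasks" Timing to summary.timings
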
ingestify/domain/services/identifier_key_transformer.py ADDED
@@ -0,0 +1,111 @@
+ from abc import ABC, abstractmethod
+ from enum import Enum
+ from typing import Callable, Optional, Union
+
+ from ingestify.exceptions import IngestifyError
+
+
+ class TransformationType(Enum):
+ IDENTITY = "IDENTITY"
+ BUCKET = "BUCKET"
+ RANGE = "RANGE"
+ CUSTOM = "CUSTOM"
+
+
+ class Transformation(ABC):
+ @property
+ @abstractmethod
+ def transformation_type(self) -> TransformationType:
+ pass
+
+ def is_identity(self) -> bool:
+ return self.transformation_type == TransformationType.IDENTITY
+
+ @abstractmethod
+ def __call__(self, id_key_value: Union[str, int]) -> str:
+ pass
+
+ @classmethod
+ def from_dict(cls, config: dict) -> "Transformation":
+ type_ = config.pop("type")
+ if type_ == "bucket":
+ return BucketTransformation(**config)
+ else:
+ raise IngestifyError(f"Cannot build Transformation from {config}")
+
+
+ class IdentityTransformation(Transformation):
+ transformation_type = TransformationType.IDENTITY
+
+ def __call__(self, id_key_value: Union[str, int]) -> str:
+ # Return the original value as a string
+ return str(id_key_value)
+
+
+ class BucketTransformation(Transformation):
+ transformation_type = TransformationType.BUCKET
+
+ def __init__(self, bucket_size: int = None, bucket_count: int = None):
+ self.bucket_size = bucket_size
+ self.bucket_count = bucket_count
+
+ def __call__(self, id_key_value: Union[str, int]) -> str:
+ if self.bucket_count:
+ return str(int(id_key_value) % self.bucket_count)
+ elif self.bucket_size:
+ bucket_start = int(id_key_value) // self.bucket_size * self.bucket_size
+ bucket_end = bucket_start + self.bucket_size - 1
+ return f"{bucket_start}-{bucket_end}"
+ else:
+ raise IngestifyError("Invalid BucketTransformation")
+
+
+ class IdentifierTransformer:
+ def __init__(self):
+ # Mapping of (provider, dataset_type, id_key) to the transformation
+ self.key_transformations: dict[tuple[str, str, str], Transformation] = {}
+
+ def register_transformation(
+ self,
+ provider: str,
+ dataset_type: str,
+ id_key: str,
+ transformation: Union[Transformation, dict],
+ ):
+ """
+ Registers a transformation for a specific (provider, dataset_type, id_key).
+ """
+ if isinstance(transformation, dict):
+ transformation = Transformation.from_dict(transformation)
+
+ self.key_transformations[(provider, dataset_type, id_key)] = transformation
+
+ def get_transformation(
+ self, provider: str, dataset_type: str, id_key: str
+ ) -> Transformation:
+ """
+ Retrieves the transformation for the given column or defaults to identity.
+ """
+ transformation = self.key_transformations.get((provider, dataset_type, id_key))
+ return transformation if transformation else IdentityTransformation()
+
+ def to_path(self, provider: str, dataset_type: str, identifier: dict) -> str:
+ """
+ Transforms the identifier into a path string using registered transformations.
+ For non-identity transformations, includes both transformed and original values,
+ with the transformed value appearing first and including the suffix.
+ """
+ path_parts = []
+ for key, value in identifier.items():
+ transformation = self.get_transformation(provider, dataset_type, key)
+ if not transformation.is_identity():
+ # Non-identity transformation: include both transformed and original
+ transformed_value = transformation(value)
+ suffix = transformation.transformation_type.value.lower()
+ path_parts.append(f"{key}_{suffix}={transformed_value}")
+
+ # Append the original value (either standalone for identity or alongside transformed)
+ path_parts.append(f"{key}={value}")
+
+ # Join the parts with `/` to form the full path
+ return "/".join(path_parts)
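
A runnable sketch of how the new module is wired together: register a bucket transformation for one identifier key and render an identifier to a storage path. The provider, dataset_type and bucket size below are hypothetical.

from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer

transformer = IdentifierTransformer()
transformer.register_transformation(
    provider="statsbomb",
    dataset_type="match",
    id_key="match_id",
    transformation={"type": "bucket", "bucket_size": 1000},  # dict form goes through Transformation.from_dict
)

print(transformer.to_path("statsbomb", "match", {"match_id": 3788741, "season_id": 281}))
# match_id_bucket=3788000-3788999/match_id=3788741/season_id=281
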
ingestify/infra/store/dataset/sqlalchemy/mapping.py CHANGED
@@ -229,9 +229,11 @@ mapper_registry.map_imperatively(File, file_table)
  ingestion_job_summary = Table(
  "ingestion_job_summary",
  metadata,
- Column("ingestion_job_id", String(255), primary_key=True),
+ Column("ingestion_job_summary_id", String(255), primary_key=True),
+ Column("ingestion_job_id", String(255), index=True),
  # From the IngestionPlan
  Column("source_name", String(255)),
+ Column("provider", String(255)),
  Column("dataset_type", String(255)),
  Column(
  "data_spec_versions",
@@ -250,6 +252,7 @@ ingestion_job_summary = Table(
  # Some task counters
  Column("successful_tasks", Integer),
  Column("ignored_successful_tasks", Integer),
+ Column("skipped_datasets", Integer),
  Column("failed_tasks", Integer),
  Column(
  "timings",
@@ -281,9 +284,9 @@ task_summary_table = Table(
  "task_summary",
  metadata,
  Column(
- "ingestion_job_id",
+ "ingestion_job_summary_id",
  String(255),
- ForeignKey("ingestion_job_summary.ingestion_job_id"),
+ ForeignKey("ingestion_job_summary.ingestion_job_summary_id"),
  primary_key=True,
  ),
  Column("task_id", Integer, primary_key=True),
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED
@@ -209,9 +209,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
  )

  if not metadata_only:
- dataset_query = apply_query_filter(
- self.session.query(Dataset) # .options(joinedload(Dataset.revisions))
- )
+ dataset_query = apply_query_filter(self.session.query(Dataset))
  datasets = list(dataset_query)
  else:
  datasets = []
ingestify/infra/store/file/local_file_repository.py CHANGED
@@ -19,14 +19,12 @@ class LocalFileRepository(FileRepository):
  filename: str,
  stream: BinaryIO,
  ) -> Path:
- path = self.get_path(bucket, dataset, revision_id, filename)
+ path = self.get_write_path(bucket, dataset, revision_id, filename)
  path.parent.mkdir(parents=True, exist_ok=True)

  with open(path, "wb") as fp:
  shutil.copyfileobj(stream, fp)
  return path

- def load_content(
- self, bucket: str, dataset: Dataset, revision_id: int, filename: str
- ) -> BinaryIO:
- return open(self.get_path(bucket, dataset, revision_id, filename), "rb")
+ def load_content(self, storage_path: str) -> BinaryIO:
+ return open(self.get_read_path(storage_path), "rb")
ingestify/infra/store/file/s3_file_repository.py CHANGED
@@ -8,10 +8,7 @@ from ingestify.domain.models import FileRepository


  class S3FileRepository(FileRepository):
- def __init__(self, url):
- super().__init__(url)
-
- self._s3 = None
+ _s3 = None

  @property
  def s3(self):
@@ -30,16 +27,14 @@ class S3FileRepository(FileRepository):
  filename: str,
  stream: BinaryIO,
  ) -> Path:
- key = self.get_path(bucket, dataset, revision_id, filename)
+ key = self.get_write_path(bucket, dataset, revision_id, filename)
  s3_bucket = Path(key.parts[0])

  self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).put(Body=stream)
  return key

- def load_content(
- self, bucket: str, dataset: Dataset, revision_id: int, filename: str
- ) -> BinaryIO:
- key = self.get_path(bucket, dataset, revision_id, filename)
+ def load_content(self, storage_path: str) -> BinaryIO:
+ key = self.get_read_path(storage_path)
  s3_bucket = Path(key.parts[0])
  return self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).get()[
  "Body"
ingestify/main.py CHANGED
@@ -19,6 +19,7 @@ from ingestify.domain.models.event import EventBus, Publisher, Subscriber

  from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
  from ingestify.domain.models.fetch_policy import FetchPolicy
+ from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
  from ingestify.exceptions import ConfigurationError
  from ingestify.infra import S3FileRepository, LocalFileRepository
  from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
@@ -60,11 +61,15 @@ def import_cls(name):
  return getattr(mod, components[-1])


- def build_file_repository(file_url: str) -> FileRepository:
+ def build_file_repository(file_url: str, identifier_transformer) -> FileRepository:
  if file_url.startswith("s3://"):
- repository = S3FileRepository(url=file_url)
+ repository = S3FileRepository(
+ url=file_url, identifier_transformer=identifier_transformer
+ )
  elif file_url.startswith("file://"):
- repository = LocalFileRepository(url=file_url)
+ repository = LocalFileRepository(
+ url=file_url, identifier_transformer=identifier_transformer
+ )
  else:
  raise Exception(f"Cannot find repository to handle file {file_url}")

@@ -72,7 +77,7 @@ def build_file_repository(file_url: str) -> FileRepository:


  def get_dataset_store_by_urls(
- metadata_url: str, file_url: str, bucket: str
+ metadata_url: str, file_url: str, bucket: str, dataset_types
  ) -> DatasetStore:
  """
  Initialize a DatasetStore by a DatasetRepository and a FileRepository
@@ -80,7 +85,19 @@ def get_dataset_store_by_urls(
  if not bucket:
  raise Exception("Bucket is not specified")

- file_repository = build_file_repository(file_url)
+ identifier_transformer = IdentifierTransformer()
+ for dataset_type in dataset_types:
+ for id_key, id_config in dataset_type["identifier_keys"].items():
+ identifier_transformer.register_transformation(
+ provider=dataset_type["provider"],
+ dataset_type=dataset_type["dataset_type"],
+ id_key=id_key,
+ transformation=id_config["transformation"],
+ )
+
+ file_repository = build_file_repository(
+ file_url, identifier_transformer=identifier_transformer
+ )

  if secrets_manager.supports(metadata_url):
  metadata_url = secrets_manager.load_as_db_url(metadata_url)
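
The dataset_types value consumed above comes from the configuration (config.get("dataset_types", []) in get_datastore and get_engine below). A hedged sketch of the structure the registration loop expects, written as the equivalent Python value; the concrete provider, dataset_type and bucket size are hypothetical:

dataset_types = [
    {
        "provider": "statsbomb",
        "dataset_type": "match",
        "identifier_keys": {
            # one entry per identifier key that needs a non-identity transformation
            "match_id": {"transformation": {"type": "bucket", "bucket_size": 1000}},
        },
    }
]
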
@@ -103,14 +120,15 @@ def get_datastore(config_file, bucket: Optional[str] = None) -> DatasetStore:
  config = parse_config(config_file, default_value="")

  return get_dataset_store_by_urls(
- dataset_url=config["main"]["dataset_url"],
+ metadata_url=config["main"]["metadata_url"],
  file_url=config["main"]["file_url"],
  bucket=bucket or config["main"].get("default_bucket"),
+ dataset_types=config.get("dataset_types", []),
  )


  def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:
- return get_dataset_store_by_urls(dataset_url=url, file_url=url, bucket=bucket)
+ return get_dataset_store_by_urls(metadata_url=url, file_url=url, bucket=bucket)


  def get_source_cls(key: str) -> Type[Source]:
@@ -173,6 +191,7 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
  metadata_url=config["main"]["metadata_url"],
  file_url=config["main"]["file_url"],
  bucket=bucket or config["main"].get("default_bucket"),
+ dataset_types=config.get("dataset_types", []),
  )

  # Setup an EventBus and wire some more components
@@ -188,7 +207,7 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
  store=store,
  )

- logger.info("Determining tasks...")
+ logger.info("Adding IngestionPlans...")

  fetch_policy = FetchPolicy()

ingestify-0.2.0.dist-info/METADATA → ingestify-0.3.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: ingestify
- Version: 0.2.0
+ Version: 0.3.0
  Summary: Standardizing soccer tracking- and event data
  Author: Koen Vossen
  Author-email: info@koenvossen.nl
ingestify-0.2.0.dist-info/RECORD → ingestify-0.3.0.dist-info/RECORD RENAMED
@@ -1,14 +1,14 @@
- ingestify/__init__.py,sha256=rzYt6rUUedAUB4VDxDENn6bzWpACW34yfbQKVjTzgQg,301
+ ingestify/__init__.py,sha256=DnPPEtJT32gAPuUKXgIsqUE4fIvc6QA96vrcKr6nz6A,301
  ingestify/cmdline.py,sha256=bIuyPgGEw4wIglNzpG9zp7TsJozsP8NSVsCe4eAyWUg,7189
  ingestify/exceptions.py,sha256=wMMuajl4AkQRfW60TLN7btJmQaH8-lUczXyW_2g9kOU,143
- ingestify/main.py,sha256=Lo8bCwOz3AOeO1pSTYhd7VjSZ8tcc9eSz0GLlwyy6DI,7632
+ ingestify/main.py,sha256=0sTNoLcS7euOavIAviQIMTolRnXsvOvNbmFdXgXgxhE,8516
  ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
  ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
  ingestify/utils.py,sha256=HETGhAoUlutLG0cQR63nac2JbFei9gnktDHeBQoYWfU,5692
  ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ingestify/application/dataset_store.py,sha256=LccTpvsMWCIV0ewzS5sIXKk2kaQcZhnXGFT8Eao3U3Q,12074
+ ingestify/application/dataset_store.py,sha256=6xMHa_ShyPOyegIKl2xwmRl3BlV5i21z95cpKW3oARw,11712
  ingestify/application/ingestion_engine.py,sha256=PtMjKMpvfqB802G5zfKLzyamdH7qFOXl3x6_97y8w60,2288
- ingestify/application/loader.py,sha256=nqLKtwu48mJVumB9BtgTv79soCOtW9pzg-pvTvc66bc,7031
+ ingestify/application/loader.py,sha256=v8ZcpMDEml9k_uFPFqT4WaCjXED_OIpAr7g0Pz5Hp6Y,7153
  ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
  ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
  ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
@@ -27,7 +27,7 @@ ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4
  ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
  ingestify/domain/models/dataset/file.py,sha256=nuoZI9GI5OysYwWCCyNsHMlm1Z9A1GbEKd38jvBzJ4E,4119
  ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
- ingestify/domain/models/dataset/file_repository.py,sha256=ntzLiWZleZQFmrVsFvDSwfbOT86WtAXLbqgA8HlV56Q,1248
+ ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
  ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
  ingestify/domain/models/dataset/revision.py,sha256=O_1HG2S2EmYdWqI2K282S_D-d6IhRh_f4Q3wV8MEhkk,1311
  ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
@@ -39,8 +39,8 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
  ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
  ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
  ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- ingestify/domain/models/ingestion/ingestion_job.py,sha256=GnBQVnTU3FdKdSElXEISUrQz-orGIHchnNAo20Qg0DY,11511
- ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=YygBv0GgU396HRe-exQqW2QmitBEnAh2VG_xkW3wdyQ,3645
+ ingestify/domain/models/ingestion/ingestion_job.py,sha256=U6B62c7NGeHBAjmKhgOa4uHeul34xyR66WtWaPSRNTU,12276
+ ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=1l9O3QJkYLs74HhrwAijwNEriPMwHN9OFG64Iz4z3uI,4262
  ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
  ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
  ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
@@ -49,6 +49,7 @@ ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIp
  ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
  ingestify/domain/models/task/task_summary.py,sha256=ovzqKPstngRVzVA_JboQMluq5uQjKVJDsWNNcfcadhU,3774
  ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ ingestify/domain/services/identifier_key_transformer.py,sha256=y4GS9u9Ej1MO2jUhAxWbifp0mrE_MqTHvVVcoQzSKb4,4034
  ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-_hWZJTMcduS1Gg7EM4X95Cqxi1QIM,809
  ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
@@ -63,11 +64,11 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
  ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
  ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
- ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=-iTkC4_YGkkFrIsEZVTW2eoaofj4c7QZFaq7tl1r2G4,9288
- ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=O2g7g_clNz43g9gXjjBJZsIGvRTntQ6rJpQeDT8yQ7c,7141
+ ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=UlEIfNusSOEWOxPi_ORrdLSylbi6-TO1qwEmcrBLwog,9447
+ ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=-eSR_F9tS9Hd3JNEpoJoDAb5RY38rFaKLMI3eBedjx8,7068
  ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
- ingestify/infra/store/file/local_file_repository.py,sha256=0oIzjjKO5U_7gPXhsBJFUqQBarQTFQS499ZK7HNxMxo,893
- ingestify/infra/store/file/s3_file_repository.py,sha256=txDviBrY9EHn3soqLFvTrjSPkyh548RxUgx4T83j0QY,1331
+ ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
+ ingestify/infra/store/file/s3_file_repository.py,sha256=_sekV1rfEbwIaSGhKRnFQlj92E9qNgONiwXt6ZLCyGg,1188
  ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
  ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
@@ -78,8 +79,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
  ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
  ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
  ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
- ingestify-0.2.0.dist-info/METADATA,sha256=8974JGisSq9_Q-4M1cFYY_AU5zBW7n_UZ8NKjj_ZBDM,18853
- ingestify-0.2.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- ingestify-0.2.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
- ingestify-0.2.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
- ingestify-0.2.0.dist-info/RECORD,,
+ ingestify-0.3.0.dist-info/METADATA,sha256=-QlChdV6OYWkqSyXUmkQTG4deBliRsSmmZMTWKeURnI,18853
+ ingestify-0.3.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ ingestify-0.3.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ ingestify-0.3.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ ingestify-0.3.0.dist-info/RECORD,,