ingestify 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +3 -12
- ingestify/application/loader.py +13 -13
- ingestify/domain/models/dataset/file_repository.py +28 -18
- ingestify/domain/models/ingestion/ingestion_job.py +121 -88
- ingestify/domain/models/ingestion/ingestion_job_summary.py +26 -9
- ingestify/domain/services/identifier_key_transformer.py +111 -0
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +6 -3
- ingestify/infra/store/dataset/sqlalchemy/repository.py +1 -3
- ingestify/infra/store/file/local_file_repository.py +3 -5
- ingestify/infra/store/file/s3_file_repository.py +4 -9
- ingestify/main.py +27 -8
- {ingestify-0.2.0.dist-info → ingestify-0.3.0.dist-info}/METADATA +1 -1
- {ingestify-0.2.0.dist-info → ingestify-0.3.0.dist-info}/RECORD +17 -16
- {ingestify-0.2.0.dist-info → ingestify-0.3.0.dist-info}/WHEEL +0 -0
- {ingestify-0.2.0.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.2.0.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0
ingestify/__init__.py
CHANGED

ingestify/application/dataset_store.py
CHANGED

@@ -66,6 +66,7 @@ class DatasetStore:
         dataset_type: Optional[str] = None,
         provider: Optional[str] = None,
         dataset_id: Optional[str] = None,
+        metadata_only: Optional[bool] = False,
         **selector,
     ) -> DatasetCollection:
         if "selector" in selector:
@@ -86,6 +87,7 @@ class DatasetStore:
             dataset_type=dataset_type,
             dataset_id=dataset_id,
             provider=provider,
+            metadata_only=metadata_only,
             selector=selector,
         )
         return dataset_collection
@@ -291,20 +293,9 @@ class DatasetStore:
                 continue

         def get_stream(file_):
-            revision_id = file_.revision_id
-            if revision_id is None:
-                revision_id = current_revision.revision_id
-
             return reader(
                 self.file_repository.load_content(
-                    bucket=self.bucket,
-                    dataset=dataset,
-                    # When file.revision_id is set we must use it.
-                    revision_id=revision_id,
-                    filename=file_.file_id
-                    + "."
-                    + file_.data_serialization_format
-                    + suffix,
+                    bucket=self.bucket, storage_path=file_.storage_path
                 )
             )

ingestify/application/loader.py
CHANGED

@@ -33,6 +33,8 @@ class Loader:
         # First collect all selectors, before discovering datasets
         selectors = {}
         for ingestion_plan in self.ingestion_plans:
+            logger.info(f"Determining selectors for {ingestion_plan}")
+
             if provider is not None:
                 if ingestion_plan.source.provider != provider:
                     logger.info(
@@ -137,7 +139,7 @@ class Loader:
         IngestionJobSummary holds the summary for an IngestionPlan and a single Selector
         """
         for ingestion_plan, selector in selectors.values():
-            logger.
+            logger.info(
                 f"Discovering datasets from {ingestion_plan.source.__class__.__name__} using selector {selector}"
            )

@@ -148,18 +150,16 @@ class Loader:
            )

            with TaskExecutor(dry_run=dry_run) as task_executor:
-                ingestion_job_summary
+                for ingestion_job_summary in ingestion_job.execute(
                    self.store, task_executor=task_executor
-                )
-
-
-
-
-
-
-
-
-                ingestion_job_summary.output_report()
-                self.store.save_ingestion_job_summary(ingestion_job_summary)
+                ):
+                    # TODO: handle task_summaries
+                    # Summarize to a IngestionJobSummary, and save to a database. This Summary can later be used in a
+                    # next run to determine where to resume.
+                    # TODO 2: Do we want to add additional information from the summary back to the Task, so it can use
+                    # extra information to determine how/where to resume
+                    ingestion_job_summary.output_report()
+                    logger.info(f"Storing IngestionJobSummary")
+                    self.store.save_ingestion_job_summary(ingestion_job_summary)

        logger.info("Done")
ingestify/domain/models/dataset/file_repository.py
CHANGED

@@ -3,11 +3,34 @@ from pathlib import Path
 from typing import BinaryIO

 from .dataset import Dataset
+from ...services.identifier_key_transformer import IdentifierTransformer


 class FileRepository(ABC):
-    def __init__(self, url: str):
+    def __init__(self, url: str, identifier_transformer: IdentifierTransformer):
         self.base_dir = Path(url.split("://")[1])
+        self.identifier_transformer = identifier_transformer
+
+    def get_write_path(
+        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
+    ) -> Path:
+        # TODO: use the IdentifierKeyTransformer
+        identifier_path = self.identifier_transformer.to_path(
+            provider=dataset.provider,
+            dataset_type=dataset.dataset_type,
+            identifier=dataset.identifier,
+        )
+
+        path = (
+            self.base_dir
+            / bucket
+            / f"provider={dataset.provider}"
+            / f"dataset_type={dataset.dataset_type}"
+            / identifier_path
+            / str(revision_id)
+            / filename
+        )
+        return path

     @abstractmethod
     def save_content(
@@ -20,10 +43,11 @@ class FileRepository(ABC):
     ) -> Path:
         pass

+    def get_read_path(self, storage_path: str) -> Path:
+        return self.base_dir / storage_path
+
     @abstractmethod
-    def load_content(
-        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
-    ) -> BinaryIO:
+    def load_content(self, storage_path: str) -> BinaryIO:
         pass

     @classmethod
@@ -31,20 +55,6 @@ class FileRepository(ABC):
     def supports(cls, url: str) -> bool:
         pass

-    def get_path(
-        self, bucket: str, dataset: Dataset, revision_id: int, filename: str
-    ) -> Path:
-        path = (
-            self.base_dir
-            / bucket
-            / f"provider={dataset.provider}"
-            / f"dataset_type={dataset.dataset_type}"
-            / str(dataset.identifier)
-            / str(revision_id)
-            / filename
-        )
-        return path
-
     def get_relative_path(self, path: Path) -> Path:
         """Return the relative path to the base of the repository"""
         return path.relative_to(self.base_dir)
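As a quick illustration of the new read-side contract (this is not taken from the package itself; it assumes ingestify 0.3.0 is installed, and the URL and storage_path values below are made up):

from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
from ingestify.infra import LocalFileRepository

# Hypothetical local repository; no transformations registered, so identifier keys map 1:1 to path parts.
repo = LocalFileRepository(
    url="file:///tmp/ingestify-store",
    identifier_transformer=IdentifierTransformer(),
)

# Writes still derive the full path from bucket/dataset/revision via get_write_path(...),
# but reads now only need the storage_path that was persisted for the file:
print(repo.get_read_path("main/provider=wyscout/dataset_type=match/match_id=123/1/events.json"))
# -> /tmp/ingestify-store/main/provider=wyscout/dataset_type=match/match_id=123/1/events.json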
ingestify/domain/models/ingestion/ingestion_job.py
CHANGED

@@ -2,7 +2,7 @@ import itertools
 import json
 import logging
 import uuid
-from typing import Optional
+from typing import Optional, Iterator

 from ingestify import retrieve_http
 from ingestify.application.dataset_store import DatasetStore
@@ -22,7 +22,7 @@ from ingestify.utils import TaskExecutor, chunker
 logger = logging.getLogger(__name__)


-DEFAULT_CHUNK_SIZE =
+DEFAULT_CHUNK_SIZE = 1_000


 def run_task(task):
@@ -32,14 +32,14 @@ def run_task(task):

 def to_batches(input_):
     if isinstance(input_, list):
-        batches =
+        batches = iter(input_)
     else:
         # Assume it's an iterator. Peek what's inside, and put it back
         try:
             peek = next(input_)
         except StopIteration:
             # Nothing to batch
-            return []
+            return iter([])

         input_ = itertools.chain([peek], input_)

@@ -184,6 +184,9 @@ class CreateDatasetTask(Task):
         return f"CreateDatasetTask({self.dataset_resource.provider} -> {self.dataset_resource.dataset_resource_id})"


+MAX_TASKS_PER_CHUNK = 10_000
+
+
 class IngestionJob:
     def __init__(
         self,
@@ -197,96 +200,126 @@ class IngestionJob:

     def execute(
         self, store: DatasetStore, task_executor: TaskExecutor
-    ) -> IngestionJobSummary:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    ) -> Iterator[IngestionJobSummary]:
+        is_first_chunk = True
+        ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
+        # Process all items in batches. Yield a IngestionJobSummary per batch
+
+        logger.info("Finding metadata")
+        with ingestion_job_summary.record_timing("get_dataset_collection"):
+            dataset_collection_metadata = store.get_dataset_collection(
+                dataset_type=self.ingestion_plan.dataset_type,
+                data_spec_versions=self.selector.data_spec_versions,
+                selector=self.selector,
+                metadata_only=True,
+            ).metadata
+        logger.info(f"Done: {dataset_collection_metadata}")
+
+        # There are two different, but similar flows here:
+        # 1. The discover_datasets returns a list, and the entire list can be processed at once
+        # 2. The discover_datasets returns an iterator of batches, in this case we need to process each batch
+        with ingestion_job_summary.record_timing("find_datasets"):
+            # Timing might be incorrect as it is an iterator
+            dataset_resources = self.ingestion_plan.source.find_datasets(
+                dataset_type=self.ingestion_plan.dataset_type,
+                data_spec_versions=self.selector.data_spec_versions,
+                dataset_collection_metadata=dataset_collection_metadata,
+                **self.selector.custom_attributes,
+            )

-
+        finish_task_timer = ingestion_job_summary.start_timing("tasks")

-
-        for batch in batches:
-            dataset_identifiers = [
-                Identifier.create_from_selector(
-                    self.selector, **dataset_resource.dataset_resource_id
-                )
-                # We have to pass the data_spec_versions here as a Source can add some
-                # extra data to the identifier which is retrieved in a certain data format
-                for dataset_resource in batch
-            ]
-
-            # Load all available datasets based on the discovered dataset identifiers
-            dataset_collection = store.get_dataset_collection(
-                dataset_type=self.ingestion_plan.dataset_type,
-                # Assume all DatasetResources share the same provider
-                provider=batch[0].provider,
-                selector=dataset_identifiers,
-            )
+        batches = to_batches(dataset_resources)

-
+        while True:
+            try:
+                batch = next(batches)
+            except StopIteration:
+                break
+            except Exception:
+                # TODO: handle exception on IngestionJob level
+                raise

-
-
-
-
-
+            dataset_identifiers = [
+                Identifier.create_from_selector(
+                    self.selector, **dataset_resource.dataset_resource_id
+                )
+                # We have to pass the data_spec_versions here as a Source can add some
+                # extra data to the identifier which is retrieved in a certain data format
+                for dataset_resource in batch
+            ]
+
+            # Load all available datasets based on the discovered dataset identifiers
+            dataset_collection = store.get_dataset_collection(
+                dataset_type=self.ingestion_plan.dataset_type,
+                # Assume all DatasetResources share the same provider
+                provider=batch[0].provider,
+                selector=dataset_identifiers,
+            )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        dataset_resource
-
-
-                        CreateDatasetTask(
-                            dataset_resource=dataset_resource,
-                            store=store,
-                        )
-                    )
-                else:
-                    skip_count += 1
-
-            if task_set:
-                logger.info(
-                    f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
-                    f"using selector {self.selector} => {len(task_set)} tasks. {skip_count} skipped."
-                )
-                logger.info(f"Running {len(task_set)} tasks")
-                ingestion_job_summary.add_task_summaries(
-                    task_executor.run(run_task, task_set)
+            skipped_datasets = 0
+
+            task_set = TaskSet()
+            for dataset_resource in batch:
+                dataset_identifier = Identifier.create_from_selector(
+                    self.selector, **dataset_resource.dataset_resource_id
+                )
+
+                if dataset := dataset_collection.get(dataset_identifier):
+                    if self.ingestion_plan.fetch_policy.should_refetch(
+                        dataset, dataset_resource
+                    ):
+                        task_set.add(
+                            UpdateDatasetTask(
+                                dataset=dataset,  # Current dataset from the database
+                                dataset_resource=dataset_resource,  # Most recent dataset_resource
+                                store=store,
+                            )
                        )
                    else:
-
-
-
+                        skipped_datasets += 1
+                else:
+                    if self.ingestion_plan.fetch_policy.should_fetch(dataset_resource):
+                        task_set.add(
+                            CreateDatasetTask(
+                                dataset_resource=dataset_resource,
+                                store=store,
+                            )
                        )
+                    else:
+                        skipped_datasets += 1
+
+            if task_set:
+                logger.info(
+                    f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                    f"using selector {self.selector} => {len(task_set)} tasks. {skipped_datasets} skipped."
+                )
+                logger.info(f"Running {len(task_set)} tasks")
+                ingestion_job_summary.add_task_summaries(
+                    task_executor.run(run_task, task_set)
+                )
+            else:
+                logger.info(
+                    f"Discovered {len(dataset_identifiers)} datasets from {self.ingestion_plan.source.__class__.__name__} "
+                    f"using selector {self.selector} => nothing to do"
+                )
+
+            ingestion_job_summary.increase_skipped_datasets(skipped_datasets)
+
+            if ingestion_job_summary.task_count() >= MAX_TASKS_PER_CHUNK:
+                finish_task_timer()
+                ingestion_job_summary.set_finished()
+                yield ingestion_job_summary
+
+                # Start a new one
+                is_first_chunk = False
+                ingestion_job_summary = IngestionJobSummary.new(ingestion_job=self)
+
+                # We will resume tasks, start timer right away
+                finish_task_timer = ingestion_job_summary.start_timing("tasks")

-
+        if ingestion_job_summary.task_count() > 0 or is_first_chunk:
+            # When there is interesting information to store, or there was no data at all, store it
+            finish_task_timer()
+            ingestion_job_summary.set_finished()
+            yield ingestion_job_summary
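The chunking behaviour added above is easier to see in isolation. The sketch below is an illustration only, not ingestify code: a bare generator that yields one "summary" per chunk of completed tasks, the way execute() now yields an IngestionJobSummary whenever task_count() reaches MAX_TASKS_PER_CHUNK.

from typing import Iterable, Iterator, List

MAX_TASKS_PER_CHUNK = 4  # the real module uses 10_000


def run_in_chunks(batches: Iterable[List[str]]) -> Iterator[List[str]]:
    summary: List[str] = []  # stands in for IngestionJobSummary
    for batch in batches:
        summary.extend(f"done:{task}" for task in batch)  # stands in for add_task_summaries(...)
        if len(summary) >= MAX_TASKS_PER_CHUNK:
            yield summary    # comparable to `yield ingestion_job_summary`
            summary = []     # start a new summary for the next chunk
    yield summary            # final, possibly smaller, summary


for chunk_summary in run_in_chunks([["a", "b"], ["c", "d", "e"], ["f"]]):
    print(chunk_summary)
# ['done:a', 'done:b', 'done:c', 'done:d', 'done:e']
# ['done:f']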
ingestify/domain/models/ingestion/ingestion_job_summary.py
CHANGED

@@ -1,3 +1,4 @@
+import uuid
 from contextlib import contextmanager
 from datetime import datetime, timedelta
 from typing import Optional, List, TYPE_CHECKING
@@ -14,13 +15,15 @@ if TYPE_CHECKING:


 def format_duration(duration: timedelta):
-    return f"{duration.total_seconds():.
+    return f"{duration.total_seconds():.2f}sec"


 class IngestionJobSummary(BaseModel):
+    ingestion_job_summary_id: str
     ingestion_job_id: str

     # From the IngestionPlan
+    provider: str
     source_name: str
     dataset_type: str
     data_spec_versions: DataSpecVersionCollection
@@ -31,6 +34,7 @@ class IngestionJobSummary(BaseModel):
     timings: List[Timing] = Field(default_factory=list)
     task_summaries: List[TaskSummary] = Field(default_factory=list)

+    skipped_datasets: int = 0
     failed_tasks: int = 0
     successful_tasks: int = 0
     ignored_successful_tasks: int = 0
@@ -38,7 +42,9 @@ class IngestionJobSummary(BaseModel):
     @classmethod
     def new(cls, ingestion_job: "IngestionJob"):
         args = dict(
+            ingestion_job_summary_id=str(uuid.uuid1()),
             ingestion_job_id=ingestion_job.ingestion_job_id,
+            provider=ingestion_job.ingestion_plan.source.provider,
             source_name=ingestion_job.ingestion_plan.source.name,
             dataset_type=ingestion_job.ingestion_plan.dataset_type,
             data_spec_versions=ingestion_job.ingestion_plan.data_spec_versions,
@@ -52,9 +58,23 @@ class IngestionJobSummary(BaseModel):
         yield
         self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))

+    def start_timing(self, name):
+        start = utcnow()
+
+        def finish():
+            self.timings.append(Timing(name=name, started_at=start, ended_at=utcnow()))
+
+        return finish
+
     def add_task_summaries(self, task_summaries: List[TaskSummary]):
         self.task_summaries.extend(task_summaries)

+    def increase_skipped_datasets(self, skipped_datasets: int):
+        self.skipped_datasets += skipped_datasets
+
+    def task_count(self):
+        return len(self.task_summaries)
+
     def set_finished(self):
         self.failed_tasks = len(
             [task for task in self.task_summaries if task.status == TaskStatus.FAILED]
@@ -80,6 +100,7 @@ class IngestionJobSummary(BaseModel):
         print("--------------------")
         print(f" - IngestionPlan:")
         print(f"     Source: {self.source_name}")
+        print(f"     Provider: {self.provider}")
         print(f"     DatasetType: {self.dataset_type}")
         print(f" - Selector: {self.selector}")
         print(f" - Timings: ")
@@ -89,14 +110,10 @@ class IngestionJobSummary(BaseModel):
             f" - Tasks: {len(self.task_summaries)} - {(len(self.task_summaries) / self.duration.total_seconds()):.1f} tasks/sec"
         )

-
-
-
-
-        ]:
-            print(
-                f" - {status.value.lower()}: {len([task for task in self.task_summaries if task.status == status])}"
-            )
+        print(f" - Failed tasks: {self.failed_tasks}")
+        print(f" - Successful tasks: {self.successful_tasks}")
+        print(f" - Successful ignored tasks: {self.successful_tasks}")
+        print(f" - Skipped datasets: {self.skipped_datasets}")
         print("--------------------")

     def __enter__(self):
ingestify/domain/services/identifier_key_transformer.py
ADDED

@@ -0,0 +1,111 @@
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Callable, Optional, Union
+
+from ingestify.exceptions import IngestifyError
+
+
+class TransformationType(Enum):
+    IDENTITY = "IDENTITY"
+    BUCKET = "BUCKET"
+    RANGE = "RANGE"
+    CUSTOM = "CUSTOM"
+
+
+class Transformation(ABC):
+    @property
+    @abstractmethod
+    def transformation_type(self) -> TransformationType:
+        pass
+
+    def is_identity(self) -> bool:
+        return self.transformation_type == TransformationType.IDENTITY
+
+    @abstractmethod
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        pass
+
+    @classmethod
+    def from_dict(cls, config: dict) -> "Transformation":
+        type_ = config.pop("type")
+        if type_ == "bucket":
+            return BucketTransformation(**config)
+        else:
+            raise IngestifyError(f"Cannot build Transformation from {config}")
+
+
+class IdentityTransformation(Transformation):
+    transformation_type = TransformationType.IDENTITY
+
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        # Return the original value as a string
+        return str(id_key_value)
+
+
+class BucketTransformation(Transformation):
+    transformation_type = TransformationType.BUCKET
+
+    def __init__(self, bucket_size: int = None, bucket_count: int = None):
+        self.bucket_size = bucket_size
+        self.bucket_count = bucket_count
+
+    def __call__(self, id_key_value: Union[str, int]) -> str:
+        if self.bucket_count:
+            return str(int(id_key_value) % self.bucket_count)
+        elif self.bucket_size:
+            bucket_start = int(id_key_value) // self.bucket_size * self.bucket_size
+            bucket_end = bucket_start + self.bucket_size - 1
+            return f"{bucket_start}-{bucket_end}"
+        else:
+            raise IngestifyError("Invalid BucketTransformation")
+
+
+class IdentifierTransformer:
+    def __init__(self):
+        # Mapping of (provider, dataset_type, id_key) to the transformation
+        self.key_transformations: dict[tuple[str, str, str], Transformation] = {}
+
+    def register_transformation(
+        self,
+        provider: str,
+        dataset_type: str,
+        id_key: str,
+        transformation: Union[Transformation, dict],
+    ):
+        """
+        Registers a transformation for a specific (provider, dataset_type, id_key).
+        """
+        if isinstance(transformation, dict):
+            transformation = Transformation.from_dict(transformation)
+
+        self.key_transformations[(provider, dataset_type, id_key)] = transformation
+
+    def get_transformation(
+        self, provider: str, dataset_type: str, id_key: str
+    ) -> Transformation:
+        """
+        Retrieves the transformation for the given column or defaults to identity.
+        """
+        transformation = self.key_transformations.get((provider, dataset_type, id_key))
+        return transformation if transformation else IdentityTransformation()
+
+    def to_path(self, provider: str, dataset_type: str, identifier: dict) -> str:
+        """
+        Transforms the identifier into a path string using registered transformations.
+        For non-identity transformations, includes both transformed and original values,
+        with the transformed value appearing first and including the suffix.
+        """
+        path_parts = []
+        for key, value in identifier.items():
+            transformation = self.get_transformation(provider, dataset_type, key)
+            if not transformation.is_identity():
+                # Non-identity transformation: include both transformed and original
+                transformed_value = transformation(value)
+                suffix = transformation.transformation_type.value.lower()
+                path_parts.append(f"{key}_{suffix}={transformed_value}")
+
+            # Append the original value (either standalone for identity or alongside transformed)
+            path_parts.append(f"{key}={value}")
+
+        # Join the parts with `/` to form the full path
+        return "/".join(path_parts)
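Since the whole module is new, a short usage sketch may help. It only uses the classes added above and assumes ingestify 0.3.0 is installed; the provider, dataset type and identifier values are made up.

from ingestify.domain.services.identifier_key_transformer import (
    BucketTransformation,
    IdentifierTransformer,
)

transformer = IdentifierTransformer()
transformer.register_transformation(
    provider="wyscout",        # made-up values; any provider/dataset_type pair works
    dataset_type="match",
    id_key="match_id",
    transformation=BucketTransformation(bucket_size=1000),
)

# Keys without a registered transformation fall back to IdentityTransformation and only
# produce "<key>=<value>"; transformed keys additionally get a "<key>_<type>=<bucket>" part.
path = transformer.to_path(
    provider="wyscout",
    dataset_type="match",
    identifier={"season_id": 2023, "match_id": 5123456},
)
print(path)
# season_id=2023/match_id_bucket=5123000-5123999/match_id=5123456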
ingestify/infra/store/dataset/sqlalchemy/mapping.py
CHANGED

@@ -229,9 +229,11 @@ mapper_registry.map_imperatively(File, file_table)
 ingestion_job_summary = Table(
     "ingestion_job_summary",
     metadata,
-    Column("
+    Column("ingestion_job_summary_id", String(255), primary_key=True),
+    Column("ingestion_job_id", String(255), index=True),
     # From the IngestionPlan
     Column("source_name", String(255)),
+    Column("provider", String(255)),
     Column("dataset_type", String(255)),
     Column(
         "data_spec_versions",
@@ -250,6 +252,7 @@ ingestion_job_summary = Table(
     # Some task counters
     Column("successful_tasks", Integer),
     Column("ignored_successful_tasks", Integer),
+    Column("skipped_datasets", Integer),
     Column("failed_tasks", Integer),
     Column(
         "timings",
@@ -281,9 +284,9 @@ task_summary_table = Table(
     "task_summary",
     metadata,
     Column(
-        "
+        "ingestion_job_summary_id",
         String(255),
-        ForeignKey("ingestion_job_summary.
+        ForeignKey("ingestion_job_summary.ingestion_job_summary_id"),
         primary_key=True,
     ),
     Column("task_id", Integer, primary_key=True),
ingestify/infra/store/dataset/sqlalchemy/repository.py
CHANGED

@@ -209,9 +209,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         )

         if not metadata_only:
-            dataset_query = apply_query_filter(
-                self.session.query(Dataset)  # .options(joinedload(Dataset.revisions))
-            )
+            dataset_query = apply_query_filter(self.session.query(Dataset))
             datasets = list(dataset_query)
         else:
             datasets = []
ingestify/infra/store/file/local_file_repository.py
CHANGED

@@ -19,14 +19,12 @@ class LocalFileRepository(FileRepository):
         filename: str,
         stream: BinaryIO,
     ) -> Path:
-        path = self.
+        path = self.get_write_path(bucket, dataset, revision_id, filename)
         path.parent.mkdir(parents=True, exist_ok=True)

         with open(path, "wb") as fp:
             shutil.copyfileobj(stream, fp)
         return path

-    def load_content(
-        self,
-    ) -> BinaryIO:
-        return open(self.get_path(bucket, dataset, revision_id, filename), "rb")
+    def load_content(self, storage_path: str) -> BinaryIO:
+        return open(self.get_read_path(storage_path), "rb")
ingestify/infra/store/file/s3_file_repository.py
CHANGED

@@ -8,10 +8,7 @@ from ingestify.domain.models import FileRepository


 class S3FileRepository(FileRepository):
-
-        super().__init__(url)
-
-        self._s3 = None
+    _s3 = None

     @property
     def s3(self):
@@ -30,16 +27,14 @@ class S3FileRepository(FileRepository):
         filename: str,
         stream: BinaryIO,
     ) -> Path:
-        key = self.
+        key = self.get_write_path(bucket, dataset, revision_id, filename)
         s3_bucket = Path(key.parts[0])

         self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).put(Body=stream)
         return key

-    def load_content(
-
-    ) -> BinaryIO:
-        key = self.get_path(bucket, dataset, revision_id, filename)
+    def load_content(self, storage_path: str) -> BinaryIO:
+        key = self.get_read_path(storage_path)
         s3_bucket = Path(key.parts[0])
         return self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).get()[
             "Body"
ingestify/main.py
CHANGED

@@ -19,6 +19,7 @@ from ingestify.domain.models.event import EventBus, Publisher, Subscriber

 from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
 from ingestify.domain.models.fetch_policy import FetchPolicy
+from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
 from ingestify.exceptions import ConfigurationError
 from ingestify.infra import S3FileRepository, LocalFileRepository
 from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
@@ -60,11 +61,15 @@ def import_cls(name):
     return getattr(mod, components[-1])


-def build_file_repository(file_url: str) -> FileRepository:
+def build_file_repository(file_url: str, identifier_transformer) -> FileRepository:
     if file_url.startswith("s3://"):
-        repository = S3FileRepository(
+        repository = S3FileRepository(
+            url=file_url, identifier_transformer=identifier_transformer
+        )
     elif file_url.startswith("file://"):
-        repository = LocalFileRepository(
+        repository = LocalFileRepository(
+            url=file_url, identifier_transformer=identifier_transformer
+        )
     else:
         raise Exception(f"Cannot find repository to handle file {file_url}")

@@ -72,7 +77,7 @@ def build_file_repository(file_url: str) -> FileRepository:


 def get_dataset_store_by_urls(
-    metadata_url: str, file_url: str, bucket: str
+    metadata_url: str, file_url: str, bucket: str, dataset_types
 ) -> DatasetStore:
     """
     Initialize a DatasetStore by a DatasetRepository and a FileRepository
@@ -80,7 +85,19 @@ def get_dataset_store_by_urls(
     if not bucket:
         raise Exception("Bucket is not specified")

-
+    identifier_transformer = IdentifierTransformer()
+    for dataset_type in dataset_types:
+        for id_key, id_config in dataset_type["identifier_keys"].items():
+            identifier_transformer.register_transformation(
+                provider=dataset_type["provider"],
+                dataset_type=dataset_type["dataset_type"],
+                id_key=id_key,
+                transformation=id_config["transformation"],
+            )
+
+    file_repository = build_file_repository(
+        file_url, identifier_transformer=identifier_transformer
+    )

     if secrets_manager.supports(metadata_url):
         metadata_url = secrets_manager.load_as_db_url(metadata_url)
@@ -103,14 +120,15 @@ def get_datastore(config_file, bucket: Optional[str] = None) -> DatasetStore:
     config = parse_config(config_file, default_value="")

     return get_dataset_store_by_urls(
-
+        metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
+        dataset_types=config.get("dataset_types", []),
     )


 def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:
-    return get_dataset_store_by_urls(
+    return get_dataset_store_by_urls(metadata_url=url, file_url=url, bucket=bucket)


 def get_source_cls(key: str) -> Type[Source]:
@@ -173,6 +191,7 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
         metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
+        dataset_types=config.get("dataset_types", []),
     )

     # Setup an EventBus and wire some more components
@@ -188,7 +207,7 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
         store=store,
     )

-    logger.info("
+    logger.info("Adding IngestionPlans...")

     fetch_policy = FetchPolicy()

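The loop added to get_dataset_store_by_urls implies a particular shape for the dataset_types entries. The sketch below reproduces that loop with a made-up entry; the key names (provider, dataset_type, identifier_keys, transformation) come from the code above, everything else is illustrative. In the YAML config this list would come from a top-level dataset_types key, per config.get("dataset_types", []).

from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer

dataset_types = [
    {
        "provider": "wyscout",          # made-up example values
        "dataset_type": "match",
        "identifier_keys": {
            "match_id": {"transformation": {"type": "bucket", "bucket_size": 1000}},
        },
    },
]

# Same wiring as the new code in get_dataset_store_by_urls:
identifier_transformer = IdentifierTransformer()
for dataset_type in dataset_types:
    for id_key, id_config in dataset_type["identifier_keys"].items():
        identifier_transformer.register_transformation(
            provider=dataset_type["provider"],
            dataset_type=dataset_type["dataset_type"],
            id_key=id_key,
            transformation=id_config["transformation"],
        )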
{ingestify-0.2.0.dist-info → ingestify-0.3.0.dist-info}/RECORD
CHANGED

@@ -1,14 +1,14 @@
-ingestify/__init__.py,sha256=
+ingestify/__init__.py,sha256=DnPPEtJT32gAPuUKXgIsqUE4fIvc6QA96vrcKr6nz6A,301
 ingestify/cmdline.py,sha256=bIuyPgGEw4wIglNzpG9zp7TsJozsP8NSVsCe4eAyWUg,7189
 ingestify/exceptions.py,sha256=wMMuajl4AkQRfW60TLN7btJmQaH8-lUczXyW_2g9kOU,143
-ingestify/main.py,sha256=
+ingestify/main.py,sha256=0sTNoLcS7euOavIAviQIMTolRnXsvOvNbmFdXgXgxhE,8516
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
 ingestify/utils.py,sha256=HETGhAoUlutLG0cQR63nac2JbFei9gnktDHeBQoYWfU,5692
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/application/dataset_store.py,sha256=
+ingestify/application/dataset_store.py,sha256=6xMHa_ShyPOyegIKl2xwmRl3BlV5i21z95cpKW3oARw,11712
 ingestify/application/ingestion_engine.py,sha256=PtMjKMpvfqB802G5zfKLzyamdH7qFOXl3x6_97y8w60,2288
-ingestify/application/loader.py,sha256=
+ingestify/application/loader.py,sha256=v8ZcpMDEml9k_uFPFqT4WaCjXED_OIpAr7g0Pz5Hp6Y,7153
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
 ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
@@ -27,7 +27,7 @@ ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4
 ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
 ingestify/domain/models/dataset/file.py,sha256=nuoZI9GI5OysYwWCCyNsHMlm1Z9A1GbEKd38jvBzJ4E,4119
 ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
-ingestify/domain/models/dataset/file_repository.py,sha256=
+ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
 ingestify/domain/models/dataset/revision.py,sha256=O_1HG2S2EmYdWqI2K282S_D-d6IhRh_f4Q3wV8MEhkk,1311
 ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
@@ -39,8 +39,8 @@ ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmce
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256=
-ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=U6B62c7NGeHBAjmKhgOa4uHeul34xyR66WtWaPSRNTU,12276
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=1l9O3QJkYLs74HhrwAijwNEriPMwHN9OFG64Iz4z3uI,4262
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
 ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
@@ -49,6 +49,7 @@ ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIp
 ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
 ingestify/domain/models/task/task_summary.py,sha256=ovzqKPstngRVzVA_JboQMluq5uQjKVJDsWNNcfcadhU,3774
 ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestify/domain/services/identifier_key_transformer.py,sha256=y4GS9u9Ej1MO2jUhAxWbifp0mrE_MqTHvVVcoQzSKb4,4034
 ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-_hWZJTMcduS1Gg7EM4X95Cqxi1QIM,809
 ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
@@ -63,11 +64,11 @@ ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nO
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256
+ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=UlEIfNusSOEWOxPi_ORrdLSylbi6-TO1qwEmcrBLwog,9447
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=-eSR_F9tS9Hd3JNEpoJoDAb5RY38rFaKLMI3eBedjx8,7068
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
-ingestify/infra/store/file/local_file_repository.py,sha256=
-ingestify/infra/store/file/s3_file_repository.py,sha256=
+ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
+ingestify/infra/store/file/s3_file_repository.py,sha256=_sekV1rfEbwIaSGhKRnFQlj92E9qNgONiwXt6ZLCyGg,1188
 ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
 ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
@@ -78,8 +79,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
-ingestify-0.
+ingestify-0.3.0.dist-info/METADATA,sha256=-QlChdV6OYWkqSyXUmkQTG4deBliRsSmmZMTWKeURnI,18853
+ingestify-0.3.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.3.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.3.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.3.0.dist-info/RECORD,,
{ingestify-0.2.0.dist-info → ingestify-0.3.0.dist-info}/WHEEL
File without changes

{ingestify-0.2.0.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt
File without changes

{ingestify-0.2.0.dist-info → ingestify-0.3.0.dist-info}/top_level.txt
File without changes