ingestify 0.9.1__tar.gz → 0.9.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.9.1 → ingestify-0.9.3}/PKG-INFO +1 -1
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/__init__.py +1 -1
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/application/dataset_store.py +5 -2
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/revision.py +3 -1
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/task/task_summary.py +1 -3
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/fetch/http.py +3 -1
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/dataset/sqlalchemy/repository.py +16 -9
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_engine.py +46 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/PKG-INFO +1 -1
- {ingestify-0.9.1 → ingestify-0.9.3}/README.md +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/application/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/application/ingestion_engine.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/application/loader.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/cmdline.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/base.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/data_spec_version_collection.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/dataset.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/file.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/ingestion/ingestion_job.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/exceptions.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/source/statsbomb/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/source/statsbomb/base.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/source/statsbomb/match.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/main.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/server.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/source_base.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/__init__.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/conftest.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_auto_ingest.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_events.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_file_cache.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_pagination.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_store_version.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_table_prefix.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/utils.py +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/SOURCES.txt +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/setup.cfg +0 -0
- {ingestify-0.9.1 → ingestify-0.9.3}/setup.py +0 -0
|
@@ -383,6 +383,7 @@ class DatasetStore:
|
|
|
383
383
|
files: Dict[str, DraftFile],
|
|
384
384
|
revision_source: RevisionSource,
|
|
385
385
|
description: str = "Update",
|
|
386
|
+
force_save: bool = False,
|
|
386
387
|
):
|
|
387
388
|
"""
|
|
388
389
|
Create new revision first, so FileRepository can use
|
|
@@ -392,7 +393,7 @@ class DatasetStore:
|
|
|
392
393
|
created_at = utcnow()
|
|
393
394
|
|
|
394
395
|
persisted_files_ = self._persist_files(dataset, revision_id, files)
|
|
395
|
-
if persisted_files_:
|
|
396
|
+
if persisted_files_ or force_save:
|
|
396
397
|
# It can happen an API tells us data is changed, but it was not changed. In this case
|
|
397
398
|
# we decide to ignore it.
|
|
398
399
|
# Make sure there are files changed before creating a new revision
|
|
@@ -487,7 +488,9 @@ class DatasetStore:
|
|
|
487
488
|
updated_at=now,
|
|
488
489
|
last_modified_at=None, # Not known at this moment
|
|
489
490
|
)
|
|
490
|
-
revision = self.add_revision(
|
|
491
|
+
revision = self.add_revision(
|
|
492
|
+
dataset, files, revision_source, description, force_save=True
|
|
493
|
+
)
|
|
491
494
|
|
|
492
495
|
self.dispatch(DatasetCreated(dataset=dataset))
|
|
493
496
|
return revision
|
|
@@ -38,7 +38,9 @@ class Revision(BaseModel):
|
|
|
38
38
|
|
|
39
39
|
@property
|
|
40
40
|
def last_modified_at(self):
|
|
41
|
-
|
|
41
|
+
if self.modified_files:
|
|
42
|
+
return max(file.modified_at for file in self.modified_files)
|
|
43
|
+
return None
|
|
42
44
|
|
|
43
45
|
@property
|
|
44
46
|
def modified_files_map(self) -> Dict[str, File]:
|
|
@@ -86,9 +86,7 @@ class TaskSummary(BaseModel, HasTiming):
|
|
|
86
86
|
if revision:
|
|
87
87
|
self.persisted_file_count = len(revision.modified_files)
|
|
88
88
|
self.bytes_retrieved = sum(file.size for file in revision.modified_files)
|
|
89
|
-
self.last_modified = max(
|
|
90
|
-
file.modified_at for file in revision.modified_files
|
|
91
|
-
)
|
|
89
|
+
self.last_modified = revision.last_modified_at
|
|
92
90
|
else:
|
|
93
91
|
self.state = TaskState.FINISHED_IGNORED
|
|
94
92
|
|
|
@@ -58,7 +58,9 @@ def retrieve_http(
|
|
|
58
58
|
)
|
|
59
59
|
# else:
|
|
60
60
|
# print(f"{current_file.modified_at=} {last_modified=}")
|
|
61
|
-
headers["if-modified-since"] = format_datetime(
|
|
61
|
+
headers["if-modified-since"] = format_datetime(
|
|
62
|
+
current_file.modified_at, usegmt=True
|
|
63
|
+
)
|
|
62
64
|
headers["if-none-match"] = current_file.tag
|
|
63
65
|
|
|
64
66
|
http_kwargs = {}
|
|
@@ -375,7 +375,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
375
375
|
dataset_ids_cte.c.dataset_id == self.revision_table.c.dataset_id,
|
|
376
376
|
)
|
|
377
377
|
)
|
|
378
|
-
.order_by(
|
|
378
|
+
.order_by(
|
|
379
|
+
self.revision_table.c.dataset_id, self.revision_table.c.revision_id
|
|
380
|
+
)
|
|
379
381
|
)
|
|
380
382
|
|
|
381
383
|
for dataset_id, revisions in itertools.groupby(
|
|
@@ -468,9 +470,14 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
468
470
|
dataset_ids = [row.dataset_id for row in dataset_query]
|
|
469
471
|
datasets = self._load_datasets(dataset_ids)
|
|
470
472
|
|
|
473
|
+
last_modified_values = [
|
|
474
|
+
dataset.last_modified_at
|
|
475
|
+
for dataset in datasets
|
|
476
|
+
if dataset.last_modified_at is not None
|
|
477
|
+
]
|
|
471
478
|
dataset_collection_metadata = DatasetCollectionMetadata(
|
|
472
|
-
last_modified=max(
|
|
473
|
-
if
|
|
479
|
+
last_modified=max(last_modified_values)
|
|
480
|
+
if last_modified_values
|
|
474
481
|
else None,
|
|
475
482
|
row_count=len(datasets),
|
|
476
483
|
)
|
|
@@ -560,22 +567,22 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
560
567
|
try:
|
|
561
568
|
# Delete modified files related to the dataset
|
|
562
569
|
connection.execute(
|
|
563
|
-
file_table.delete().where(
|
|
564
|
-
file_table.c.dataset_id == dataset.dataset_id
|
|
570
|
+
self.file_table.delete().where(
|
|
571
|
+
self.file_table.c.dataset_id == dataset.dataset_id
|
|
565
572
|
)
|
|
566
573
|
)
|
|
567
574
|
|
|
568
575
|
# Delete revisions related to the dataset
|
|
569
576
|
connection.execute(
|
|
570
|
-
revision_table.delete().where(
|
|
571
|
-
revision_table.c.dataset_id == dataset.dataset_id
|
|
577
|
+
self.revision_table.delete().where(
|
|
578
|
+
self.revision_table.c.dataset_id == dataset.dataset_id
|
|
572
579
|
)
|
|
573
580
|
)
|
|
574
581
|
|
|
575
582
|
# Delete the dataset itself
|
|
576
583
|
connection.execute(
|
|
577
|
-
dataset_table.delete().where(
|
|
578
|
-
dataset_table.c.dataset_id == dataset.dataset_id
|
|
584
|
+
self.dataset_table.delete().where(
|
|
585
|
+
self.dataset_table.c.dataset_id == dataset.dataset_id
|
|
579
586
|
)
|
|
580
587
|
)
|
|
581
588
|
|
|
@@ -251,6 +251,28 @@ class FailingJobSource(Source):
|
|
|
251
251
|
raise Exception("some failure")
|
|
252
252
|
|
|
253
253
|
|
|
254
|
+
class NoFilesSource(Source):
|
|
255
|
+
provider = "fake"
|
|
256
|
+
|
|
257
|
+
def find_datasets(
|
|
258
|
+
self,
|
|
259
|
+
dataset_type: str,
|
|
260
|
+
data_spec_versions: DataSpecVersionCollection,
|
|
261
|
+
dataset_collection_metadata: DatasetCollectionMetadata,
|
|
262
|
+
competition_id,
|
|
263
|
+
season_id,
|
|
264
|
+
**kwargs,
|
|
265
|
+
):
|
|
266
|
+
yield DatasetResource(
|
|
267
|
+
dataset_resource_id=dict(
|
|
268
|
+
competition_id=competition_id, season_id=season_id, match_id=1
|
|
269
|
+
),
|
|
270
|
+
provider="fake",
|
|
271
|
+
dataset_type="match",
|
|
272
|
+
name="Dataset Without Files",
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
|
|
254
276
|
def test_engine(config_file):
|
|
255
277
|
engine = get_engine(config_file, "main")
|
|
256
278
|
|
|
@@ -499,3 +521,27 @@ def test_post_load_files_hook(config_file):
|
|
|
499
521
|
engine.load()
|
|
500
522
|
dataset2 = engine.store.get_dataset_collection().first()
|
|
501
523
|
assert dataset2.state == DatasetState.COMPLETE
|
|
524
|
+
|
|
525
|
+
|
|
526
|
+
def test_force_save_creates_revision(config_file):
|
|
527
|
+
"""Test that datasets get a revision even when no files are persisted."""
|
|
528
|
+
engine = get_engine(config_file, "main")
|
|
529
|
+
|
|
530
|
+
# Create one dataset with files and one without
|
|
531
|
+
add_ingestion_plan(
|
|
532
|
+
engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=2
|
|
533
|
+
)
|
|
534
|
+
add_ingestion_plan(
|
|
535
|
+
engine, NoFilesSource("fake-source"), competition_id=1, season_id=3
|
|
536
|
+
)
|
|
537
|
+
|
|
538
|
+
engine.load()
|
|
539
|
+
|
|
540
|
+
# This should not fail even though one dataset has no last_modified_at
|
|
541
|
+
datasets = engine.store.get_dataset_collection()
|
|
542
|
+
assert len(datasets) == 2
|
|
543
|
+
|
|
544
|
+
# Verify the dataset without files still has a revision
|
|
545
|
+
dataset_without_files = engine.store.get_dataset_collection(season_id=3).first()
|
|
546
|
+
assert len(dataset_without_files.revisions) == 1
|
|
547
|
+
assert len(dataset_without_files.current_revision.modified_files) == 0
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/ingestion/ingestion_job_summary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/services/transformers/kloppy_to_pandas.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|