ingestify 0.9.1__tar.gz → 0.9.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93)
  1. {ingestify-0.9.1 → ingestify-0.9.3}/PKG-INFO +1 -1
  2. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/__init__.py +1 -1
  3. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/application/dataset_store.py +5 -2
  4. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/revision.py +3 -1
  5. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/task/task_summary.py +1 -3
  6. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/fetch/http.py +3 -1
  7. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/dataset/sqlalchemy/repository.py +16 -9
  8. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_engine.py +46 -0
  9. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/PKG-INFO +1 -1
  10. {ingestify-0.9.1 → ingestify-0.9.3}/README.md +0 -0
  11. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/application/__init__.py +0 -0
  12. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/application/ingestion_engine.py +0 -0
  13. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/application/loader.py +0 -0
  14. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/application/secrets_manager.py +0 -0
  15. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/cmdline.py +0 -0
  16. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/__init__.py +0 -0
  17. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/__init__.py +0 -0
  18. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/base.py +0 -0
  19. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  20. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/__init__.py +0 -0
  21. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/collection.py +0 -0
  22. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  23. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/dataset.py +0 -0
  24. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  25. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  26. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/events.py +0 -0
  27. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/file.py +0 -0
  28. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/file_collection.py +0 -0
  29. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/file_repository.py +0 -0
  30. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/identifier.py +0 -0
  31. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/dataset/selector.py +0 -0
  32. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/__init__.py +0 -0
  33. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/_old_event.py +0 -0
  34. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/dispatcher.py +0 -0
  35. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/domain_event.py +0 -0
  36. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/event_bus.py +0 -0
  37. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/publisher.py +0 -0
  38. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/event/subscriber.py +0 -0
  39. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/fetch_policy.py +0 -0
  40. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/ingestion/__init__.py +0 -0
  41. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/ingestion/ingestion_job.py +0 -0
  42. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  43. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  44. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/resources/__init__.py +0 -0
  45. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  46. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/sink.py +0 -0
  47. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/source.py +0 -0
  48. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/task/__init__.py +0 -0
  49. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/task/set.py +0 -0
  50. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/task/task.py +0 -0
  51. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/models/timing.py +0 -0
  52. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/services/__init__.py +0 -0
  53. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  54. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/services/transformers/__init__.py +0 -0
  55. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  56. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/exceptions.py +0 -0
  57. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/__init__.py +0 -0
  58. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/fetch/__init__.py +0 -0
  59. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/serialization/__init__.py +0 -0
  60. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/sink/__init__.py +0 -0
  61. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/sink/postgresql.py +0 -0
  62. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/source/__init__.py +0 -0
  63. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/source/statsbomb/__init__.py +0 -0
  64. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/source/statsbomb/base.py +0 -0
  65. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/source/statsbomb/match.py +0 -0
  66. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/source/statsbomb_github.py +0 -0
  67. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/__init__.py +0 -0
  68. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/dataset/__init__.py +0 -0
  69. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  70. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
  71. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/file/__init__.py +0 -0
  72. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  73. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/file/local_file_repository.py +0 -0
  74. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  75. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/main.py +0 -0
  76. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/server.py +0 -0
  77. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/source_base.py +0 -0
  78. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/__init__.py +0 -0
  79. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/conftest.py +0 -0
  80. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_auto_ingest.py +0 -0
  81. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_events.py +0 -0
  82. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_file_cache.py +0 -0
  83. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_pagination.py +0 -0
  84. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_store_version.py +0 -0
  85. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/tests/test_table_prefix.py +0 -0
  86. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify/utils.py +0 -0
  87. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/SOURCES.txt +0 -0
  88. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/dependency_links.txt +0 -0
  89. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/entry_points.txt +0 -0
  90. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/requires.txt +0 -0
  91. {ingestify-0.9.1 → ingestify-0.9.3}/ingestify.egg-info/top_level.txt +0 -0
  92. {ingestify-0.9.1 → ingestify-0.9.3}/setup.cfg +0 -0
  93. {ingestify-0.9.1 → ingestify-0.9.3}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ingestify
3
- Version: 0.9.1
3
+ Version: 0.9.3
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
@@ -9,4 +9,4 @@ if not __INGESTIFY_SETUP__:
9
9
  from .source_base import Source, DatasetResource
10
10
  from .main import debug_source
11
11
 
12
- __version__ = "0.9.1"
12
+ __version__ = "0.9.3"
@@ -383,6 +383,7 @@ class DatasetStore:
383
383
  files: Dict[str, DraftFile],
384
384
  revision_source: RevisionSource,
385
385
  description: str = "Update",
386
+ force_save: bool = False,
386
387
  ):
387
388
  """
388
389
  Create new revision first, so FileRepository can use
@@ -392,7 +393,7 @@ class DatasetStore:
392
393
  created_at = utcnow()
393
394
 
394
395
  persisted_files_ = self._persist_files(dataset, revision_id, files)
395
- if persisted_files_:
396
+ if persisted_files_ or force_save:
396
397
  # It can happen an API tells us data is changed, but it was not changed. In this case
397
398
  # we decide to ignore it.
398
399
  # Make sure there are files changed before creating a new revision
@@ -487,7 +488,9 @@ class DatasetStore:
487
488
  updated_at=now,
488
489
  last_modified_at=None, # Not known at this moment
489
490
  )
490
- revision = self.add_revision(dataset, files, revision_source, description)
491
+ revision = self.add_revision(
492
+ dataset, files, revision_source, description, force_save=True
493
+ )
491
494
 
492
495
  self.dispatch(DatasetCreated(dataset=dataset))
493
496
  return revision
@@ -38,7 +38,9 @@ class Revision(BaseModel):
38
38
 
39
39
  @property
40
40
  def last_modified_at(self):
41
- return max(file.modified_at for file in self.modified_files)
41
+ if self.modified_files:
42
+ return max(file.modified_at for file in self.modified_files)
43
+ return None
42
44
 
43
45
  @property
44
46
  def modified_files_map(self) -> Dict[str, File]:
@@ -86,9 +86,7 @@ class TaskSummary(BaseModel, HasTiming):
86
86
  if revision:
87
87
  self.persisted_file_count = len(revision.modified_files)
88
88
  self.bytes_retrieved = sum(file.size for file in revision.modified_files)
89
- self.last_modified = max(
90
- file.modified_at for file in revision.modified_files
91
- )
89
+ self.last_modified = revision.last_modified_at
92
90
  else:
93
91
  self.state = TaskState.FINISHED_IGNORED
94
92
 
@@ -58,7 +58,9 @@ def retrieve_http(
58
58
  )
59
59
  # else:
60
60
  # print(f"{current_file.modified_at=} {last_modified=}")
61
- headers["if-modified-since"] = format_datetime(current_file.modified_at, usegmt=True)
61
+ headers["if-modified-since"] = format_datetime(
62
+ current_file.modified_at, usegmt=True
63
+ )
62
64
  headers["if-none-match"] = current_file.tag
63
65
 
64
66
  http_kwargs = {}
@@ -375,7 +375,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
375
375
  dataset_ids_cte.c.dataset_id == self.revision_table.c.dataset_id,
376
376
  )
377
377
  )
378
- .order_by(self.revision_table.c.dataset_id)
378
+ .order_by(
379
+ self.revision_table.c.dataset_id, self.revision_table.c.revision_id
380
+ )
379
381
  )
380
382
 
381
383
  for dataset_id, revisions in itertools.groupby(
@@ -468,9 +470,14 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
468
470
  dataset_ids = [row.dataset_id for row in dataset_query]
469
471
  datasets = self._load_datasets(dataset_ids)
470
472
 
473
+ last_modified_values = [
474
+ dataset.last_modified_at
475
+ for dataset in datasets
476
+ if dataset.last_modified_at is not None
477
+ ]
471
478
  dataset_collection_metadata = DatasetCollectionMetadata(
472
- last_modified=max(dataset.last_modified_at for dataset in datasets)
473
- if datasets
479
+ last_modified=max(last_modified_values)
480
+ if last_modified_values
474
481
  else None,
475
482
  row_count=len(datasets),
476
483
  )
@@ -560,22 +567,22 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
560
567
  try:
561
568
  # Delete modified files related to the dataset
562
569
  connection.execute(
563
- file_table.delete().where(
564
- file_table.c.dataset_id == dataset.dataset_id
570
+ self.file_table.delete().where(
571
+ self.file_table.c.dataset_id == dataset.dataset_id
565
572
  )
566
573
  )
567
574
 
568
575
  # Delete revisions related to the dataset
569
576
  connection.execute(
570
- revision_table.delete().where(
571
- revision_table.c.dataset_id == dataset.dataset_id
577
+ self.revision_table.delete().where(
578
+ self.revision_table.c.dataset_id == dataset.dataset_id
572
579
  )
573
580
  )
574
581
 
575
582
  # Delete the dataset itself
576
583
  connection.execute(
577
- dataset_table.delete().where(
578
- dataset_table.c.dataset_id == dataset.dataset_id
584
+ self.dataset_table.delete().where(
585
+ self.dataset_table.c.dataset_id == dataset.dataset_id
579
586
  )
580
587
  )
581
588
 
@@ -251,6 +251,28 @@ class FailingJobSource(Source):
251
251
  raise Exception("some failure")
252
252
 
253
253
 
254
+ class NoFilesSource(Source):
255
+ provider = "fake"
256
+
257
+ def find_datasets(
258
+ self,
259
+ dataset_type: str,
260
+ data_spec_versions: DataSpecVersionCollection,
261
+ dataset_collection_metadata: DatasetCollectionMetadata,
262
+ competition_id,
263
+ season_id,
264
+ **kwargs,
265
+ ):
266
+ yield DatasetResource(
267
+ dataset_resource_id=dict(
268
+ competition_id=competition_id, season_id=season_id, match_id=1
269
+ ),
270
+ provider="fake",
271
+ dataset_type="match",
272
+ name="Dataset Without Files",
273
+ )
274
+
275
+
254
276
  def test_engine(config_file):
255
277
  engine = get_engine(config_file, "main")
256
278
 
@@ -499,3 +521,27 @@ def test_post_load_files_hook(config_file):
499
521
  engine.load()
500
522
  dataset2 = engine.store.get_dataset_collection().first()
501
523
  assert dataset2.state == DatasetState.COMPLETE
524
+
525
+
526
+ def test_force_save_creates_revision(config_file):
527
+ """Test that datasets get a revision even when no files are persisted."""
528
+ engine = get_engine(config_file, "main")
529
+
530
+ # Create one dataset with files and one without
531
+ add_ingestion_plan(
532
+ engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=2
533
+ )
534
+ add_ingestion_plan(
535
+ engine, NoFilesSource("fake-source"), competition_id=1, season_id=3
536
+ )
537
+
538
+ engine.load()
539
+
540
+ # This should not fail even though one dataset has no last_modified_at
541
+ datasets = engine.store.get_dataset_collection()
542
+ assert len(datasets) == 2
543
+
544
+ # Verify the dataset without files still has a revision
545
+ dataset_without_files = engine.store.get_dataset_collection(season_id=3).first()
546
+ assert len(dataset_without_files.revisions) == 1
547
+ assert len(dataset_without_files.current_revision.modified_files) == 0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ingestify
3
- Version: 0.9.1
3
+ Version: 0.9.3
4
4
  Summary: Data Ingestion Framework
5
5
  Author: Koen Vossen
6
6
  Author-email: info@koenvossen.nl
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes