ingestify 0.9.4.tar.gz → 0.10.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. {ingestify-0.9.4 → ingestify-0.10.0}/PKG-INFO +1 -1
  2. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/__init__.py +1 -1
  3. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +37 -13
  4. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +19 -2
  5. ingestify-0.10.0/ingestify/tests/conftest.py +59 -0
  6. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_auto_ingest.py +13 -26
  7. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_engine.py +19 -30
  8. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_file_cache.py +1 -2
  9. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_pagination.py +2 -4
  10. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_store_version.py +17 -10
  11. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/PKG-INFO +1 -1
  12. ingestify-0.9.4/ingestify/tests/conftest.py +0 -17
  13. {ingestify-0.9.4 → ingestify-0.10.0}/README.md +0 -0
  14. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/application/__init__.py +0 -0
  15. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/application/dataset_store.py +0 -0
  16. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/application/ingestion_engine.py +0 -0
  17. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/application/loader.py +0 -0
  18. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/application/secrets_manager.py +0 -0
  19. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/cmdline.py +0 -0
  20. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/__init__.py +0 -0
  21. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/__init__.py +0 -0
  22. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/base.py +0 -0
  23. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  24. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/__init__.py +0 -0
  25. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/collection.py +0 -0
  26. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  27. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/dataset.py +0 -0
  28. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  29. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  30. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/events.py +0 -0
  31. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/file.py +0 -0
  32. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
  33. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
  34. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  35. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/revision.py +0 -0
  36. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/selector.py +0 -0
  37. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/__init__.py +0 -0
  38. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/_old_event.py +0 -0
  39. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  40. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/domain_event.py +0 -0
  41. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/event_bus.py +0 -0
  42. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/publisher.py +0 -0
  43. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/subscriber.py +0 -0
  44. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/fetch_policy.py +0 -0
  45. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
  46. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/ingestion/ingestion_job.py +0 -0
  47. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  48. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  49. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/resources/__init__.py +0 -0
  50. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  51. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/sink.py +0 -0
  52. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/source.py +0 -0
  53. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/task/__init__.py +0 -0
  54. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/task/set.py +0 -0
  55. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/task/task.py +0 -0
  56. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/task/task_summary.py +0 -0
  57. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/timing.py +0 -0
  58. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/services/__init__.py +0 -0
  59. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  60. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/services/transformers/__init__.py +0 -0
  61. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  62. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/exceptions.py +0 -0
  63. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/__init__.py +0 -0
  64. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/fetch/__init__.py +0 -0
  65. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/fetch/http.py +0 -0
  66. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/serialization/__init__.py +0 -0
  67. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/sink/__init__.py +0 -0
  68. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/sink/postgresql.py +0 -0
  69. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/source/__init__.py +0 -0
  70. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/source/statsbomb/__init__.py +0 -0
  71. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/source/statsbomb/base.py +0 -0
  72. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/source/statsbomb/match.py +0 -0
  73. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/source/statsbomb_github.py +0 -0
  74. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/__init__.py +0 -0
  75. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/dataset/__init__.py +0 -0
  76. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  77. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/file/__init__.py +0 -0
  78. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  79. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  80. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  81. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/main.py +0 -0
  82. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/server.py +0 -0
  83. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/source_base.py +0 -0
  84. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/__init__.py +0 -0
  85. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_events.py +0 -0
  86. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_table_prefix.py +0 -0
  87. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/utils.py +0 -0
  88. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/SOURCES.txt +0 -0
  89. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/dependency_links.txt +0 -0
  90. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/entry_points.txt +0 -0
  91. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/requires.txt +0 -0
  92. {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/top_level.txt +0 -0
  93. {ingestify-0.9.4 → ingestify-0.10.0}/setup.cfg +0 -0
  94. {ingestify-0.9.4 → ingestify-0.10.0}/setup.py +0 -0
--- ingestify-0.9.4/PKG-INFO
+++ ingestify-0.10.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestify
-Version: 0.9.4
+Version: 0.10.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
--- ingestify-0.9.4/ingestify/__init__.py
+++ ingestify-0.10.0/ingestify/__init__.py
@@ -9,4 +9,4 @@ if not __INGESTIFY_SETUP__:
 from .source_base import Source, DatasetResource
 from .main import debug_source
 
-__version__ = "0.9.4"
+__version__ = "0.10.0"
--- ingestify-0.9.4/ingestify/infra/store/dataset/sqlalchemy/repository.py
+++ ingestify-0.10.0/ingestify/infra/store/dataset/sqlalchemy/repository.py
@@ -130,7 +130,7 @@ class SqlAlchemySessionProvider:
         self._init_engine()
 
         # Create all tables in the database
-        self.metadata.create_all(self.engine)
+        self.create_all_tables()
 
     def __del__(self):
         self.close()
@@ -143,6 +143,14 @@ class SqlAlchemySessionProvider:
         if hasattr(self, "engine"):
             self.engine.dispose()
 
+    def create_all_tables(self):
+        self.metadata.create_all(self.engine)
+
+    def drop_all_tables(self):
+        """Drop all tables in the database. Useful for test cleanup."""
+        if hasattr(self, "metadata") and hasattr(self, "engine"):
+            self.metadata.drop_all(self.engine)
+
     def get(self):
         return self.session()
 
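The session provider's schema management is now factored into `create_all_tables` and `drop_all_tables`, thin wrappers over SQLAlchemy's `MetaData.create_all` and `MetaData.drop_all`. A minimal sketch of the same lifecycle outside ingestify (the table definition and in-memory URL here are illustrative, not from the package):

```python
from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine

metadata = MetaData()
dataset = Table(
    "dataset",
    metadata,
    Column("dataset_id", String(255), primary_key=True),
    Column("revision_count", Integer),
)

engine = create_engine("sqlite:///:memory:")

# Create the full schema up front, as the provider does on init ...
metadata.create_all(engine)
# ... and tear it down afterwards, as drop_all_tables now allows for test cleanup.
metadata.drop_all(engine)
```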
@@ -208,18 +216,33 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
         primary_key_columns = [column for column in table.columns if column.primary_key]
 
-        if immutable_rows:
-            stmt = stmt.on_conflict_do_nothing(index_elements=primary_key_columns)
+        if dialect == "mysql":
+            # MySQL uses ON DUPLICATE KEY UPDATE syntax
+            if immutable_rows:
+                # For MySQL immutable rows, use INSERT IGNORE to skip duplicates
+                stmt = stmt.prefix_with("IGNORE")
+            else:
+                # MySQL uses stmt.inserted instead of stmt.excluded
+                set_ = {
+                    name: stmt.inserted[name]
+                    for name, column in table.columns.items()
+                    if column not in primary_key_columns
+                }
+                stmt = stmt.on_duplicate_key_update(set_)
         else:
-            set_ = {
-                name: getattr(stmt.excluded, name)
-                for name, column in table.columns.items()
-                if column not in primary_key_columns
-            }
-
-            stmt = stmt.on_conflict_do_update(
-                index_elements=primary_key_columns, set_=set_
-            )
+            # PostgreSQL and SQLite use ON CONFLICT syntax
+            if immutable_rows:
+                stmt = stmt.on_conflict_do_nothing(index_elements=primary_key_columns)
+            else:
+                set_ = {
+                    name: getattr(stmt.excluded, name)
+                    for name, column in table.columns.items()
+                    if column not in primary_key_columns
+                }
+
+                stmt = stmt.on_conflict_do_update(
+                    index_elements=primary_key_columns, set_=set_
+                )
 
         connection.execute(stmt)
 
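The upsert now branches on dialect because MySQL has no `ON CONFLICT` clause: immutable rows fall back to `INSERT IGNORE`, and updates use `ON DUPLICATE KEY UPDATE`, reading replacement values from `stmt.inserted` where PostgreSQL and SQLite read from `stmt.excluded`. A standalone sketch of the same dispatch using SQLAlchemy's dialect-specific insert constructs (the table and column names are illustrative):

```python
from sqlalchemy import Column, MetaData, String, Table
from sqlalchemy.dialects.mysql import insert as mysql_insert
from sqlalchemy.dialects.postgresql import insert as pg_insert

metadata = MetaData()
dataset = Table(
    "dataset",
    metadata,
    Column("dataset_id", String(255), primary_key=True),
    Column("name", String(255)),
)

row = {"dataset_id": "abc", "name": "Match 1"}

def upsert_stmt(dialect_name: str, immutable_rows: bool = False):
    if dialect_name == "mysql":
        stmt = mysql_insert(dataset).values(**row)
        if immutable_rows:
            # INSERT IGNORE: silently skip rows whose primary key already exists
            return stmt.prefix_with("IGNORE")
        # ON DUPLICATE KEY UPDATE; replacement values come from stmt.inserted
        return stmt.on_duplicate_key_update(name=stmt.inserted.name)
    stmt = pg_insert(dataset).values(**row)
    if immutable_rows:
        return stmt.on_conflict_do_nothing(index_elements=[dataset.c.dataset_id])
    # ON CONFLICT ... DO UPDATE; replacement values come from stmt.excluded
    return stmt.on_conflict_do_update(
        index_elements=[dataset.c.dataset_id],
        set_={"name": stmt.excluded.name},
    )
```

SQLite's `insert` construct exposes the same `on_conflict_*` methods as PostgreSQL's, which is why those two dialects can share the `else` branch above.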
@@ -242,7 +265,8 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def _build_cte(self, records: list[dict], name: str) -> CTE:
         """Build a CTE from a list of dictionaries."""
 
-        if self.dialect.name == "sqlite":
+        if self.dialect.name in ("sqlite", "mysql"):
+            # SQLite and MySQL don't support VALUES syntax, use UNION ALL instead
             return self._build_cte_sqlite(records, name)
 
         first_row = records[0]
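`_build_cte` now routes MySQL through the same fallback as SQLite: instead of a `VALUES` list, each record becomes a one-row `SELECT` of labeled literals, and the rows are glued together with `UNION ALL`. A sketch of that fallback shape (hedged: ingestify's actual `_build_cte_sqlite` may differ in detail):

```python
from sqlalchemy import literal, select, union_all

records = [{"dataset_id": "a", "rank": 1}, {"dataset_id": "b", "rank": 2}]

# One single-row SELECT per record; the UNION ALL of them is equivalent to
# VALUES ('a', 1), ('b', 2) on engines that lack the VALUES form in a CTE.
selects = [
    select(*(literal(value).label(key) for key, value in record.items()))
    for record in records
]
lookup_cte = union_all(*selects).cte("lookup")
```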
--- ingestify-0.9.4/ingestify/infra/store/dataset/sqlalchemy/tables.py
+++ ingestify-0.10.0/ingestify/infra/store/dataset/sqlalchemy/tables.py
@@ -51,14 +51,31 @@ class TZDateTime(TypeDecorator):
     LOCAL_TIMEZONE = datetime.datetime.utcnow().astimezone().tzinfo
     cache_ok = True
 
+    def __init__(self, fsp=None, **kwargs):
+        super().__init__(**kwargs)
+        self.fsp = fsp
+
+    def load_dialect_impl(self, dialect):
+        # For MySQL, use DATETIME with fractional seconds precision
+        if dialect.name == "mysql" and self.fsp is not None:
+            from sqlalchemy.dialects.mysql import DATETIME as MySQL_DATETIME
+
+            # Return the type without type_descriptor to ensure our process methods are called
+            return MySQL_DATETIME(fsp=self.fsp)
+        return super().load_dialect_impl(dialect)
+
     def process_bind_param(self, value: Optional[datetime.datetime], dialect):
         if not value:
             return None
 
         if value.tzinfo is None:
-            value = value.astimezone(self.LOCAL_TIMEZONE)
+            # Assume naive datetimes are already in UTC
+            value = value.replace(tzinfo=datetime.timezone.utc)
+        else:
+            # Convert timezone-aware datetimes to UTC
+            value = value.astimezone(datetime.timezone.utc)
 
-        return value.astimezone(datetime.timezone.utc)
+        return value
 
     def process_result_value(self, value, dialect):
         if not value:
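The bind-side change makes `TZDateTime` deterministic across machines: a naive datetime is now taken to already be in UTC and merely tagged with `tzinfo`, instead of being reinterpreted in the host's local timezone before conversion. The new rule, shown on plain datetimes (a sketch of the logic, not the ingestify API):

```python
import datetime

def to_utc(value: datetime.datetime) -> datetime.datetime:
    if value.tzinfo is None:
        # Naive: assume it is already UTC and only attach the tzinfo
        return value.replace(tzinfo=datetime.timezone.utc)
    # Aware: convert to UTC
    return value.astimezone(datetime.timezone.utc)

naive = datetime.datetime(2024, 1, 1, 12, 0)
aware = datetime.datetime(
    2024, 1, 1, 13, 0, tzinfo=datetime.timezone(datetime.timedelta(hours=1))
)

assert to_utc(naive).hour == 12  # unchanged, only tagged as UTC
assert to_utc(aware).hour == 12  # 13:00+01:00 converted to 12:00 UTC
```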
--- /dev/null
+++ ingestify-0.10.0/ingestify/tests/conftest.py
@@ -0,0 +1,59 @@
+import tempfile
+
+import os
+
+import pytest
+
+from ingestify.main import get_engine
+
+
+@pytest.fixture(scope="function", autouse=True)
+def datastore_dir():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        os.environ["TEST_DIR"] = tmpdirname
+        os.environ["INGESTIFY_RUN_EAGER"] = "true"
+        yield tmpdirname
+
+
+@pytest.fixture(scope="function")
+def ingestify_test_database_url(datastore_dir, monkeypatch):
+    key = "INGESTIFY_TEST_DATABASE_URL"
+
+    value = os.environ.get(key)
+    if value is None:
+        value = f"sqlite:///{datastore_dir}/main.db"
+    monkeypatch.setenv(key, value)
+
+    yield value
+
+
+@pytest.fixture(scope="function")
+def config_file(ingestify_test_database_url):
+    # Depend on ingestify_test_database_url to make sure environment variables are set in time, also make sure database is
+    # cleaned before ingestify opens a connection
+    return os.path.abspath(os.path.dirname(__file__) + "/config.yaml")
+
+
+@pytest.fixture
+def db_cleanup():
+    def do_cleanup(engine):
+        # # Close connections after test
+        session_provider = getattr(
+            engine.store.dataset_repository, "session_provider", None
+        )
+        if session_provider:
+            session_provider.session.remove()
+            session_provider.engine.dispose()
+            session_provider.drop_all_tables()
+
+    return do_cleanup
+
+
+@pytest.fixture(scope="function")
+def engine(config_file, db_cleanup):
+    # Now create the engine for the test
+    engine = get_engine(config_file, "main")
+
+    yield engine
+
+    db_cleanup(engine)
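With this conftest a test only has to request `engine`: the fixture chain (`datastore_dir` → `ingestify_test_database_url` → `config_file` → `engine`) provisions a temporary directory and database URL, and `db_cleanup` drops all tables afterwards, which keeps runs against a shared MySQL or PostgreSQL server isolated. A hypothetical test showing the intended usage:

```python
def test_store_starts_empty(engine):
    # Setup and teardown come from the fixtures; every test sees a fresh store
    datasets = engine.store.get_dataset_collection()
    assert len(datasets) == 0
```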
--- ingestify-0.9.4/ingestify/tests/test_auto_ingest.py
+++ ingestify-0.10.0/ingestify/tests/test_auto_ingest.py
@@ -8,6 +8,7 @@ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
 from ingestify.domain.models.fetch_policy import FetchPolicy
 from ingestify.domain import Selector, DataSpecVersionCollection
 from ingestify import Source, DatasetResource
+from ingestify.utils import utcnow
 
 
 class MockSource(Source):
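The tests now stamp files with `ingestify.utils.utcnow` instead of the naive `datetime.datetime.now()`, so `last_modified` values are timezone-aware and line up with the UTC normalization in `TZDateTime`. The helper's body is not part of this diff; presumably it is equivalent to:

```python
import datetime

def utcnow() -> datetime.datetime:
    # Timezone-aware "now" in UTC (assumption: mirrors ingestify.utils.utcnow)
    return datetime.datetime.now(datetime.timezone.utc)
```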
@@ -39,7 +40,7 @@ class MockSource(Source):
             url="http://test.com/match1",
         ).add_file(
             data_feed_key="test",
-            last_modified=datetime.datetime.now(),
+            last_modified=utcnow(),
             json_content={"blaat": "piet"},
         )
 
@@ -75,7 +76,7 @@ class MockSourceWithDiscoverSelectors(Source):
                 url="http://test.com/match1",
             ).add_file(
                 data_feed_key="test",
-                last_modified=datetime.datetime.now(),
+                last_modified=utcnow(),
                 json_content={"competition_id": 11},
             )
         elif competition_id == 22:
@@ -91,7 +92,7 @@ class MockSourceWithDiscoverSelectors(Source):
                 url="http://test.com/match2",
             ).add_file(
                 data_feed_key="test",
-                last_modified=datetime.datetime.now(),
+                last_modified=utcnow(),
                 json_content={"competition_id": 22},
            )
 
@@ -106,10 +107,8 @@
         ]
 
 
-def test_iter_datasets_basic_auto_ingest(config_file):
+def test_iter_datasets_basic_auto_ingest(engine):
     """Test basic auto-ingest functionality."""
-    engine = get_engine(config_file)
-
     # Add a simple ingestion plan
     mock_source = MockSource(name="test_source")
     data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
@@ -141,20 +140,16 @@ def test_iter_datasets_basic_auto_ingest(config_file):
     assert datasets[0].identifier["competition_id"] == 11
 
 
-def test_iter_datasets_auto_ingest_disabled(config_file):
+def test_iter_datasets_auto_ingest_disabled(engine):
     """Test that auto_ingest=False returns only existing datasets."""
-    engine = get_engine(config_file)
-
     # Should only return existing datasets (none in empty store)
     datasets = list(engine.iter_datasets(competition_id=11, auto_ingest=False))
 
     assert len(datasets) == 0
 
 
-def test_iter_datasets_outside_config_scope(config_file):
+def test_iter_datasets_outside_config_scope(engine):
     """Test that requests outside IngestionPlan scope return nothing."""
-    engine = get_engine(config_file)
-
     # Add plan only for competition_id=11
     mock_source = MockSource(name="test_source")
     data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
@@ -180,10 +175,8 @@ def test_iter_datasets_outside_config_scope(config_file):
     assert len(datasets) == 0
 
 
-def test_iter_datasets_discover_selectors_with_filters(config_file):
+def test_iter_datasets_discover_selectors_with_filters(engine):
     """Test that selector_filters are applied after discover_selectors runs."""
-    engine = get_engine(config_file)
-
     # Create an IngestionPlan with empty selector - this will trigger discover_selectors
     mock_source = MockSourceWithDiscoverSelectors(name="test_source_discover")
     data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
@@ -216,10 +209,8 @@ def test_iter_datasets_discover_selectors_with_filters(config_file):
     assert datasets[0].name == "Mock match comp 11"
 
 
-def test_iter_datasets_discover_selectors_multiple_matches(config_file):
+def test_iter_datasets_discover_selectors_multiple_matches(engine):
     """Test that multiple discovered selectors can match the filters."""
-    engine = get_engine(config_file)
-
     # Create an IngestionPlan with empty selector - this will trigger discover_selectors
     mock_source = MockSourceWithDiscoverSelectors(name="test_source_discover")
     data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
@@ -248,12 +239,10 @@
     assert competition_ids == {11, 22}
 
 
-def test_selector_filters_make_discovered_selectors_more_strict(config_file):
+def test_selector_filters_make_discovered_selectors_more_strict(engine):
     """Test that when selector_filters are more strict than discovered selectors, we make the selectors more strict."""
     from unittest.mock import Mock
 
-    engine = get_engine(config_file)
-
     # Create a source that returns multiple matches per season
     class MockSourceMultipleMatches(Source):
         @property
@@ -291,7 +280,7 @@ def test_selector_filters_make_discovered_selectors_more_strict(config_file):
                     url=f"http://test.com/match{mid}",
                 ).add_file(
                     data_feed_key="test",
-                    last_modified=datetime.datetime.now(),
+                    last_modified=utcnow(),
                     json_content={"match_id": mid},
                 )
             return []
@@ -348,13 +337,11 @@
    # Without this optimization, we'd call with match_id=None and fetch 3 matches instead of 1
 
 
-def test_iter_datasets_with_open_data_auto_discovery(config_file):
+def test_iter_datasets_with_open_data_auto_discovery(engine):
     """Test that use_open_data=True auto-discovers open data sources without configuration."""
     from unittest.mock import Mock
     from ingestify.application import loader
 
-    engine = get_engine(config_file)
-
     # Create mock source class that inherits from Source
     class MockOpenDataSource(Source):
         def __init__(self, name):
@@ -387,7 +374,7 @@ def test_iter_datasets_with_open_data_auto_discovery(config_file):
             url="http://open-data.com/match123",
         ).add_file(
             data_feed_key="test",
-            last_modified=datetime.datetime.now(),
+            last_modified=utcnow(),
            json_content={"match_id": 123},
        )
 
--- ingestify-0.9.4/ingestify/tests/test_engine.py
+++ ingestify-0.10.0/ingestify/tests/test_engine.py
@@ -25,7 +25,8 @@ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
 from ingestify.domain.models.fetch_policy import FetchPolicy
 from ingestify.domain.models.task.task_summary import TaskState
 from ingestify.infra.serialization import serialize, deserialize
-from ingestify.main import get_engine, get_dev_engine
+from ingestify.main import get_dev_engine
+from ingestify.utils import utcnow
 
 
 def add_ingestion_plan(engine: IngestionEngine, source: Source, **selector):
@@ -78,7 +79,7 @@ class SimpleFakeSource(Source):
         season_id,
         **kwargs,
     ):
-        last_modified = datetime.now(pytz.utc)
+        last_modified = utcnow()
 
         yield (
             DatasetResource(
@@ -273,9 +274,7 @@ class NoFilesSource(Source):
         )
 
 
-def test_engine(config_file):
-    engine = get_engine(config_file, "main")
-
+def test_engine(engine):
     add_ingestion_plan(
         engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=2
     )
@@ -293,6 +292,7 @@ def test_engine(config_file):
 
     dataset = datasets.first()
     assert dataset.identifier == Identifier(competition_id=1, season_id=2, match_id=1)
+
     assert len(dataset.revisions) == 2
     assert len(dataset.revisions[0].modified_files) == 3
     assert len(dataset.revisions[1].modified_files) == 1
@@ -325,13 +325,11 @@
     assert dataset.last_modified_at is not None
 
 
-def test_iterator_source(config_file):
+def test_iterator_source(engine):
     """Test when a Source returns a Iterator to do Batch processing.
 
     Every batch must be executed right away.
     """
-    engine = get_engine(config_file, "main")
-
     batch_source = None
 
     def callback(idx):
@@ -339,7 +337,7 @@
         datasets = engine.store.get_dataset_collection()
         assert len(datasets) == idx
 
-        if idx == 1000:
+        if idx == 100:
             batch_source.should_stop = True
 
     batch_source = BatchSource("fake-source", callback)
@@ -348,7 +346,7 @@
     engine.load()
 
     datasets = engine.store.get_dataset_collection()
-    assert len(datasets) == 1000
+    assert len(datasets) == 100
     for dataset in datasets:
         assert len(dataset.revisions) == 1
 
@@ -357,14 +355,14 @@
     batch_source.should_stop = False
 
     def callback(idx):
-        if idx == 1000:
+        if idx == 100:
             batch_source.should_stop = True
 
     batch_source.callback = callback
 
     engine.load()
     datasets = engine.store.get_dataset_collection()
-    assert len(datasets) == 1000
+    assert len(datasets) == 100
     for dataset in datasets:
         assert len(dataset.revisions) == 2
 
@@ -373,9 +371,7 @@
     deserialize(s)
 
 
-def test_ingestion_plan_failing_task(config_file):
-    engine = get_engine(config_file, "main")
-
+def test_ingestion_plan_failing_task(engine):
     source = FailingLoadSource("fake-source")
 
     add_ingestion_plan(engine, source, competition_id=1, season_id=2)
@@ -387,9 +383,7 @@ def test_ingestion_plan_failing_task(config_file):
     assert items[0].task_summaries[0].state == TaskState.FAILED
 
 
-def test_ingestion_plan_failing_job(config_file):
-    engine = get_engine(config_file, "main")
-
+def test_ingestion_plan_failing_job(engine):
     source = FailingJobSource("fake-source")
 
     add_ingestion_plan(engine, source, competition_id=1, season_id=2)
@@ -412,9 +406,7 @@ def test_change_partition_key_transformer():
     """
 
 
-def test_serde(config_file):
-    engine = get_engine(config_file, "main")
-
+def test_serde(engine):
     add_ingestion_plan(
         engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=2
     )
@@ -434,10 +426,8 @@ def test_serde(config_file):
     assert event.model_dump_json() == deserialized_event.model_dump_json()
 
 
-def test_empty_dataset_resource_id(config_file):
+def test_empty_dataset_resource_id(engine):
     """When a empty DatasetResourceId is passed nothing should break"""
-    engine = get_engine(config_file, "main")
-
     add_ingestion_plan(engine, EmptyDatasetResourceIdSource("fake-source"))
     engine.load()
 
@@ -509,9 +499,8 @@ class SourceWithHook(Source):
         )
 
 
-def test_post_load_files_hook(config_file):
+def test_post_load_files_hook(engine):
     """Test that post_load_files hook changes state from SCHEDULED to COMPLETE when content is not empty."""
-    engine = get_engine(config_file, "main")
     add_ingestion_plan(engine, SourceWithHook("test"), competition_id=1, season_id=2)
 
     # First run: file contains '{}', state should remain SCHEDULED
@@ -525,10 +514,8 @@ def test_post_load_files_hook(config_file):
     assert dataset2.state == DatasetState.COMPLETE
 
 
-def test_force_save_creates_revision(config_file):
+def test_force_save_creates_revision(engine):
     """Test that datasets get a revision even when no files are persisted."""
-    engine = get_engine(config_file, "main")
-
     # Create one dataset with files and one without
     add_ingestion_plan(
         engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=2
@@ -552,7 +539,9 @@ def test_force_save_creates_revision(config_file):
         season_id=2
     ).first()
 
-    dataset_without_files = engine.store.get_dataset_collection(metadata_only=True)
+    dataset_without_files = engine.store.get_dataset_collection(
+        season_id=2, metadata_only=True
+    )
     assert (
         dataset_without_files.metadata.last_modified
         == dataset_with_last_modified.last_modified_at
--- ingestify-0.9.4/ingestify/tests/test_file_cache.py
+++ ingestify-0.10.0/ingestify/tests/test_file_cache.py
@@ -8,10 +8,9 @@ from ingestify.domain import Dataset, Identifier, Revision, File
 from ingestify.domain.models.dataset.revision import RevisionSource, SourceType
 
 
-def test_file_cache(config_file):
+def test_file_cache(engine):
     """Test file caching with the with_file_cache context manager."""
     # Get engine from the fixture
-    engine = get_engine(config_file, "main")
     store = engine.store
 
     # Create a timestamp for test data
--- ingestify-0.9.4/ingestify/tests/test_pagination.py
+++ ingestify-0.10.0/ingestify/tests/test_pagination.py
@@ -6,10 +6,9 @@ from ingestify.domain import Dataset, Identifier, DatasetState
 from ingestify.main import get_engine
 
 
-def test_iter_dataset_collection_batches(config_file):
+def test_iter_dataset_collection_batches(engine):
     """Test iteration over datasets with batches using iter_dataset_collection_batches."""
     # Get engine from the fixture
-    engine = get_engine(config_file, "main")
     store = engine.store
     bucket = store.bucket
 
@@ -81,10 +80,9 @@ def test_iter_dataset_collection_batches(config_file):
     assert filtered_dataset_ids[0] == "dataset-5"
 
 
-def test_dataset_state_filter(config_file):
+def test_dataset_state_filter(engine):
     """Test filtering datasets by state."""
     # Get engine from the fixture
-    engine = get_engine(config_file, "main")
     store = engine.store
     bucket = store.bucket
 
--- ingestify-0.9.4/ingestify/tests/test_store_version.py
+++ ingestify-0.10.0/ingestify/tests/test_store_version.py
@@ -4,7 +4,7 @@ from unittest.mock import patch
 from ingestify.main import get_engine
 
 
-def test_store_version_tracking_new_store(config_file):
+def test_store_version_tracking_new_store(config_file, db_cleanup):
     """Test that a new store gets initialized with the current version."""
     with patch("ingestify.__version__", "1.0.0"):
         engine = get_engine(config_file)
@@ -13,8 +13,10 @@ def test_store_version_tracking_new_store(config_file):
         stored_version = engine.store.dataset_repository.get_store_version()
         assert stored_version == "1.0.0"
 
+    db_cleanup(engine)
 
-def test_store_version_tracking_existing_store_same_version(config_file):
+
+def test_store_version_tracking_existing_store_same_version(config_file, db_cleanup):
     """Test that an existing store with same version doesn't cause issues."""
     with patch("ingestify.__version__", "1.0.0"):
         # Initialize store first time
@@ -29,16 +31,20 @@ def test_store_version_tracking_existing_store_same_version(config_file):
         stored_version = store2.dataset_repository.get_store_version()
         assert stored_version == "1.0.0"
 
+    db_cleanup(engine1)
+
 
-def test_store_version_tracking_version_mismatch(config_file, caplog):
+def test_store_version_tracking_version_mismatch(config_file, caplog, db_cleanup):
     """Test that version mismatch is logged as warning."""
-    # Initialize store with version 1.0.0
-    with patch("ingestify.__version__", "1.0.0"):
+    # Use engine as fixture as this cleans up the database
+
+    # Initialize store with version 1.0.1
+    with patch("ingestify.__version__", "1.0.1"):
         engine1 = get_engine(config_file)
         store1 = engine1.store
 
         stored_version = store1.dataset_repository.get_store_version()
-        assert stored_version == "1.0.0"
+        assert stored_version == "1.0.1"
 
     # Open store with different version
     with patch("ingestify.__version__", "2.0.0"):
@@ -47,16 +53,17 @@
 
         # Version should still be the original one
        stored_version = store2.dataset_repository.get_store_version()
-        assert stored_version == "1.0.0"
+        assert stored_version == "1.0.1"
 
         # Should have logged a warning about version mismatch
         assert "Store version mismatch" in caplog.text
-        assert "stored=1.0.0, current=2.0.0" in caplog.text
+        assert "stored=1.0.1, current=2.0.0" in caplog.text
+
+    db_cleanup(engine1)
 
 
-def test_store_version_methods(config_file):
+def test_store_version_methods(engine):
     """Test the repository version methods directly."""
-    engine = get_engine(config_file)
     repo = engine.store.dataset_repository
 
     from ingestify import __version__
--- ingestify-0.9.4/ingestify.egg-info/PKG-INFO
+++ ingestify-0.10.0/ingestify.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ingestify
-Version: 0.9.4
+Version: 0.10.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
--- ingestify-0.9.4/ingestify/tests/conftest.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import tempfile
-
-import pytest
-import os
-
-
-@pytest.fixture(scope="function", autouse=True)
-def datastore_dir():
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        os.environ["TEST_DIR"] = tmpdirname
-        os.environ["INGESTIFY_RUN_EAGER"] = "true"
-        yield tmpdirname
-
-
-@pytest.fixture(scope="session")
-def config_file():
-    return os.path.abspath(os.path.dirname(__file__) + "/config.yaml")