ingestify 0.9.4__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ingestify-0.9.4 → ingestify-0.10.0}/PKG-INFO +1 -1
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/__init__.py +1 -1
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +37 -13
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +19 -2
- ingestify-0.10.0/ingestify/tests/conftest.py +59 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_auto_ingest.py +13 -26
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_engine.py +19 -30
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_file_cache.py +1 -2
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_pagination.py +2 -4
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_store_version.py +17 -10
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/PKG-INFO +1 -1
- ingestify-0.9.4/ingestify/tests/conftest.py +0 -17
- {ingestify-0.9.4 → ingestify-0.10.0}/README.md +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/application/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/application/dataset_store.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/application/ingestion_engine.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/application/loader.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/cmdline.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/base.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/collection.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/dataset.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/dataset_state.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/events.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/file.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/revision.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/dataset/selector.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/event_bus.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/ingestion/ingestion_job.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/resources/dataset_resource.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/task/task_summary.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/exceptions.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/fetch/http.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/source/statsbomb/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/source/statsbomb/base.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/source/statsbomb/match.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/source/statsbomb_github.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/main.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/server.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/source_base.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/__init__.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_events.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/tests/test_table_prefix.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify/utils.py +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/SOURCES.txt +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/requires.txt +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/setup.cfg +0 -0
- {ingestify-0.9.4 → ingestify-0.10.0}/setup.py +0 -0
|
@@ -130,7 +130,7 @@ class SqlAlchemySessionProvider:
|
|
|
130
130
|
self._init_engine()
|
|
131
131
|
|
|
132
132
|
# Create all tables in the database
|
|
133
|
-
self.
|
|
133
|
+
self.create_all_tables()
|
|
134
134
|
|
|
135
135
|
def __del__(self):
|
|
136
136
|
self.close()
|
|
@@ -143,6 +143,14 @@ class SqlAlchemySessionProvider:
|
|
|
143
143
|
if hasattr(self, "engine"):
|
|
144
144
|
self.engine.dispose()
|
|
145
145
|
|
|
146
|
+
def create_all_tables(self):
|
|
147
|
+
self.metadata.create_all(self.engine)
|
|
148
|
+
|
|
149
|
+
def drop_all_tables(self):
|
|
150
|
+
"""Drop all tables in the database. Useful for test cleanup."""
|
|
151
|
+
if hasattr(self, "metadata") and hasattr(self, "engine"):
|
|
152
|
+
self.metadata.drop_all(self.engine)
|
|
153
|
+
|
|
146
154
|
def get(self):
|
|
147
155
|
return self.session()
|
|
148
156
|
|
|
@@ -208,18 +216,33 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
208
216
|
|
|
209
217
|
primary_key_columns = [column for column in table.columns if column.primary_key]
|
|
210
218
|
|
|
211
|
-
if
|
|
212
|
-
|
|
219
|
+
if dialect == "mysql":
|
|
220
|
+
# MySQL uses ON DUPLICATE KEY UPDATE syntax
|
|
221
|
+
if immutable_rows:
|
|
222
|
+
# For MySQL immutable rows, use INSERT IGNORE to skip duplicates
|
|
223
|
+
stmt = stmt.prefix_with("IGNORE")
|
|
224
|
+
else:
|
|
225
|
+
# MySQL uses stmt.inserted instead of stmt.excluded
|
|
226
|
+
set_ = {
|
|
227
|
+
name: stmt.inserted[name]
|
|
228
|
+
for name, column in table.columns.items()
|
|
229
|
+
if column not in primary_key_columns
|
|
230
|
+
}
|
|
231
|
+
stmt = stmt.on_duplicate_key_update(set_)
|
|
213
232
|
else:
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
233
|
+
# PostgreSQL and SQLite use ON CONFLICT syntax
|
|
234
|
+
if immutable_rows:
|
|
235
|
+
stmt = stmt.on_conflict_do_nothing(index_elements=primary_key_columns)
|
|
236
|
+
else:
|
|
237
|
+
set_ = {
|
|
238
|
+
name: getattr(stmt.excluded, name)
|
|
239
|
+
for name, column in table.columns.items()
|
|
240
|
+
if column not in primary_key_columns
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
stmt = stmt.on_conflict_do_update(
|
|
244
|
+
index_elements=primary_key_columns, set_=set_
|
|
245
|
+
)
|
|
223
246
|
|
|
224
247
|
connection.execute(stmt)
|
|
225
248
|
|
|
@@ -242,7 +265,8 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
|
|
|
242
265
|
def _build_cte(self, records: list[dict], name: str) -> CTE:
|
|
243
266
|
"""Build a CTE from a list of dictionaries."""
|
|
244
267
|
|
|
245
|
-
if self.dialect.name
|
|
268
|
+
if self.dialect.name in ("sqlite", "mysql"):
|
|
269
|
+
# SQLite and MySQL don't support VALUES syntax, use UNION ALL instead
|
|
246
270
|
return self._build_cte_sqlite(records, name)
|
|
247
271
|
|
|
248
272
|
first_row = records[0]
|
|
@@ -51,14 +51,31 @@ class TZDateTime(TypeDecorator):
|
|
|
51
51
|
LOCAL_TIMEZONE = datetime.datetime.utcnow().astimezone().tzinfo
|
|
52
52
|
cache_ok = True
|
|
53
53
|
|
|
54
|
+
def __init__(self, fsp=None, **kwargs):
|
|
55
|
+
super().__init__(**kwargs)
|
|
56
|
+
self.fsp = fsp
|
|
57
|
+
|
|
58
|
+
def load_dialect_impl(self, dialect):
|
|
59
|
+
# For MySQL, use DATETIME with fractional seconds precision
|
|
60
|
+
if dialect.name == "mysql" and self.fsp is not None:
|
|
61
|
+
from sqlalchemy.dialects.mysql import DATETIME as MySQL_DATETIME
|
|
62
|
+
|
|
63
|
+
# Return the type without type_descriptor to ensure our process methods are called
|
|
64
|
+
return MySQL_DATETIME(fsp=self.fsp)
|
|
65
|
+
return super().load_dialect_impl(dialect)
|
|
66
|
+
|
|
54
67
|
def process_bind_param(self, value: Optional[datetime.datetime], dialect):
|
|
55
68
|
if not value:
|
|
56
69
|
return None
|
|
57
70
|
|
|
58
71
|
if value.tzinfo is None:
|
|
59
|
-
|
|
72
|
+
# Assume naive datetimes are already in UTC
|
|
73
|
+
value = value.replace(tzinfo=datetime.timezone.utc)
|
|
74
|
+
else:
|
|
75
|
+
# Convert timezone-aware datetimes to UTC
|
|
76
|
+
value = value.astimezone(datetime.timezone.utc)
|
|
60
77
|
|
|
61
|
-
return value
|
|
78
|
+
return value
|
|
62
79
|
|
|
63
80
|
def process_result_value(self, value, dialect):
|
|
64
81
|
if not value:
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from ingestify.main import get_engine
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@pytest.fixture(scope="function", autouse=True)
|
|
11
|
+
def datastore_dir():
|
|
12
|
+
with tempfile.TemporaryDirectory() as tmpdirname:
|
|
13
|
+
os.environ["TEST_DIR"] = tmpdirname
|
|
14
|
+
os.environ["INGESTIFY_RUN_EAGER"] = "true"
|
|
15
|
+
yield tmpdirname
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@pytest.fixture(scope="function")
|
|
19
|
+
def ingestify_test_database_url(datastore_dir, monkeypatch):
|
|
20
|
+
key = "INGESTIFY_TEST_DATABASE_URL"
|
|
21
|
+
|
|
22
|
+
value = os.environ.get(key)
|
|
23
|
+
if value is None:
|
|
24
|
+
value = f"sqlite:///{datastore_dir}/main.db"
|
|
25
|
+
monkeypatch.setenv(key, value)
|
|
26
|
+
|
|
27
|
+
yield value
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@pytest.fixture(scope="function")
|
|
31
|
+
def config_file(ingestify_test_database_url):
|
|
32
|
+
# Depend on ingestify_test_database_url to make sure environment variables are set in time, also make sure database is
|
|
33
|
+
# cleaned before ingestify opens a connection
|
|
34
|
+
return os.path.abspath(os.path.dirname(__file__) + "/config.yaml")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@pytest.fixture
|
|
38
|
+
def db_cleanup():
|
|
39
|
+
def do_cleanup(engine):
|
|
40
|
+
# # Close connections after test
|
|
41
|
+
session_provider = getattr(
|
|
42
|
+
engine.store.dataset_repository, "session_provider", None
|
|
43
|
+
)
|
|
44
|
+
if session_provider:
|
|
45
|
+
session_provider.session.remove()
|
|
46
|
+
session_provider.engine.dispose()
|
|
47
|
+
session_provider.drop_all_tables()
|
|
48
|
+
|
|
49
|
+
return do_cleanup
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@pytest.fixture(scope="function")
|
|
53
|
+
def engine(config_file, db_cleanup):
|
|
54
|
+
# Now create the engine for the test
|
|
55
|
+
engine = get_engine(config_file, "main")
|
|
56
|
+
|
|
57
|
+
yield engine
|
|
58
|
+
|
|
59
|
+
db_cleanup(engine)
|
|
@@ -8,6 +8,7 @@ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
|
|
|
8
8
|
from ingestify.domain.models.fetch_policy import FetchPolicy
|
|
9
9
|
from ingestify.domain import Selector, DataSpecVersionCollection
|
|
10
10
|
from ingestify import Source, DatasetResource
|
|
11
|
+
from ingestify.utils import utcnow
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
class MockSource(Source):
|
|
@@ -39,7 +40,7 @@ class MockSource(Source):
|
|
|
39
40
|
url="http://test.com/match1",
|
|
40
41
|
).add_file(
|
|
41
42
|
data_feed_key="test",
|
|
42
|
-
last_modified=
|
|
43
|
+
last_modified=utcnow(),
|
|
43
44
|
json_content={"blaat": "piet"},
|
|
44
45
|
)
|
|
45
46
|
|
|
@@ -75,7 +76,7 @@ class MockSourceWithDiscoverSelectors(Source):
|
|
|
75
76
|
url="http://test.com/match1",
|
|
76
77
|
).add_file(
|
|
77
78
|
data_feed_key="test",
|
|
78
|
-
last_modified=
|
|
79
|
+
last_modified=utcnow(),
|
|
79
80
|
json_content={"competition_id": 11},
|
|
80
81
|
)
|
|
81
82
|
elif competition_id == 22:
|
|
@@ -91,7 +92,7 @@ class MockSourceWithDiscoverSelectors(Source):
|
|
|
91
92
|
url="http://test.com/match2",
|
|
92
93
|
).add_file(
|
|
93
94
|
data_feed_key="test",
|
|
94
|
-
last_modified=
|
|
95
|
+
last_modified=utcnow(),
|
|
95
96
|
json_content={"competition_id": 22},
|
|
96
97
|
)
|
|
97
98
|
|
|
@@ -106,10 +107,8 @@ class MockSourceWithDiscoverSelectors(Source):
|
|
|
106
107
|
]
|
|
107
108
|
|
|
108
109
|
|
|
109
|
-
def test_iter_datasets_basic_auto_ingest(
|
|
110
|
+
def test_iter_datasets_basic_auto_ingest(engine):
|
|
110
111
|
"""Test basic auto-ingest functionality."""
|
|
111
|
-
engine = get_engine(config_file)
|
|
112
|
-
|
|
113
112
|
# Add a simple ingestion plan
|
|
114
113
|
mock_source = MockSource(name="test_source")
|
|
115
114
|
data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
|
|
@@ -141,20 +140,16 @@ def test_iter_datasets_basic_auto_ingest(config_file):
|
|
|
141
140
|
assert datasets[0].identifier["competition_id"] == 11
|
|
142
141
|
|
|
143
142
|
|
|
144
|
-
def test_iter_datasets_auto_ingest_disabled(
|
|
143
|
+
def test_iter_datasets_auto_ingest_disabled(engine):
|
|
145
144
|
"""Test that auto_ingest=False returns only existing datasets."""
|
|
146
|
-
engine = get_engine(config_file)
|
|
147
|
-
|
|
148
145
|
# Should only return existing datasets (none in empty store)
|
|
149
146
|
datasets = list(engine.iter_datasets(competition_id=11, auto_ingest=False))
|
|
150
147
|
|
|
151
148
|
assert len(datasets) == 0
|
|
152
149
|
|
|
153
150
|
|
|
154
|
-
def test_iter_datasets_outside_config_scope(
|
|
151
|
+
def test_iter_datasets_outside_config_scope(engine):
|
|
155
152
|
"""Test that requests outside IngestionPlan scope return nothing."""
|
|
156
|
-
engine = get_engine(config_file)
|
|
157
|
-
|
|
158
153
|
# Add plan only for competition_id=11
|
|
159
154
|
mock_source = MockSource(name="test_source")
|
|
160
155
|
data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
|
|
@@ -180,10 +175,8 @@ def test_iter_datasets_outside_config_scope(config_file):
|
|
|
180
175
|
assert len(datasets) == 0
|
|
181
176
|
|
|
182
177
|
|
|
183
|
-
def test_iter_datasets_discover_selectors_with_filters(
|
|
178
|
+
def test_iter_datasets_discover_selectors_with_filters(engine):
|
|
184
179
|
"""Test that selector_filters are applied after discover_selectors runs."""
|
|
185
|
-
engine = get_engine(config_file)
|
|
186
|
-
|
|
187
180
|
# Create an IngestionPlan with empty selector - this will trigger discover_selectors
|
|
188
181
|
mock_source = MockSourceWithDiscoverSelectors(name="test_source_discover")
|
|
189
182
|
data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
|
|
@@ -216,10 +209,8 @@ def test_iter_datasets_discover_selectors_with_filters(config_file):
|
|
|
216
209
|
assert datasets[0].name == "Mock match comp 11"
|
|
217
210
|
|
|
218
211
|
|
|
219
|
-
def test_iter_datasets_discover_selectors_multiple_matches(
|
|
212
|
+
def test_iter_datasets_discover_selectors_multiple_matches(engine):
|
|
220
213
|
"""Test that multiple discovered selectors can match the filters."""
|
|
221
|
-
engine = get_engine(config_file)
|
|
222
|
-
|
|
223
214
|
# Create an IngestionPlan with empty selector - this will trigger discover_selectors
|
|
224
215
|
mock_source = MockSourceWithDiscoverSelectors(name="test_source_discover")
|
|
225
216
|
data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
|
|
@@ -248,12 +239,10 @@ def test_iter_datasets_discover_selectors_multiple_matches(config_file):
|
|
|
248
239
|
assert competition_ids == {11, 22}
|
|
249
240
|
|
|
250
241
|
|
|
251
|
-
def test_selector_filters_make_discovered_selectors_more_strict(
|
|
242
|
+
def test_selector_filters_make_discovered_selectors_more_strict(engine):
|
|
252
243
|
"""Test that when selector_filters are more strict than discovered selectors, we make the selectors more strict."""
|
|
253
244
|
from unittest.mock import Mock
|
|
254
245
|
|
|
255
|
-
engine = get_engine(config_file)
|
|
256
|
-
|
|
257
246
|
# Create a source that returns multiple matches per season
|
|
258
247
|
class MockSourceMultipleMatches(Source):
|
|
259
248
|
@property
|
|
@@ -291,7 +280,7 @@ def test_selector_filters_make_discovered_selectors_more_strict(config_file):
|
|
|
291
280
|
url=f"http://test.com/match{mid}",
|
|
292
281
|
).add_file(
|
|
293
282
|
data_feed_key="test",
|
|
294
|
-
last_modified=
|
|
283
|
+
last_modified=utcnow(),
|
|
295
284
|
json_content={"match_id": mid},
|
|
296
285
|
)
|
|
297
286
|
return []
|
|
@@ -348,13 +337,11 @@ def test_selector_filters_make_discovered_selectors_more_strict(config_file):
|
|
|
348
337
|
# Without this optimization, we'd call with match_id=None and fetch 3 matches instead of 1
|
|
349
338
|
|
|
350
339
|
|
|
351
|
-
def test_iter_datasets_with_open_data_auto_discovery(
|
|
340
|
+
def test_iter_datasets_with_open_data_auto_discovery(engine):
|
|
352
341
|
"""Test that use_open_data=True auto-discovers open data sources without configuration."""
|
|
353
342
|
from unittest.mock import Mock
|
|
354
343
|
from ingestify.application import loader
|
|
355
344
|
|
|
356
|
-
engine = get_engine(config_file)
|
|
357
|
-
|
|
358
345
|
# Create mock source class that inherits from Source
|
|
359
346
|
class MockOpenDataSource(Source):
|
|
360
347
|
def __init__(self, name):
|
|
@@ -387,7 +374,7 @@ def test_iter_datasets_with_open_data_auto_discovery(config_file):
|
|
|
387
374
|
url="http://open-data.com/match123",
|
|
388
375
|
).add_file(
|
|
389
376
|
data_feed_key="test",
|
|
390
|
-
last_modified=
|
|
377
|
+
last_modified=utcnow(),
|
|
391
378
|
json_content={"match_id": 123},
|
|
392
379
|
)
|
|
393
380
|
|
|
@@ -25,7 +25,8 @@ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
|
|
|
25
25
|
from ingestify.domain.models.fetch_policy import FetchPolicy
|
|
26
26
|
from ingestify.domain.models.task.task_summary import TaskState
|
|
27
27
|
from ingestify.infra.serialization import serialize, deserialize
|
|
28
|
-
from ingestify.main import
|
|
28
|
+
from ingestify.main import get_dev_engine
|
|
29
|
+
from ingestify.utils import utcnow
|
|
29
30
|
|
|
30
31
|
|
|
31
32
|
def add_ingestion_plan(engine: IngestionEngine, source: Source, **selector):
|
|
@@ -78,7 +79,7 @@ class SimpleFakeSource(Source):
|
|
|
78
79
|
season_id,
|
|
79
80
|
**kwargs,
|
|
80
81
|
):
|
|
81
|
-
last_modified =
|
|
82
|
+
last_modified = utcnow()
|
|
82
83
|
|
|
83
84
|
yield (
|
|
84
85
|
DatasetResource(
|
|
@@ -273,9 +274,7 @@ class NoFilesSource(Source):
|
|
|
273
274
|
)
|
|
274
275
|
|
|
275
276
|
|
|
276
|
-
def test_engine(
|
|
277
|
-
engine = get_engine(config_file, "main")
|
|
278
|
-
|
|
277
|
+
def test_engine(engine):
|
|
279
278
|
add_ingestion_plan(
|
|
280
279
|
engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=2
|
|
281
280
|
)
|
|
@@ -293,6 +292,7 @@ def test_engine(config_file):
|
|
|
293
292
|
|
|
294
293
|
dataset = datasets.first()
|
|
295
294
|
assert dataset.identifier == Identifier(competition_id=1, season_id=2, match_id=1)
|
|
295
|
+
|
|
296
296
|
assert len(dataset.revisions) == 2
|
|
297
297
|
assert len(dataset.revisions[0].modified_files) == 3
|
|
298
298
|
assert len(dataset.revisions[1].modified_files) == 1
|
|
@@ -325,13 +325,11 @@ def test_engine(config_file):
|
|
|
325
325
|
assert dataset.last_modified_at is not None
|
|
326
326
|
|
|
327
327
|
|
|
328
|
-
def test_iterator_source(
|
|
328
|
+
def test_iterator_source(engine):
|
|
329
329
|
"""Test when a Source returns a Iterator to do Batch processing.
|
|
330
330
|
|
|
331
331
|
Every batch must be executed right away.
|
|
332
332
|
"""
|
|
333
|
-
engine = get_engine(config_file, "main")
|
|
334
|
-
|
|
335
333
|
batch_source = None
|
|
336
334
|
|
|
337
335
|
def callback(idx):
|
|
@@ -339,7 +337,7 @@ def test_iterator_source(config_file):
|
|
|
339
337
|
datasets = engine.store.get_dataset_collection()
|
|
340
338
|
assert len(datasets) == idx
|
|
341
339
|
|
|
342
|
-
if idx ==
|
|
340
|
+
if idx == 100:
|
|
343
341
|
batch_source.should_stop = True
|
|
344
342
|
|
|
345
343
|
batch_source = BatchSource("fake-source", callback)
|
|
@@ -348,7 +346,7 @@ def test_iterator_source(config_file):
|
|
|
348
346
|
engine.load()
|
|
349
347
|
|
|
350
348
|
datasets = engine.store.get_dataset_collection()
|
|
351
|
-
assert len(datasets) ==
|
|
349
|
+
assert len(datasets) == 100
|
|
352
350
|
for dataset in datasets:
|
|
353
351
|
assert len(dataset.revisions) == 1
|
|
354
352
|
|
|
@@ -357,14 +355,14 @@ def test_iterator_source(config_file):
|
|
|
357
355
|
batch_source.should_stop = False
|
|
358
356
|
|
|
359
357
|
def callback(idx):
|
|
360
|
-
if idx ==
|
|
358
|
+
if idx == 100:
|
|
361
359
|
batch_source.should_stop = True
|
|
362
360
|
|
|
363
361
|
batch_source.callback = callback
|
|
364
362
|
|
|
365
363
|
engine.load()
|
|
366
364
|
datasets = engine.store.get_dataset_collection()
|
|
367
|
-
assert len(datasets) ==
|
|
365
|
+
assert len(datasets) == 100
|
|
368
366
|
for dataset in datasets:
|
|
369
367
|
assert len(dataset.revisions) == 2
|
|
370
368
|
|
|
@@ -373,9 +371,7 @@ def test_iterator_source(config_file):
|
|
|
373
371
|
deserialize(s)
|
|
374
372
|
|
|
375
373
|
|
|
376
|
-
def test_ingestion_plan_failing_task(
|
|
377
|
-
engine = get_engine(config_file, "main")
|
|
378
|
-
|
|
374
|
+
def test_ingestion_plan_failing_task(engine):
|
|
379
375
|
source = FailingLoadSource("fake-source")
|
|
380
376
|
|
|
381
377
|
add_ingestion_plan(engine, source, competition_id=1, season_id=2)
|
|
@@ -387,9 +383,7 @@ def test_ingestion_plan_failing_task(config_file):
|
|
|
387
383
|
assert items[0].task_summaries[0].state == TaskState.FAILED
|
|
388
384
|
|
|
389
385
|
|
|
390
|
-
def test_ingestion_plan_failing_job(
|
|
391
|
-
engine = get_engine(config_file, "main")
|
|
392
|
-
|
|
386
|
+
def test_ingestion_plan_failing_job(engine):
|
|
393
387
|
source = FailingJobSource("fake-source")
|
|
394
388
|
|
|
395
389
|
add_ingestion_plan(engine, source, competition_id=1, season_id=2)
|
|
@@ -412,9 +406,7 @@ def test_change_partition_key_transformer():
|
|
|
412
406
|
"""
|
|
413
407
|
|
|
414
408
|
|
|
415
|
-
def test_serde(
|
|
416
|
-
engine = get_engine(config_file, "main")
|
|
417
|
-
|
|
409
|
+
def test_serde(engine):
|
|
418
410
|
add_ingestion_plan(
|
|
419
411
|
engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=2
|
|
420
412
|
)
|
|
@@ -434,10 +426,8 @@ def test_serde(config_file):
|
|
|
434
426
|
assert event.model_dump_json() == deserialized_event.model_dump_json()
|
|
435
427
|
|
|
436
428
|
|
|
437
|
-
def test_empty_dataset_resource_id(
|
|
429
|
+
def test_empty_dataset_resource_id(engine):
|
|
438
430
|
"""When a empty DatasetResourceId is passed nothing should break"""
|
|
439
|
-
engine = get_engine(config_file, "main")
|
|
440
|
-
|
|
441
431
|
add_ingestion_plan(engine, EmptyDatasetResourceIdSource("fake-source"))
|
|
442
432
|
engine.load()
|
|
443
433
|
|
|
@@ -509,9 +499,8 @@ class SourceWithHook(Source):
|
|
|
509
499
|
)
|
|
510
500
|
|
|
511
501
|
|
|
512
|
-
def test_post_load_files_hook(
|
|
502
|
+
def test_post_load_files_hook(engine):
|
|
513
503
|
"""Test that post_load_files hook changes state from SCHEDULED to COMPLETE when content is not empty."""
|
|
514
|
-
engine = get_engine(config_file, "main")
|
|
515
504
|
add_ingestion_plan(engine, SourceWithHook("test"), competition_id=1, season_id=2)
|
|
516
505
|
|
|
517
506
|
# First run: file contains '{}', state should remain SCHEDULED
|
|
@@ -525,10 +514,8 @@ def test_post_load_files_hook(config_file):
|
|
|
525
514
|
assert dataset2.state == DatasetState.COMPLETE
|
|
526
515
|
|
|
527
516
|
|
|
528
|
-
def test_force_save_creates_revision(
|
|
517
|
+
def test_force_save_creates_revision(engine):
|
|
529
518
|
"""Test that datasets get a revision even when no files are persisted."""
|
|
530
|
-
engine = get_engine(config_file, "main")
|
|
531
|
-
|
|
532
519
|
# Create one dataset with files and one without
|
|
533
520
|
add_ingestion_plan(
|
|
534
521
|
engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=2
|
|
@@ -552,7 +539,9 @@ def test_force_save_creates_revision(config_file):
|
|
|
552
539
|
season_id=2
|
|
553
540
|
).first()
|
|
554
541
|
|
|
555
|
-
dataset_without_files = engine.store.get_dataset_collection(
|
|
542
|
+
dataset_without_files = engine.store.get_dataset_collection(
|
|
543
|
+
season_id=2, metadata_only=True
|
|
544
|
+
)
|
|
556
545
|
assert (
|
|
557
546
|
dataset_without_files.metadata.last_modified
|
|
558
547
|
== dataset_with_last_modified.last_modified_at
|
|
@@ -8,10 +8,9 @@ from ingestify.domain import Dataset, Identifier, Revision, File
|
|
|
8
8
|
from ingestify.domain.models.dataset.revision import RevisionSource, SourceType
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def test_file_cache(
|
|
11
|
+
def test_file_cache(engine):
|
|
12
12
|
"""Test file caching with the with_file_cache context manager."""
|
|
13
13
|
# Get engine from the fixture
|
|
14
|
-
engine = get_engine(config_file, "main")
|
|
15
14
|
store = engine.store
|
|
16
15
|
|
|
17
16
|
# Create a timestamp for test data
|
|
@@ -6,10 +6,9 @@ from ingestify.domain import Dataset, Identifier, DatasetState
|
|
|
6
6
|
from ingestify.main import get_engine
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
def test_iter_dataset_collection_batches(
|
|
9
|
+
def test_iter_dataset_collection_batches(engine):
|
|
10
10
|
"""Test iteration over datasets with batches using iter_dataset_collection_batches."""
|
|
11
11
|
# Get engine from the fixture
|
|
12
|
-
engine = get_engine(config_file, "main")
|
|
13
12
|
store = engine.store
|
|
14
13
|
bucket = store.bucket
|
|
15
14
|
|
|
@@ -81,10 +80,9 @@ def test_iter_dataset_collection_batches(config_file):
|
|
|
81
80
|
assert filtered_dataset_ids[0] == "dataset-5"
|
|
82
81
|
|
|
83
82
|
|
|
84
|
-
def test_dataset_state_filter(
|
|
83
|
+
def test_dataset_state_filter(engine):
|
|
85
84
|
"""Test filtering datasets by state."""
|
|
86
85
|
# Get engine from the fixture
|
|
87
|
-
engine = get_engine(config_file, "main")
|
|
88
86
|
store = engine.store
|
|
89
87
|
bucket = store.bucket
|
|
90
88
|
|
|
@@ -4,7 +4,7 @@ from unittest.mock import patch
|
|
|
4
4
|
from ingestify.main import get_engine
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
def test_store_version_tracking_new_store(config_file):
|
|
7
|
+
def test_store_version_tracking_new_store(config_file, db_cleanup):
|
|
8
8
|
"""Test that a new store gets initialized with the current version."""
|
|
9
9
|
with patch("ingestify.__version__", "1.0.0"):
|
|
10
10
|
engine = get_engine(config_file)
|
|
@@ -13,8 +13,10 @@ def test_store_version_tracking_new_store(config_file):
|
|
|
13
13
|
stored_version = engine.store.dataset_repository.get_store_version()
|
|
14
14
|
assert stored_version == "1.0.0"
|
|
15
15
|
|
|
16
|
+
db_cleanup(engine)
|
|
16
17
|
|
|
17
|
-
|
|
18
|
+
|
|
19
|
+
def test_store_version_tracking_existing_store_same_version(config_file, db_cleanup):
|
|
18
20
|
"""Test that an existing store with same version doesn't cause issues."""
|
|
19
21
|
with patch("ingestify.__version__", "1.0.0"):
|
|
20
22
|
# Initialize store first time
|
|
@@ -29,16 +31,20 @@ def test_store_version_tracking_existing_store_same_version(config_file):
|
|
|
29
31
|
stored_version = store2.dataset_repository.get_store_version()
|
|
30
32
|
assert stored_version == "1.0.0"
|
|
31
33
|
|
|
34
|
+
db_cleanup(engine1)
|
|
35
|
+
|
|
32
36
|
|
|
33
|
-
def test_store_version_tracking_version_mismatch(config_file, caplog):
|
|
37
|
+
def test_store_version_tracking_version_mismatch(config_file, caplog, db_cleanup):
|
|
34
38
|
"""Test that version mismatch is logged as warning."""
|
|
35
|
-
#
|
|
36
|
-
|
|
39
|
+
# Use engine as fixture as this cleans up the database
|
|
40
|
+
|
|
41
|
+
# Initialize store with version 1.0.1
|
|
42
|
+
with patch("ingestify.__version__", "1.0.1"):
|
|
37
43
|
engine1 = get_engine(config_file)
|
|
38
44
|
store1 = engine1.store
|
|
39
45
|
|
|
40
46
|
stored_version = store1.dataset_repository.get_store_version()
|
|
41
|
-
assert stored_version == "1.0.
|
|
47
|
+
assert stored_version == "1.0.1"
|
|
42
48
|
|
|
43
49
|
# Open store with different version
|
|
44
50
|
with patch("ingestify.__version__", "2.0.0"):
|
|
@@ -47,16 +53,17 @@ def test_store_version_tracking_version_mismatch(config_file, caplog):
|
|
|
47
53
|
|
|
48
54
|
# Version should still be the original one
|
|
49
55
|
stored_version = store2.dataset_repository.get_store_version()
|
|
50
|
-
assert stored_version == "1.0.
|
|
56
|
+
assert stored_version == "1.0.1"
|
|
51
57
|
|
|
52
58
|
# Should have logged a warning about version mismatch
|
|
53
59
|
assert "Store version mismatch" in caplog.text
|
|
54
|
-
assert "stored=1.0.
|
|
60
|
+
assert "stored=1.0.1, current=2.0.0" in caplog.text
|
|
61
|
+
|
|
62
|
+
db_cleanup(engine1)
|
|
55
63
|
|
|
56
64
|
|
|
57
|
-
def test_store_version_methods(
|
|
65
|
+
def test_store_version_methods(engine):
|
|
58
66
|
"""Test the repository version methods directly."""
|
|
59
|
-
engine = get_engine(config_file)
|
|
60
67
|
repo = engine.store.dataset_repository
|
|
61
68
|
|
|
62
69
|
from ingestify import __version__
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
import tempfile
|
|
2
|
-
|
|
3
|
-
import pytest
|
|
4
|
-
import os
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
@pytest.fixture(scope="function", autouse=True)
|
|
8
|
-
def datastore_dir():
|
|
9
|
-
with tempfile.TemporaryDirectory() as tmpdirname:
|
|
10
|
-
os.environ["TEST_DIR"] = tmpdirname
|
|
11
|
-
os.environ["INGESTIFY_RUN_EAGER"] = "true"
|
|
12
|
-
yield tmpdirname
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
@pytest.fixture(scope="session")
|
|
16
|
-
def config_file():
|
|
17
|
-
return os.path.abspath(os.path.dirname(__file__) + "/config.yaml")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/data_spec_version_collection.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/services/identifier_key_transformer.py
RENAMED
|
File without changes
|
|
File without changes
|
{ingestify-0.9.4 → ingestify-0.10.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|