ingestify 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/domain/models/dataset/dataset_state.py +1 -0
- ingestify/domain/models/ingestion/ingestion_job.py +5 -1
- ingestify/domain/models/resources/dataset_resource.py +13 -1
- ingestify/infra/fetch/http.py +1 -3
- ingestify/infra/store/dataset/sqlalchemy/repository.py +90 -50
- ingestify/infra/store/dataset/sqlalchemy/tables.py +191 -174
- ingestify/main.py +34 -5
- ingestify/tests/__init__.py +0 -0
- ingestify/tests/conftest.py +17 -0
- ingestify/tests/test_auto_ingest.py +418 -0
- ingestify/tests/test_engine.py +501 -0
- ingestify/tests/test_events.py +201 -0
- ingestify/tests/test_file_cache.py +98 -0
- ingestify/tests/test_pagination.py +162 -0
- ingestify/tests/test_store_version.py +73 -0
- ingestify/tests/test_table_prefix.py +78 -0
- {ingestify-0.8.0.dist-info → ingestify-0.9.1.dist-info}/METADATA +11 -3
- {ingestify-0.8.0.dist-info → ingestify-0.9.1.dist-info}/RECORD +22 -13
- {ingestify-0.8.0.dist-info → ingestify-0.9.1.dist-info}/WHEEL +1 -1
- {ingestify-0.8.0.dist-info → ingestify-0.9.1.dist-info}/entry_points.txt +0 -0
- {ingestify-0.8.0.dist-info → ingestify-0.9.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,418 @@
|
|
|
1
|
+
import datetime
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
from unittest.mock import MagicMock
|
|
5
|
+
|
|
6
|
+
from ingestify.main import get_engine
|
|
7
|
+
from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
|
|
8
|
+
from ingestify.domain.models.fetch_policy import FetchPolicy
|
|
9
|
+
from ingestify.domain import Selector, DataSpecVersionCollection
|
|
10
|
+
from ingestify import Source, DatasetResource
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MockSource(Source):
    """Minimal ``Source`` stub serving one mock match for competition 11."""

    @property
    def provider(self) -> str:
        """Provider name under which the mock datasets are reported."""
        return "test_provider"

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions: DataSpecVersionCollection,
        dataset_collection_metadata,
        competition_id: int,
        **kwargs,
    ):
        """Yield a single mock match when ``competition_id`` is 11.

        For any other ``competition_id`` the generator yields nothing.
        """
        if competition_id == 11:
            resource = DatasetResource(
                dataset_resource_id={
                    "competition_id": 11,
                    "season_id": 90,
                    "match_id": 1,
                },
                name="Mock match",
                dataset_type="match",
                provider=self.provider,
                url="http://test.com/match1",
            )
            # Attach one JSON file to the resource and hand it to the caller.
            yield resource.add_file(
                data_feed_key="test",
                last_modified=datetime.datetime.now(),
                json_content={"blaat": "piet"},
            )

        # Kept as-is: in a generator this only sets the StopIteration value.
        return []
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class MockSourceWithDiscoverSelectors(Source):
    """``Source`` stub that also implements ``discover_selectors``.

    It advertises three selectors (competitions 11, 22 and 33) but only has
    mock data for competitions 11 and 22.
    """

    @property
    def provider(self) -> str:
        """Provider name under which the mock datasets are reported."""
        return "test_provider_discover"

    def _mock_match(self, competition_id, season_id, match_id):
        """Build the mock match resource, including its single JSON file."""
        resource = DatasetResource(
            dataset_resource_id={
                "competition_id": competition_id,
                "season_id": season_id,
                "match_id": match_id,
            },
            name=f"Mock match comp {competition_id}",
            dataset_type="match",
            provider=self.provider,
            url=f"http://test.com/match{match_id}",
        )
        return resource.add_file(
            data_feed_key="test",
            last_modified=datetime.datetime.now(),
            json_content={"competition_id": competition_id},
        )

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions: DataSpecVersionCollection,
        dataset_collection_metadata,
        competition_id: int,
        **kwargs,
    ):
        """Yield one mock match for competition 11 or 22, nothing otherwise."""
        if competition_id == 11:
            yield self._mock_match(11, 90, 1)
        elif competition_id == 22:
            yield self._mock_match(22, 91, 2)

        # Kept as-is: in a generator this only sets the StopIteration value.
        return []

    def discover_selectors(self, dataset_type: str):
        """Return the three competition/season selectors this source knows."""
        known_seasons = ((11, 90), (22, 91), (33, 92))
        return [
            {"competition_id": competition, "season_id": season}
            for competition, season in known_seasons
        ]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def test_iter_datasets_basic_auto_ingest(config_file):
    """With a matching plan registered, auto-ingest yields the mock dataset."""
    engine = get_engine(config_file)

    # Register a plan whose only selector targets competition 11.
    source = MockSource(name="test_source")
    versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
    competition_selector = Selector.build(
        {"competition_id": 11}, data_spec_versions=versions
    )
    engine.add_ingestion_plan(
        IngestionPlan(
            source=source,
            fetch_policy=FetchPolicy(),
            selectors=[competition_selector],
            dataset_type="match",
            data_spec_versions=versions,
        )
    )

    # Ask for data matching the plan with auto-ingest switched on.
    found = list(
        engine.iter_datasets(
            provider="test_provider",
            dataset_type="match",
            competition_id=11,
            auto_ingest=True,
        )
    )

    assert len(found) > 0
    first = found[0]
    assert first.identifier["competition_id"] == 11
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def test_iter_datasets_auto_ingest_disabled(config_file):
    """With ``auto_ingest=False`` nothing is returned when no plan was run."""
    engine = get_engine(config_file)

    # No ingestion has been triggered here, so the query is expected to come
    # back empty.
    query = engine.iter_datasets(competition_id=11, auto_ingest=False)
    found = list(query)

    assert len(found) == 0
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def test_iter_datasets_outside_config_scope(config_file):
    """A request not covered by any IngestionPlan selector yields nothing."""
    engine = get_engine(config_file)

    # The only registered plan is limited to competition_id=11.
    source = MockSource(name="test_source")
    versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
    competition_selector = Selector.build(
        {"competition_id": 11}, data_spec_versions=versions
    )
    engine.add_ingestion_plan(
        IngestionPlan(
            source=source,
            fetch_policy=FetchPolicy(),
            selectors=[competition_selector],
            dataset_type="match",
            data_spec_versions=versions,
        )
    )

    # competition_id=999 is not part of the plan.
    query = engine.iter_datasets(competition_id=999, auto_ingest=True)
    found = list(query)

    assert len(found) == 0
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def test_iter_datasets_discover_selectors_with_filters(config_file):
    """Filters passed to ``iter_datasets`` narrow the discovered selectors."""
    engine = get_engine(config_file)

    # An empty selector makes the plan rely on the source's discover_selectors.
    source = MockSourceWithDiscoverSelectors(name="test_source_discover")
    versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
    empty_selector = Selector.build({}, data_spec_versions=versions)
    engine.add_ingestion_plan(
        IngestionPlan(
            source=source,
            fetch_policy=FetchPolicy(),
            selectors=[empty_selector],
            dataset_type="match",
            data_spec_versions=versions,
        )
    )

    # The source discovers competitions 11, 22 and 33; the competition_id
    # filter below must be applied on top of that discovery.
    found = list(
        engine.iter_datasets(
            provider="test_provider_discover",
            dataset_type="match",
            competition_id=11,
            auto_ingest=True,
        )
    )

    # Only the competition 11 dataset may come back.
    assert len(found) == 1
    (only,) = found
    assert only.identifier["competition_id"] == 11
    assert only.name == "Mock match comp 11"
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def test_iter_datasets_discover_selectors_multiple_matches(config_file):
    """Without extra filters, every discovered selector with data is ingested."""
    engine = get_engine(config_file)

    # An empty selector makes the plan rely on the source's discover_selectors.
    source = MockSourceWithDiscoverSelectors(name="test_source_discover")
    versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
    empty_selector = Selector.build({}, data_spec_versions=versions)
    engine.add_ingestion_plan(
        IngestionPlan(
            source=source,
            fetch_policy=FetchPolicy(),
            selectors=[empty_selector],
            dataset_type="match",
            data_spec_versions=versions,
        )
    )

    # No competition filter: all discovered selectors are in play.
    found = list(
        engine.iter_datasets(
            provider="test_provider_discover",
            dataset_type="match",
            auto_ingest=True,
        )
    )

    # The mock source has data for competitions 11 and 22 but none for 33.
    assert len(found) == 2
    seen_competitions = set()
    for dataset in found:
        seen_competitions.add(dataset.identifier["competition_id"])
    assert seen_competitions == {11, 22}
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def test_selector_filters_make_discovered_selectors_more_strict(config_file):
    """Stricter user filters are merged into the discovered selectors.

    The source only discovers a competition/season selector; the caller also
    passes ``match_id``. The test checks that ``find_datasets`` receives that
    ``match_id`` and that only the requested match is returned.
    """
    from unittest.mock import Mock

    engine = get_engine(config_file)

    class MockSourceMultipleMatches(Source):
        """Source with three matches in season 90 of competition 11."""

        @property
        def provider(self) -> str:
            return "test_multi_provider"

        def find_datasets(
            self,
            dataset_type,
            data_spec_versions,
            dataset_collection_metadata,
            **kwargs,
        ):
            competition_id = kwargs.get("competition_id")
            season_id = kwargs.get("season_id")
            match_id = kwargs.get("match_id")

            if competition_id == 11 and season_id == 90:
                for mid in [123, 124, 125]:
                    # Without a match_id every match is yielded; with one,
                    # only the matching entry is.
                    if match_id is None or mid == match_id:
                        resource = DatasetResource(
                            dataset_resource_id={
                                "competition_id": 11,
                                "season_id": 90,
                                "match_id": mid,
                            },
                            name=f"Match {mid}",
                            dataset_type="match",
                            provider=self.provider,
                            url=f"http://test.com/match{mid}",
                        )
                        yield resource.add_file(
                            data_feed_key="test",
                            last_modified=datetime.datetime.now(),
                            json_content={"match_id": mid},
                        )
            return []

        def discover_selectors(self, dataset_type):
            # Broad selector: competition + season only, no specific match.
            season_selector = {"competition_id": 11, "season_id": 90}
            return [season_selector]

    source = MockSourceMultipleMatches(name="multi_source")
    # Wrap the real generator method so the call arguments can be inspected.
    real_find_datasets = source.find_datasets
    source.find_datasets = Mock(side_effect=real_find_datasets)

    versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
    engine.add_ingestion_plan(
        IngestionPlan(
            source=source,
            fetch_policy=FetchPolicy(),
            selectors=[Selector.build({}, data_spec_versions=versions)],
            dataset_type="match",
            data_spec_versions=versions,
        )
    )

    # The request names a specific match, which is stricter than the
    # discovered competition/season selector.
    found = list(
        engine.iter_datasets(
            provider="test_multi_provider",
            dataset_type="match",
            competition_id=11,
            season_id=90,
            match_id=123,
            auto_ingest=True,
        )
    )

    # Only the requested match comes back, not the whole season.
    assert len(found) == 1
    assert found[0].identifier["match_id"] == 123

    # find_datasets must have been called once, with match_id included.
    assert source.find_datasets.call_count == 1
    _positional, passed_kwargs = source.find_datasets.call_args_list[0]

    assert passed_kwargs["competition_id"] == 11
    assert passed_kwargs["season_id"] == 90
    # match_id must be forwarded; otherwise all 3 matches would be fetched.
    assert passed_kwargs["match_id"] == 123
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def test_iter_datasets_with_open_data_auto_discovery(config_file):
    """Check ``auto_ingest={"use_open_data": True}`` with no plans configured.

    The ``"statsbomb"`` entry of ``loader.OPEN_DATA_SOURCES`` is temporarily
    replaced by a local mock ``Source`` subclass. The test then asserts that
    ``iter_datasets`` returns the dataset produced by that mock.
    """
    from unittest.mock import patch
    from ingestify.application import loader

    engine = get_engine(config_file)

    class MockOpenDataSource(Source):
        """Stand-in source class registered under the ``statsbomb`` key."""

        def __init__(self, name):
            super().__init__(name)

        @property
        def provider(self):
            return "statsbomb"

        def discover_selectors(self, dataset_type):
            return [{"competition_id": 11, "season_id": 90}]

        def find_datasets(
            self,
            dataset_type,
            data_spec_versions,
            dataset_collection_metadata,
            **kwargs,
        ):
            if kwargs.get("competition_id") == 11 and kwargs.get("season_id") == 90:
                yield DatasetResource(
                    dataset_resource_id={
                        "competition_id": 11,
                        "season_id": 90,
                        "match_id": 123,
                    },
                    name="Open data match",
                    dataset_type="match",
                    provider="statsbomb",
                    url="http://open-data.com/match123",
                ).add_file(
                    data_feed_key="test",
                    last_modified=datetime.datetime.now(),
                    json_content={"match_id": 123},
                )

    # patch.dict swaps the registry entry and restores the mapping on exit,
    # also when an assertion fails or when "statsbomb" was not registered
    # beforehand (the manual save/restore raised KeyError in that case).
    with patch.dict(loader.OPEN_DATA_SOURCES, {"statsbomb": MockOpenDataSource}):
        # No ingestion plans were added to the engine in this test.
        datasets = list(
            engine.iter_datasets(
                auto_ingest={"use_open_data": True},
                provider="statsbomb",
                dataset_type="match",
                competition_id=11,
                season_id=90,
            )
        )

        # The dataset produced by the mock open-data source must be found.
        assert len(datasets) > 0
        assert datasets[0].provider == "statsbomb"
        assert datasets[0].identifier["competition_id"] == 11
|