ingestify 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,418 @@
1
+ import datetime
2
+
3
+ import pytest
4
+ from unittest.mock import MagicMock
5
+
6
+ from ingestify.main import get_engine
7
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
8
+ from ingestify.domain.models.fetch_policy import FetchPolicy
9
+ from ingestify.domain import Selector, DataSpecVersionCollection
10
+ from ingestify import Source, DatasetResource
11
+
12
+
13
class MockSource(Source):
    """Simple mock source for basic testing.

    Serves exactly one hard-coded match dataset, and only when asked for
    ``competition_id == 11``; any other competition produces no datasets.
    """

    @property
    def provider(self) -> str:
        """Provider name attached to every dataset this source yields."""
        return "test_provider"

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions: DataSpecVersionCollection,
        dataset_collection_metadata,
        competition_id: int,
        **kwargs,
    ):
        """Yield the mock match resource for competition 11.

        This is a generator function. For any other ``competition_id`` it
        finishes without yielding, so callers simply see an empty iterator.
        ``dataset_type``, ``data_spec_versions``,
        ``dataset_collection_metadata`` and ``**kwargs`` are accepted but
        not used.
        """
        # Guard clause instead of a trailing ``return []``: a value returned
        # from a generator never reaches the code iterating over it.
        if competition_id != 11:
            return

        yield DatasetResource(
            dataset_resource_id={
                "competition_id": 11,
                "season_id": 90,
                "match_id": 1,
            },
            name="Mock match",
            dataset_type="match",
            provider=self.provider,
            url="http://test.com/match1",
        ).add_file(
            data_feed_key="test",
            last_modified=datetime.datetime.now(),
            json_content={"blaat": "piet"},
        )
47
+
48
+
49
class MockSourceWithDiscoverSelectors(Source):
    """Mock source that supports discover_selectors for testing.

    ``discover_selectors`` advertises competitions 11, 22 and 33, while
    ``find_datasets`` only has data for 11 and 22. Competition 33 is
    therefore a discovered selector that yields no datasets.
    """

    @property
    def provider(self) -> str:
        """Provider name attached to every dataset this source yields."""
        return "test_provider_discover"

    def _build_match(self, competition_id: int, season_id: int, match_id: int):
        """Build the single-file mock match resource for one competition."""
        return DatasetResource(
            dataset_resource_id={
                "competition_id": competition_id,
                "season_id": season_id,
                "match_id": match_id,
            },
            name=f"Mock match comp {competition_id}",
            dataset_type="match",
            provider=self.provider,
            url=f"http://test.com/match{match_id}",
        ).add_file(
            data_feed_key="test",
            last_modified=datetime.datetime.now(),
            json_content={"competition_id": competition_id},
        )

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions: DataSpecVersionCollection,
        dataset_collection_metadata,
        competition_id: int,
        **kwargs,
    ):
        """Yield the mock match for competition 11 or 22, nothing otherwise.

        This is a generator function; an unknown ``competition_id`` results
        in an empty iterator. The remaining parameters are accepted but not
        used.
        """
        # No trailing ``return []``: a value returned from a generator never
        # reaches the code iterating over it.
        if competition_id == 11:
            yield self._build_match(competition_id=11, season_id=90, match_id=1)
        elif competition_id == 22:
            yield self._build_match(competition_id=22, season_id=91, match_id=2)

    def discover_selectors(self, dataset_type: str):
        """Return multiple selectors that will be filtered by user criteria."""
        return [
            {"competition_id": 11, "season_id": 90},
            {"competition_id": 22, "season_id": 91},
            {"competition_id": 33, "season_id": 92},
        ]
107
+
108
+
109
def test_iter_datasets_basic_auto_ingest(config_file):
    """Auto-ingest with a matching plan returns a competition-11 dataset.

    Registers a plan for ``MockSource`` restricted to ``competition_id=11``
    and then iterates datasets for that competition with
    ``auto_ingest=True``.
    """
    engine = get_engine(config_file)

    source = MockSource(name="test_source")
    versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})

    # Register a plan whose single selector covers competition 11.
    engine.add_ingestion_plan(
        IngestionPlan(
            source=source,
            fetch_policy=FetchPolicy(),
            selectors=[
                Selector.build({"competition_id": 11}, data_spec_versions=versions)
            ],
            dataset_type="match",
            data_spec_versions=versions,
        )
    )

    # Request the same scope the plan covers.
    query = engine.iter_datasets(
        provider="test_provider",
        dataset_type="match",
        competition_id=11,
        auto_ingest=True,
    )
    found = list(query)

    assert found
    assert found[0].identifier["competition_id"] == 11
142
+
143
+
144
def test_iter_datasets_auto_ingest_disabled(config_file):
    """With ``auto_ingest=False`` and no plans added, nothing is returned.

    The test expects the store behind ``config_file`` to start out empty.
    """
    engine = get_engine(config_file)

    query = engine.iter_datasets(competition_id=11, auto_ingest=False)
    found = list(query)

    # No plan was registered and nothing was ingested beforehand.
    assert found == []
152
+
153
+
154
def test_iter_datasets_outside_config_scope(config_file):
    """A request that no IngestionPlan selector covers yields no datasets.

    The only plan registered is scoped to ``competition_id=11``; the query
    asks for ``competition_id=999``.
    """
    engine = get_engine(config_file)

    source = MockSource(name="test_source")
    versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})

    # The plan covers competition 11 and nothing else.
    engine.add_ingestion_plan(
        IngestionPlan(
            source=source,
            fetch_policy=FetchPolicy(),
            selectors=[
                Selector.build({"competition_id": 11}, data_spec_versions=versions)
            ],
            dataset_type="match",
            data_spec_versions=versions,
        )
    )

    # Competition 999 is not part of any plan.
    query = engine.iter_datasets(competition_id=999, auto_ingest=True)
    found = list(query)

    assert found == []
181
+
182
+
183
def test_iter_datasets_discover_selectors_with_filters(config_file):
    """Filters passed to ``iter_datasets`` narrow the discovered selectors.

    The plan uses an empty selector, so the selectors come from the
    source's ``discover_selectors`` (competitions 11, 22 and 33). Filtering
    on ``competition_id=11`` must leave exactly that competition's dataset.
    """
    engine = get_engine(config_file)

    source = MockSourceWithDiscoverSelectors(name="test_source_discover")
    versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})

    # Empty selector: the concrete selectors are left to discover_selectors.
    open_selector = Selector.build({}, data_spec_versions=versions)
    engine.add_ingestion_plan(
        IngestionPlan(
            source=source,
            fetch_policy=FetchPolicy(),
            selectors=[open_selector],
            dataset_type="match",
            data_spec_versions=versions,
        )
    )

    # The competition_id filter is expected to apply after discovery.
    query = engine.iter_datasets(
        provider="test_provider_discover",
        dataset_type="match",
        competition_id=11,
        auto_ingest=True,
    )
    found = list(query)

    # One dataset only: competitions 22 and 33 must have been filtered out.
    assert len(found) == 1
    only = found[0]
    assert only.identifier["competition_id"] == 11
    assert only.name == "Mock match comp 11"
217
+
218
+
219
def test_iter_datasets_discover_selectors_multiple_matches(config_file):
    """Without extra filters, every discovered selector with data is used.

    The source discovers competitions 11, 22 and 33 but only produces
    datasets for 11 and 22, so exactly those two are expected.
    """
    engine = get_engine(config_file)

    source = MockSourceWithDiscoverSelectors(name="test_source_discover")
    versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})

    # Empty selector: the concrete selectors are left to discover_selectors.
    open_selector = Selector.build({}, data_spec_versions=versions)
    engine.add_ingestion_plan(
        IngestionPlan(
            source=source,
            fetch_policy=FetchPolicy(),
            selectors=[open_selector],
            dataset_type="match",
            data_spec_versions=versions,
        )
    )

    # No competition filter is passed this time.
    query = engine.iter_datasets(
        provider="test_provider_discover", dataset_type="match", auto_ingest=True
    )
    found = list(query)

    # Competition 33 is discovered but has no mock data behind it.
    assert len(found) == 2
    seen_competitions = set()
    for dataset in found:
        seen_competitions.add(dataset.identifier["competition_id"])
    assert seen_competitions == {11, 22}
249
+
250
+
251
def test_selector_filters_make_discovered_selectors_more_strict(config_file):
    """Stricter user filters must tighten the discovered selector.

    The source discovers a broad selector (competition 11, season 90) that
    would cover three matches. When the caller also passes ``match_id=123``
    the test expects ``find_datasets`` to be invoked once, with ``match_id``
    included, so that only that match is produced.
    """
    from unittest.mock import Mock

    engine = get_engine(config_file)

    # Create a source that returns multiple matches per season
    class MockSourceMultipleMatches(Source):
        @property
        def provider(self) -> str:
            return "test_multi_provider"

        def find_datasets(
            self,
            dataset_type,
            data_spec_versions,
            dataset_collection_metadata,
            **kwargs,
        ):
            """Yield matches 123-125 of competition 11 / season 90.

            An optional ``match_id`` keyword narrows the output to that one
            match. Any other competition/season yields nothing.
            """
            competition_id = kwargs.get("competition_id")
            season_id = kwargs.get("season_id")
            match_id = kwargs.get("match_id")

            # Guard clause instead of a trailing ``return []``: a value
            # returned from a generator never reaches the iterating caller.
            if competition_id != 11 or season_id != 90:
                return

            # All matches in the season
            for mid in [123, 124, 125]:
                # Filter by match_id if specified, otherwise yield all
                if match_id is not None and mid != match_id:
                    continue

                yield DatasetResource(
                    dataset_resource_id={
                        "competition_id": 11,
                        "season_id": 90,
                        "match_id": mid,
                    },
                    name=f"Match {mid}",
                    dataset_type="match",
                    provider=self.provider,
                    url=f"http://test.com/match{mid}",
                ).add_file(
                    data_feed_key="test",
                    last_modified=datetime.datetime.now(),
                    json_content={"match_id": mid},
                )

        def discover_selectors(self, dataset_type):
            # Returns broad selector - just competition + season, no specific match
            return [
                {
                    "competition_id": 11,
                    "season_id": 90,
                },  # This would fetch ALL matches in season
            ]

    mock_source = MockSourceMultipleMatches(name="multi_source")
    # Wrap the real generator in a Mock so the call arguments can be inspected
    # while the original behaviour is kept via side_effect.
    original_find_datasets = mock_source.find_datasets
    mock_source.find_datasets = Mock(side_effect=original_find_datasets)

    data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})

    plan = IngestionPlan(
        source=mock_source,
        fetch_policy=FetchPolicy(),
        selectors=[Selector.build({}, data_spec_versions=data_spec_versions)],
        dataset_type="match",
        data_spec_versions=data_spec_versions,
    )
    engine.add_ingestion_plan(plan)

    # User requests specific match - more strict than discovered selector
    datasets = list(
        engine.iter_datasets(
            provider="test_multi_provider",
            dataset_type="match",
            competition_id=11,
            season_id=90,
            match_id=123,  # This should make the selector more strict
            auto_ingest=True,
        )
    )

    # Should get only the specific match, not all matches in the season
    assert len(datasets) == 1
    assert datasets[0].identifier["match_id"] == 123

    # Verify find_datasets was called with the strict parameters including match_id
    assert mock_source.find_datasets.call_count == 1
    # Only the keyword arguments matter here; positional ones are ignored.
    _, call_kwargs = mock_source.find_datasets.call_args_list[0]

    assert call_kwargs["competition_id"] == 11
    assert call_kwargs["season_id"] == 90
    assert call_kwargs["match_id"] == 123  # Should be added to avoid fetching all matches

    # Without this optimization, we'd call with match_id=None and fetch 3 matches instead of 1
349
+
350
+
351
def test_iter_datasets_with_open_data_auto_discovery(config_file):
    """Open-data sources are usable without any configured ingestion plan.

    The ``"statsbomb"`` entry of ``loader.OPEN_DATA_SOURCES`` is temporarily
    replaced by a mock source class, and ``iter_datasets`` is called with
    ``auto_ingest={"use_open_data": True}`` while no plan has been added to
    the engine.
    """
    from unittest.mock import patch

    from ingestify.application import loader

    engine = get_engine(config_file)

    # Create mock source class that inherits from Source
    class MockOpenDataSource(Source):
        @property
        def provider(self):
            return "statsbomb"

        def discover_selectors(self, dataset_type):
            return [{"competition_id": 11, "season_id": 90}]

        def find_datasets(
            self,
            dataset_type,
            data_spec_versions,
            dataset_collection_metadata,
            **kwargs,
        ):
            """Yield one match for competition 11 / season 90, else nothing."""
            if kwargs.get("competition_id") == 11 and kwargs.get("season_id") == 90:
                yield DatasetResource(
                    dataset_resource_id={
                        "competition_id": 11,
                        "season_id": 90,
                        "match_id": 123,
                    },
                    name="Open data match",
                    dataset_type="match",
                    provider="statsbomb",
                    url="http://open-data.com/match123",
                ).add_file(
                    data_feed_key="test",
                    last_modified=datetime.datetime.now(),
                    json_content={"match_id": 123},
                )

    # Swap the registry entry for the mock. patch.dict restores the registry
    # on exit even if an assertion fails, and also copes with the key not
    # having been present beforehand.
    with patch.dict(loader.OPEN_DATA_SOURCES, {"statsbomb": MockOpenDataSource}):
        # No ingestion plans configured - should still work with open data
        datasets = list(
            engine.iter_datasets(
                auto_ingest={"use_open_data": True},
                provider="statsbomb",
                dataset_type="match",
                competition_id=11,
                season_id=90,
            )
        )

        # Should find datasets from the auto-discovered open data source
        assert len(datasets) > 0
        assert datasets[0].provider == "statsbomb"
        assert datasets[0].identifier["competition_id"] == 11