ingestify 0.7.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,501 @@
1
+ from datetime import datetime
2
+
3
+ import pytz
4
+
5
+ from ingestify import Source, DatasetResource
6
+ from ingestify.application.ingestion_engine import IngestionEngine
7
+ from ingestify.domain import (
8
+ Identifier,
9
+ Selector,
10
+ DataSpecVersionCollection,
11
+ DraftFile,
12
+ Dataset,
13
+ DatasetState,
14
+ DatasetCreated,
15
+ )
16
+ from ingestify.domain.models.dataset.collection_metadata import (
17
+ DatasetCollectionMetadata,
18
+ )
19
+ from ingestify.domain.models.dataset.events import RevisionAdded
20
+ from ingestify.domain.models.ingestion.ingestion_job_summary import (
21
+ IngestionJobSummary,
22
+ IngestionJobState,
23
+ )
24
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
25
+ from ingestify.domain.models.fetch_policy import FetchPolicy
26
+ from ingestify.domain.models.task.task_summary import TaskState
27
+ from ingestify.infra.serialization import serialize, deserialize
28
+ from ingestify.main import get_engine, get_dev_engine
29
+
30
+
31
def add_ingestion_plan(engine: IngestionEngine, source: Source, **selector):
    """Register a single-selector ingestion plan for ``source`` on ``engine``."""
    spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})

    plan = IngestionPlan(
        source=source,
        fetch_policy=FetchPolicy(),
        selectors=[Selector.build(selector, data_spec_versions=spec_versions)],
        dataset_type="match",
        data_spec_versions=spec_versions,
    )
    engine.add_ingestion_plan(plan)
43
+
44
+
45
def file_loader(file_resource, current_file, some_extract_config=None):
    """Fake file loader used by the test sources.

    Returns deterministic content per file id. A non-None ``current_file``
    means a previous revision exists, which triggers changed content for
    file1 so the engine records a new revision. Unknown file ids yield None.
    """
    if some_extract_config is not None and some_extract_config != "test123":
        # Test loader_kwargs are passed correctly
        raise Exception(f"Incorrect value for this test value: {some_extract_config}")

    file_id = file_resource.file_id
    if file_id == "file1__v1":
        content = "different_content" if current_file else "content1"
        return DraftFile.from_input(content, data_feed_key="file1")

    if file_id == "file2__v1":
        resource_id = file_resource.dataset_resource.dataset_resource_id
        return DraftFile.from_input(
            f"some_content{resource_id}",
            data_feed_key="file2",
        )
67
+
68
+
69
class SimpleFakeSource(Source):
    """Source that always discovers a single match dataset with three files."""

    provider = "fake"

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions: DataSpecVersionCollection,
        dataset_collection_metadata: DatasetCollectionMetadata,
        competition_id,
        season_id,
        **kwargs,
    ):
        now = datetime.now(pytz.utc)

        resource = DatasetResource(
            dataset_resource_id=dict(
                competition_id=competition_id, season_id=season_id, match_id=1
            ),
            provider="fake",
            dataset_type="match",
            name="Test Dataset",
        )
        # file1/file2 are produced via a loader callback; file3 is inline JSON.
        resource = resource.add_file(
            last_modified=now,
            data_feed_key="file1",
            data_spec_version="v1",
            file_loader=file_loader,
            loader_kwargs={"some_extract_config": "test123"},
        )
        resource = resource.add_file(
            last_modified=now,
            data_feed_key="file2",
            data_spec_version="v1",
            file_loader=file_loader,
        )
        resource = resource.add_file(
            last_modified=now,
            data_feed_key="file3",
            data_spec_version="v1",
            json_content={"test": "some-content"},
        )
        yield resource
119
+
120
+
121
class EmptyDatasetResourceIdSource(Source):
    """Source that yields one dataset resource with an empty resource id."""

    provider = "fake"

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions: DataSpecVersionCollection,
        dataset_collection_metadata: DatasetCollectionMetadata,
        **kwargs,
    ):
        resource = DatasetResource(
            dataset_resource_id={},
            provider="fake",
            dataset_type="match",
            name="Test Dataset",
        ).add_file(
            last_modified=datetime.now(pytz.utc),
            data_feed_key="file3",
            data_spec_version="v1",
            json_content={"test": "some-content"},
        )
        yield resource
146
+
147
+
148
class BatchSource(Source):
    """Source that yields batches of 10 dataset resources until told to stop.

    After each batch has been yielded (and therefore processed by the
    consuming engine), ``callback`` is invoked with the number of resources
    produced so far, letting tests assert on intermediate state and flip
    ``should_stop`` from inside the callback.
    """

    provider = "batch"

    def __init__(self, name, callback):
        super().__init__(name)
        # Called with self.idx after every yielded batch; may be falsy to disable.
        self.callback = callback
        # Set to True (typically from the callback) to end iteration.
        self.should_stop = False
        # Running match_id counter, carried across batches.
        self.idx = 0

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions: DataSpecVersionCollection,
        dataset_collection_metadata: DatasetCollectionMetadata,
        competition_id,
        season_id,
        **kwargs,
    ):
        while not self.should_stop:
            items = []
            for _ in range(10):
                match_id = self.idx
                self.idx += 1

                last_modified = datetime.now(pytz.utc)
                dataset_resource = (
                    DatasetResource(
                        dataset_resource_id=dict(
                            competition_id=competition_id,
                            season_id=season_id,
                            match_id=match_id,
                        ),
                        name="Test dataset",
                        provider="fake",
                        dataset_type="match",
                    )
                    .add_file(
                        last_modified=last_modified,
                        data_feed_key="file1",
                        data_spec_version="v1",
                        file_loader=file_loader,
                    )
                    .add_file(
                        last_modified=last_modified,
                        data_feed_key="file2",
                        data_spec_version="v1",
                        file_loader=file_loader,
                    )
                )

                items.append(dataset_resource)
            yield items

            # Notify after the batch has been consumed by the engine. Use an
            # explicit `if` instead of the `x and x()` expression idiom.
            if self.callback:
                self.callback(self.idx)
202
+
203
+
204
class FailingLoadSource(Source):
    """Source whose file loader always raises, to exercise task-level failure."""

    provider = "fake"

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions: DataSpecVersionCollection,
        dataset_collection_metadata: DatasetCollectionMetadata,
        competition_id,
        season_id,
        **kwargs,
    ):
        def failing_loader(*args, **kwargs):
            raise Exception("This is a failing task")

        resource = DatasetResource(
            dataset_resource_id=dict(
                competition_id=competition_id, season_id=season_id, match_id=1
            ),
            provider="fake",
            dataset_type="match",
            name="Test Dataset",
        ).add_file(
            last_modified=datetime.now(pytz.utc),
            data_feed_key="file1",
            data_spec_version="v1",
            file_loader=failing_loader,
            loader_kwargs={"some_extract_config": "test123"},
        )
        yield resource
237
+
238
+
239
class FailingJobSource(Source):
    """Source whose discovery itself raises, to exercise job-level failure."""

    provider = "fake"

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions: DataSpecVersionCollection,
        dataset_collection_metadata: DatasetCollectionMetadata,
        competition_id,
        season_id,
        **kwargs,
    ):
        # Fail before yielding anything, so the whole ingestion job fails.
        raise Exception("some failure")
252
+
253
+
254
def test_engine(config_file):
    """End-to-end ingestion: revisions, multiple selectors and file loading."""
    engine = get_engine(config_file, "main")

    add_ingestion_plan(
        engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=2
    )
    engine.load()

    collection = engine.store.get_dataset_collection()
    assert len(collection) == 1

    expected_identifier = Identifier(competition_id=1, season_id=2, match_id=1)
    dataset = collection.first()
    assert dataset.identifier == expected_identifier
    assert len(dataset.revisions) == 1

    # A second run picks up changed file1 content and adds a revision.
    engine.load()
    collection = engine.store.get_dataset_collection()
    assert len(collection) == 1

    dataset = collection.first()
    assert dataset.identifier == expected_identifier
    assert len(dataset.revisions) == 2
    assert len(dataset.revisions[0].modified_files) == 3
    assert len(dataset.revisions[1].modified_files) == 1

    # Adding a second selector discovers a second dataset.
    add_ingestion_plan(
        engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=3
    )
    engine.load()

    assert len(engine.store.get_dataset_collection()) == 2

    collection = engine.store.get_dataset_collection(season_id=3)
    assert len(collection) == 1

    # Make sure everything still works with a fresh connection
    engine.store.dataset_repository.session_provider.reset()

    # TODO: reenable
    # items = list(engine.store.dataset_repository.session.query(IngestionJobSummary))
    # print(items)

    # Make sure we can load the files, both lazily and eagerly.
    for lazy in (True, False):
        files = engine.store.load_files(collection.first(), lazy=lazy)
        assert files.get_file("file1").stream.read() == b"content1"

    assert dataset.last_modified_at is not None
304
+
305
+
306
def test_iterator_source(config_file):
    """Test when a Source returns an Iterator to do batch processing.

    Every batch must be executed right away.
    """
    engine = get_engine(config_file, "main")

    batch_source = None

    def first_run_callback(idx):
        # Each batch must be persisted before the next one is produced.
        collection = engine.store.get_dataset_collection()
        assert len(collection) == idx

        if idx == 1000:
            batch_source.should_stop = True

    batch_source = BatchSource("fake-source", first_run_callback)

    add_ingestion_plan(engine, batch_source, competition_id=1, season_id=2)
    engine.load()

    collection = engine.store.get_dataset_collection()
    assert len(collection) == 1000
    for dataset in collection:
        assert len(dataset.revisions) == 1

    # Second run over the same resources: every dataset gains a revision.
    batch_source.idx = 0
    batch_source.should_stop = False

    def second_run_callback(idx):
        if idx == 1000:
            batch_source.should_stop = True

    batch_source.callback = second_run_callback

    engine.load()
    collection = engine.store.get_dataset_collection()
    assert len(collection) == 1000
    for dataset in collection:
        assert len(dataset.revisions) == 2

    # Bonus serialization check: a round trip simply must not raise.
    deserialize(serialize(DatasetCreated(dataset=collection.first())))
352
+
353
+
354
def test_ingestion_plan_failing_task(config_file):
    """A failing file loader marks the task FAILED while the job still finishes."""
    engine = get_engine(config_file, "main")

    add_ingestion_plan(
        engine, FailingLoadSource("fake-source"), competition_id=1, season_id=2
    )
    engine.load()

    summaries = engine.store.dataset_repository.load_ingestion_job_summaries()
    assert len(summaries) == 1

    job_summary = summaries[0]
    assert job_summary.state == IngestionJobState.FINISHED
    assert job_summary.task_summaries[0].state == TaskState.FAILED
366
+
367
+
368
def test_ingestion_plan_failing_job(config_file):
    """A failure inside find_datasets marks the whole ingestion job FAILED."""
    engine = get_engine(config_file, "main")

    add_ingestion_plan(
        engine, FailingJobSource("fake-source"), competition_id=1, season_id=2
    )
    engine.load()

    summaries = engine.store.dataset_repository.load_ingestion_job_summaries()
    assert len(summaries) == 1

    job_summary = summaries[0]
    assert job_summary.state == IngestionJobState.FAILED

    # The timing of the second task should contain the exception details.
    failure_result = job_summary.timings[1].metadata["result"]
    assert failure_result["message"] == "some failure"
    assert failure_result["type"] == "Exception"
383
+
384
+
385
def test_change_partition_key_transformer():
    """Placeholder: files written before a partition-key-transformer change
    must remain readable afterwards.

    This probably means reads have to go through the stored storage_path
    rather than recomputing the partition key.
    """
391
+
392
+
393
def test_serde(config_file):
    """Dataset events must survive a serialize/deserialize round trip."""
    engine = get_engine(config_file, "main")

    add_ingestion_plan(
        engine, SimpleFakeSource("fake-source"), competition_id=1, season_id=2
    )
    engine.load()

    dataset = engine.store.get_dataset_collection().first()

    for event_cls in (DatasetCreated, RevisionAdded):
        event = event_cls(dataset=dataset)

        event_dict = serialize(event)
        # Serialization must actually transform the event object.
        assert event != event_dict

        round_tripped = deserialize(event_dict)
        assert event.model_dump_json() == round_tripped.model_dump_json()
413
+
414
+
415
def test_empty_dataset_resource_id(config_file):
    """When an empty DatasetResourceId is passed nothing should break."""
    engine = get_engine(config_file, "main")
    add_ingestion_plan(engine, EmptyDatasetResourceIdSource("fake-source"))
    engine.load()
421
+
422
+
423
def test_dev_engine():
    """get_dev_engine builds a working engine without a config file."""
    engine = get_dev_engine(
        source=SimpleFakeSource("test-source"),
        dataset_type="match",
        data_spec_versions={"default": "v1"},
        ephemeral=True,
    )

    engine.run(competition_id=1, season_id=2)

    collection = engine.store.get_dataset_collection()
    assert len(collection) == 1
    assert collection.first().name == "Test Dataset"
439
+
440
+
441
def post_load_hook(dataset_resource: DatasetResource, files: dict[str, DraftFile]):
    """Mark the resource COMPLETE once any loaded file holds more than '{}'."""
    # A two-byte file is exactly "{}"; anything larger counts as real content.
    if any(loaded.size > 2 for loaded in files.values()):
        dataset_resource.state = DatasetState.COMPLETE
447
+
448
+
449
def file_loader_with_hook(file_resource, current_file):
    """Return empty JSON on the first run, real data once a file exists."""
    if current_file:
        content = '{"data": "value"}'
    else:
        content = "{}"
    return DraftFile.from_input(content, data_feed_key="file1")
453
+
454
+
455
class SourceWithHook(Source):
    """Source that registers a post_load_files hook on its dataset resource."""

    provider = "test"

    def find_datasets(
        self,
        dataset_type: str,
        data_spec_versions: DataSpecVersionCollection,
        dataset_collection_metadata,
        competition_id,
        season_id,
        **kwargs,
    ):
        resource = DatasetResource(
            dataset_resource_id=dict(
                competition_id=competition_id, season_id=season_id, match_id=1
            ),
            provider="test",
            dataset_type="match",
            name="Test Dataset",
            # Starts SCHEDULED; the hook may promote it after files load.
            state=DatasetState.SCHEDULED,
            post_load_files=post_load_hook,
        ).add_file(
            last_modified=datetime.now(pytz.utc),
            data_feed_key="file1",
            data_spec_version="v1",
            file_loader=file_loader_with_hook,
        )
        yield resource
486
+
487
+
488
def test_post_load_files_hook(config_file):
    """post_load_files flips state SCHEDULED -> COMPLETE once real content arrives."""
    engine = get_engine(config_file, "main")
    add_ingestion_plan(engine, SourceWithHook("test"), competition_id=1, season_id=2)

    # First run: the file contains '{}', so the hook leaves the state alone.
    engine.load()
    first = engine.store.get_dataset_collection().first()
    assert first.state == DatasetState.SCHEDULED

    # Second run: real data arrives, so the hook promotes the dataset.
    engine.load()
    second = engine.store.get_dataset_collection().first()
    assert second.state == DatasetState.COMPLETE
@@ -0,0 +1,201 @@
1
+ import datetime
2
+ from unittest.mock import Mock
3
+
4
+ from ingestify import DatasetResource
5
+ from ingestify.domain.models.dataset.events import SelectorSkipped, DatasetSkipped
6
+ from ingestify.domain.models.dataset.dataset import Dataset
7
+ from ingestify.domain.models.dataset.dataset_state import DatasetState
8
+ from ingestify.domain.models.ingestion.ingestion_job import IngestionJob
9
+ from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
10
+ from ingestify.domain.models.fetch_policy import FetchPolicy
11
+ from ingestify.domain import Selector, DataSpecVersionCollection, Identifier, Source
12
+ from ingestify.utils import TaskExecutor
13
+
14
+
15
class TestIngestionJobEventEmission:
    """Test that IngestionJob emits events at the right times."""

    def setup_method(self):
        """Set up a job wired to a mocked store, source and fetch policy."""
        self.mock_store = Mock()

        # A real Source subclass (not a bare Mock) so isinstance checks and
        # abstract-method resolution inside the job keep working; the actual
        # dataset discovery is delegated to an inner Mock we control per test.
        class MockTestSource(Source):
            def __init__(self, name):
                super().__init__(name)
                self._find_datasets_mock = Mock(return_value=iter([]))

            @property
            def provider(self):
                return "test_provider"

            def find_datasets(
                self,
                dataset_type,
                data_spec_versions,
                dataset_collection_metadata,
                **kwargs
            ):
                return self._find_datasets_mock(
                    dataset_type,
                    data_spec_versions,
                    dataset_collection_metadata,
                    **kwargs
                )

        self.mock_source = MockTestSource("test_source")

        data_spec_versions = DataSpecVersionCollection.from_dict({"default": {"v1"}})
        self.selector = Selector.build(
            {"competition_id": 11}, data_spec_versions=data_spec_versions
        )

        self.mock_fetch_policy = Mock(spec=FetchPolicy)

        self.ingestion_plan = IngestionPlan(
            source=self.mock_source,
            fetch_policy=self.mock_fetch_policy,
            selectors=[self.selector],
            dataset_type="match",
            data_spec_versions=data_spec_versions,
        )

        self.ingestion_job = IngestionJob(
            ingestion_job_id="test-job",
            ingestion_plan=self.ingestion_plan,
            selector=self.selector,
        )

    def test_selector_skipped_event_emitted_when_up_to_date(self):
        """Test that SelectorSkipped event is emitted when selector is up-to-date."""
        # Setup: selector has last_modified and metadata shows newer data
        self.selector._last_modified = datetime.datetime(2023, 1, 1)

        mock_metadata = Mock()
        mock_metadata.last_modified = datetime.datetime(
            2023, 1, 2
        )  # Newer than selector

        mock_collection = Mock()
        mock_collection.metadata = mock_metadata
        self.mock_store.get_dataset_collection.return_value = mock_collection

        # Execute; drain the generator so the job actually runs.
        task_executor = TaskExecutor(dry_run=True)
        list(self.ingestion_job.execute(self.mock_store, task_executor))

        # Verify SelectorSkipped event was dispatched
        self.mock_store.dispatch.assert_called_once()
        dispatched_event = self.mock_store.dispatch.call_args[0][0]
        assert isinstance(dispatched_event, SelectorSkipped)
        assert dispatched_event.selector == self.selector

    def test_dataset_skipped_event_emitted_when_should_refetch_false(self):
        """Test that DatasetSkipped event is emitted when should_refetch returns False."""
        # Setup: selector needs checking (no last_modified)
        self.selector._last_modified = None

        mock_metadata = Mock()
        mock_metadata.last_modified = None
        mock_collection = Mock()
        mock_collection.metadata = mock_metadata

        # Mock dataset exists and should not be refetched
        existing_dataset = Dataset(
            bucket="test",
            dataset_id="existing-id",
            name="Existing Dataset",
            state=DatasetState.COMPLETE,
            identifier=Identifier(competition_id=11, match_id=123),
            dataset_type="match",
            provider="test_provider",
            metadata={},
            created_at=datetime.datetime.now(),
            updated_at=datetime.datetime.now(),
            last_modified_at=None,
        )
        mock_collection.get.return_value = existing_dataset

        self.mock_store.get_dataset_collection.return_value = mock_collection

        # Mock dataset resource from find_datasets (DatasetResource is
        # imported at module level).
        dataset_resource = DatasetResource(
            dataset_resource_id={"competition_id": 11, "match_id": 123},
            name="Test Resource",
            dataset_type="match",
            provider="test_provider",
            url="http://test.com",
        )

        # Mock source returns one dataset resource in a batch
        self.mock_source._find_datasets_mock.return_value = iter([[dataset_resource]])

        # Mock fetch policy says don't refetch
        self.mock_fetch_policy.should_refetch.return_value = False

        # Execute; drain the generator so the job actually runs.
        task_executor = TaskExecutor(dry_run=True)
        list(self.ingestion_job.execute(self.mock_store, task_executor))

        # Verify DatasetSkipped event was dispatched
        assert self.mock_store.dispatch.call_count >= 1

        # Find the DatasetSkipped event among the dispatched calls
        dataset_skipped_calls = [
            call
            for call in self.mock_store.dispatch.call_args_list
            if isinstance(call[0][0], DatasetSkipped)
        ]
        assert len(dataset_skipped_calls) == 1

        dispatched_event = dataset_skipped_calls[0][0][0]
        assert dispatched_event.dataset == existing_dataset

    def test_no_events_emitted_when_tasks_created(self):
        """Test that no skipping events are emitted when actual tasks are created and executed."""
        # Setup: selector needs checking and dataset should be refetched
        self.selector._last_modified = None

        mock_metadata = Mock()
        mock_metadata.last_modified = None
        mock_collection = Mock()
        mock_collection.metadata = mock_metadata
        mock_collection.get.return_value = None  # No existing dataset

        self.mock_store.get_dataset_collection.return_value = mock_collection

        # Mock dataset resource from find_datasets
        dataset_resource = DatasetResource(
            dataset_resource_id={"competition_id": 11, "match_id": 123},
            name="Test Resource",
            dataset_type="match",
            provider="test_provider",
            url="http://test.com",
        )

        self.mock_source._find_datasets_mock.return_value = iter([[dataset_resource]])
        self.mock_fetch_policy.should_fetch.return_value = True

        # Execute with a dry-run executor whose inner executor is mocked, so
        # no real work happens but we can assert that tasks were mapped.
        task_executor = TaskExecutor(dry_run=True)
        mock_executor = Mock()
        mock_executor.map.return_value = [Mock()]
        task_executor.executor = mock_executor

        list(self.ingestion_job.execute(self.mock_store, task_executor))

        # Verify tasks were executed (mock executor was called)
        assert mock_executor.map.called

        # Verify no skipping events were dispatched (tasks should be created
        # and executed instead)
        skipping_event_calls = [
            call
            for call in self.mock_store.dispatch.call_args_list
            if isinstance(call[0][0], (SelectorSkipped, DatasetSkipped))
        ]
        assert len(skipping_event_calls) == 0