ingestify 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +228 -11
- ingestify/application/ingestion_engine.py +229 -7
- ingestify/application/loader.py +153 -28
- ingestify/cmdline.py +0 -48
- ingestify/domain/models/__init__.py +2 -0
- ingestify/domain/models/dataset/collection.py +0 -9
- ingestify/domain/models/dataset/dataset_repository.py +4 -0
- ingestify/domain/models/dataset/dataset_state.py +5 -0
- ingestify/domain/models/dataset/events.py +13 -0
- ingestify/domain/models/dataset/file.py +1 -1
- ingestify/domain/models/dataset/selector.py +8 -1
- ingestify/domain/models/event/event_bus.py +16 -1
- ingestify/domain/models/ingestion/ingestion_job.py +23 -4
- ingestify/domain/models/resources/dataset_resource.py +0 -1
- ingestify/infra/source/statsbomb/base.py +36 -0
- ingestify/infra/source/statsbomb/match.py +137 -0
- ingestify/infra/source/statsbomb_github.py +46 -44
- ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
- ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
- ingestify/main.py +35 -10
- ingestify/utils.py +2 -32
- ingestify-0.7.0.dist-info/METADATA +211 -0
- {ingestify-0.6.4.dist-info → ingestify-0.7.0.dist-info}/RECORD +28 -36
- ingestify/infra/source/wyscout.py +0 -175
- ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
- ingestify/static/templates/statsbomb_github/database/README.md +0 -1
- ingestify/static/templates/statsbomb_github/query.py +0 -14
- ingestify/static/templates/wyscout/.env +0 -5
- ingestify/static/templates/wyscout/.gitignore +0 -2
- ingestify/static/templates/wyscout/README.md +0 -0
- ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
- ingestify/static/templates/wyscout/database/README.md +0 -1
- ingestify/static/templates/wyscout/query.py +0 -14
- ingestify-0.6.4.dist-info/METADATA +0 -266
- /ingestify/{static/templates/statsbomb_github/README.md → infra/source/statsbomb/__init__.py} +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.7.0.dist-info}/WHEEL +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.7.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.7.0.dist-info}/top_level.txt +0 -0
ingestify/__init__.py
CHANGED

ingestify/application/dataset_store.py
CHANGED
@@ -1,13 +1,22 @@
 import gzip
-import hashlib
 import logging
-import mimetypes
 import os
 import shutil
-from
+from contextlib import contextmanager
+import threading
 from io import BytesIO
 
-from typing import
+from typing import (
+    Dict,
+    List,
+    Optional,
+    Union,
+    Callable,
+    BinaryIO,
+    Awaitable,
+    NewType,
+    Iterable,
+)
 
 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
@@ -19,7 +28,6 @@ from ingestify.domain.models import (
     Dataset,
     DatasetCollection,
     DatasetRepository,
-    DatasetResource,
     DraftFile,
     File,
     LoadedFile,
@@ -34,6 +42,67 @@ from ingestify.utils import utcnow
 
 logger = logging.getLogger(__name__)
 
+# Type definition for dataset state parameters that can be strings or DatasetState objects
+DatasetStateParam = NewType(
+    "DatasetStateParam", Union[str, Iterable[str], DatasetState, Iterable[DatasetState]]
+)
+
+
+def normalize_dataset_state(
+    dataset_state: Optional[DatasetStateParam],
+) -> Optional[List[DatasetState]]:
+    """
+    Normalize dataset_state parameter to a list of DatasetState objects.
+
+    Args:
+        dataset_state: Can be None, a string, a DatasetState enum,
+            or a list of strings or DatasetState enums
+
+    Returns:
+        None if input is None, otherwise a list of DatasetState objects
+
+    Raises:
+        ValueError: If an invalid state value is provided
+        TypeError: If dataset_state contains elements of invalid types
+        Warning: If an empty list is provided
+    """
+    if dataset_state is None:
+        return None
+
+    # Check for empty list
+    if isinstance(dataset_state, list) and len(dataset_state) == 0:
+        logger.warning(
+            "Empty list provided for dataset_state, this will not filter any states"
+        )
+        return None
+
+    normalized_states = []
+    states_to_process = (
+        [dataset_state] if not isinstance(dataset_state, list) else dataset_state
+    )
+
+    for state in states_to_process:
+        if isinstance(state, str):
+            # Handle case-insensitive string matching
+            try:
+                # Try to match the string to a DatasetState enum value
+                normalized_state = DatasetState(state.upper())
+                normalized_states.append(normalized_state)
+            except ValueError:
+                valid_states = ", ".join([s.value for s in DatasetState])
+                raise ValueError(
+                    f"Invalid dataset state: '{state}'. Valid states are: {valid_states}"
+                )
+        elif isinstance(state, DatasetState):
+            # Already a DatasetState enum, just add it
+            normalized_states.append(state)
+        else:
+            raise TypeError(
+                f"Dataset state must be a string or DatasetState enum, got {type(state).__name__}"
+            )
+
+    return normalized_states
+
 
 class DatasetStore:
     def __init__(
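The helper above accepts a single string, a `DatasetState` member, or a list mixing both, and always returns either `None` or a list of `DatasetState` values. A minimal usage sketch; the import paths match this diff, while the concrete state names ("COMPLETE", "PARTIAL") are taken from docstring examples further down and may not be the exact enum members in your installed version:

```python
from ingestify.application.dataset_store import normalize_dataset_state
from ingestify.domain.models.dataset.dataset import DatasetState

print(normalize_dataset_state(None))                     # None -> no state filtering
print(normalize_dataset_state("complete"))               # strings are matched case-insensitively
print(normalize_dataset_state(DatasetState.COMPLETE))    # enum members pass through unchanged
print(normalize_dataset_state(["COMPLETE", "PARTIAL"]))  # lists (strings and/or enum members) are allowed

try:
    normalize_dataset_state("not-a-state")
except ValueError as exc:
    print(exc)  # the error message lists the valid states
```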
@@ -48,8 +117,26 @@ class DatasetStore:
         self.bucket = bucket
         self.event_bus: Optional[EventBus] = None
 
-
-
+        # Pass current version to repository for validation/migration
+        from ingestify import __version__
+
+        self.dataset_repository.ensure_compatible_version(__version__)
+
+    @property
+    def _thread_local(self):
+        if not hasattr(self, "_thread_local_"):
+            self._thread_local_ = threading.local()
+        return self._thread_local_
+
+    def __getstate__(self):
+        """When pickling this instance, don't pass EventBus. EventBus can contain all
+        kind of dispatchers, which may, or may not can be pickled."""
+        return {
+            "dataset_repository": self.dataset_repository,
+            "storage_compression_method": self.storage_compression_method,
+            "file_repository": self.file_repository,
+            "bucket": self.bucket,
+        }
 
     def set_event_bus(self, event_bus: EventBus):
         self.event_bus = event_bus
@@ -58,6 +145,34 @@ class DatasetStore:
         if self.event_bus:
             self.event_bus.dispatch(event)
 
+    @contextmanager
+    def with_file_cache(self):
+        """Context manager to enable file caching during its scope.
+
+        Files loaded within this context will be cached and reused,
+        avoiding multiple downloads of the same file.
+
+        Example:
+            # Without caching (loads files twice)
+            analyzer1 = StatsAnalyzer(store, dataset)
+            analyzer2 = VisualizationTool(store, dataset)
+
+            # With caching (files are loaded once and shared)
+            with store.with_file_cache():
+                analyzer1 = StatsAnalyzer(store, dataset)
+                analyzer2 = VisualizationTool(store, dataset)
+        """
+        # Enable caching for this thread
+        self._thread_local.use_file_cache = True
+        self._thread_local.file_cache = {}
+
+        try:
+            yield
+        finally:
+            # Disable caching for this thread
+            self._thread_local.use_file_cache = False
+            self._thread_local.file_cache = {}
+
     def save_ingestion_job_summary(self, ingestion_job_summary):
         self.dataset_repository.save_ingestion_job_summary(ingestion_job_summary)
 
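The docstring above sketches the intent with hypothetical analyzer classes; here is the same pattern using calls that appear in this module, assuming a configured `DatasetStore` and a `Dataset` are already in hand (obtaining them is outside this diff):

```python
# Both loads resolve the same underlying files; inside the context manager the
# second call is presumably served from the per-thread cache set up above
# instead of hitting the file repository again. Outside the block the cache is dropped.
with store.with_file_cache():
    kloppy_dataset = store.load_with_kloppy(dataset)
    kloppy_dataset_again = store.load_with_kloppy(dataset)
```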
@@ -67,6 +182,9 @@ class DatasetStore:
         provider: Optional[str] = None,
         dataset_id: Optional[str] = None,
         metadata_only: Optional[bool] = False,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        dataset_state: Optional[DatasetStateParam] = None,
         **selector,
     ) -> DatasetCollection:
         if "selector" in selector:
@@ -82,6 +200,9 @@ class DatasetStore:
         # Convert all selector dicts to Selectors
         selector = [Selector(_) for _ in selector]
 
+        # Normalize dataset_state to a list of DatasetState objects
+        normalized_dataset_state = normalize_dataset_state(dataset_state)
+
         dataset_collection = self.dataset_repository.get_dataset_collection(
             bucket=self.bucket,
             dataset_type=dataset_type,
@@ -89,9 +210,89 @@ class DatasetStore:
             provider=provider,
             metadata_only=metadata_only,
             selector=selector,
+            dataset_state=normalized_dataset_state,
+            page=page,
+            page_size=page_size,
         )
         return dataset_collection
 
+    def iter_dataset_collection_batches(
+        self,
+        dataset_type: Optional[str] = None,
+        provider: Optional[str] = None,
+        dataset_id: Optional[str] = None,
+        batch_size: int = 1000,
+        yield_dataset_collection: bool = False,
+        dataset_state: Optional[DatasetStateParam] = None,
+        **selector,
+    ):
+        """
+        Iterate through all datasets matching the criteria with automatic pagination.
+
+        Examples:
+            ```
+            # Iterate through individual datasets
+            for dataset in store.iter_dataset_collection_batches(dataset_type="match", provider="statsbomb"):
+                process(dataset)
+
+            # Iterate through DatasetCollection objects (pages)
+            for collection in store.iter_dataset_collection(
+                dataset_type="match",
+                provider="statsbomb",
+                yield_dataset_collection=True
+            ):
+                process_collection(collection)
+
+            # Filter by dataset state
+            for dataset in store.iter_dataset_collection(
+                dataset_type="match",
+                dataset_state="COMPLETE"  # Can also use DatasetState.COMPLETE or ["COMPLETE", "PARTIAL"]
+            ):
+                process_completed_dataset(dataset)
+            ```
+
+        Args:
+            dataset_type: Optional dataset type filter
+            provider: Optional provider filter
+            dataset_id: Optional dataset ID filter
+            batch_size: Number of datasets to fetch per batch
+            yield_dataset_collection: If True, yields entire DatasetCollection objects
+                instead of individual Dataset objects
+            dataset_state: Optional filter for dataset state. Can be a string, DatasetState enum,
+                or a list of strings or DatasetState enums
+            **selector: Additional selector criteria
+
+        Yields:
+            If yield_dataset_collection is False (default): Dataset objects one by one
+            If yield_dataset_collection is True: DatasetCollection objects (pages)
+        """
+        page = 1
+        while True:
+            collection = self.get_dataset_collection(
+                dataset_type=dataset_type,
+                provider=provider,
+                dataset_id=dataset_id,
+                page=page,
+                page_size=batch_size,
+                dataset_state=dataset_state,
+                **selector,
+            )
+
+            if not collection or len(collection) == 0:
+                break
+
+            if yield_dataset_collection:
+                yield collection
+            else:
+                for dataset in collection:
+                    yield dataset
+
+            # If we got fewer results than page_size, we've reached the end
+            if len(collection) < batch_size:
+                break
+
+            page += 1
+
     #
     # def destroy_dataset(self, dataset_id: str):
     #     dataset = self.dataset_repository.
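The batching loop above keeps requesting pages from `get_dataset_collection` until a page comes back shorter than `batch_size`. A usage sketch, again assuming a configured `store`; the selector key `competition_id` is only an example borrowed from the docstrings elsewhere in this diff:

```python
# Stream matching datasets in pages of 500 without materialising everything at once.
for dataset in store.iter_dataset_collection_batches(
    dataset_type="match",
    provider="statsbomb",
    batch_size=500,
    dataset_state="COMPLETE",
    competition_id=43,  # extra keyword arguments become selector criteria
):
    print(dataset.dataset_id, dataset.state)

# Or receive whole pages as DatasetCollection objects instead of single datasets.
for page_of_datasets in store.iter_dataset_collection_batches(
    dataset_type="match",
    yield_dataset_collection=True,
):
    print(f"got a page with {len(page_of_datasets)} datasets")
```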
@@ -311,10 +512,21 @@ class DatasetStore:
                     self.file_repository.load_content(storage_path=file_.storage_path)
                 )
 
-
-
-
-
+            def make_loaded_file():
+                return LoadedFile(
+                    stream_=get_stream if lazy else get_stream(file),
+                    **file.model_dump(),
+                )
+
+            # Using getattr with a default value of False - simple one-liner
+            if getattr(self._thread_local, "use_file_cache", False):
+                key = (dataset.dataset_id, current_revision.revision_id, file.file_id)
+                if key not in self._thread_local.file_cache:
+                    self._thread_local.file_cache[key] = make_loaded_file()
+                loaded_file = self._thread_local.file_cache[key]
+            else:
+                loaded_file = make_loaded_file()
+
             files[file.file_id] = loaded_file
         return FileCollection(files, auto_rewind=auto_rewind)
 
@@ -324,9 +536,14 @@ class DatasetStore:
             from kloppy import statsbomb
 
             try:
+                three_sixty_data = None
+                if tmp_file := files.get_file("360-frames"):
+                    three_sixty_data = tmp_file.stream
+
                 return statsbomb.load(
                     event_data=(files.get_file("events")).stream,
                     lineup_data=(files.get_file("lineups")).stream,
+                    three_sixty_data=three_sixty_data,
                     **kwargs,
                 )
             except Exception as e:
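With the hunk above, `load_with_kloppy` now forwards a `360-frames` file (when the dataset revision contains one) to kloppy's `statsbomb.load` as `three_sixty_data`; datasets without that file keep working because `None` is passed instead. A sketch of the effect, assuming a StatsBomb match dataset; whether individual events expose a `freeze_frame` attribute depends on the installed kloppy version:

```python
kloppy_dataset = store.load_with_kloppy(dataset, coordinates="statsbomb")
for event in kloppy_dataset.events[:5]:
    # freeze_frame is only populated when 360 frames were available and supported
    print(event.event_type, getattr(event, "freeze_frame", None))
```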
ingestify/application/ingestion_engine.py
CHANGED

@@ -1,15 +1,31 @@
 import itertools
 import logging
-
-
+import threading
+from queue import Queue
+from typing import Optional, List, Union, Any, Iterator, TypedDict
 
 from .loader import Loader
 from .dataset_store import DatasetStore
 from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+from ingestify.domain.models import Dataset
+from ..domain.models.dataset.events import (
+    DatasetSkipped,
+    RevisionAdded,
+    SelectorSkipped,
+)
+from ..domain.models.event import DomainEvent
 
 logger = logging.getLogger(__name__)
 
 
+class AutoIngestConfig(TypedDict, total=False):
+    """Configuration options for auto-ingestion feature."""
+
+    enabled: bool
+    streaming: bool
+    use_open_data: bool
+
+
 class IngestionEngine:
     def __init__(self, store: DatasetStore):
 
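`AutoIngestConfig` is declared with `total=False`, so callers can supply any subset of its keys. A small sketch of building one for the engine calls later in this diff:

```python
from ingestify.application.ingestion_engine import AutoIngestConfig

auto_ingest: AutoIngestConfig = {
    "enabled": True,
    "streaming": True,
    # "use_open_data" may be omitted entirely because the TypedDict is total=False
}
```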
@@ -26,8 +42,73 @@ class IngestionEngine:
         dry_run: bool = False,
         provider: Optional[str] = None,
         source: Optional[str] = None,
-
-
+        dataset_type: Optional[str] = None,
+        auto_ingest_config: Optional[AutoIngestConfig] = None,
+        async_yield_events: bool = False,
+        **selector_filters,
+    ) -> Optional[Iterator[DomainEvent]]:
+        """
+        Execute data ingestion from configured sources.
+
+        Args:
+            dry_run: If True, perform validation but skip actual data ingestion
+            provider: Filter ingestion to specific data provider
+            source: Filter ingestion to specific source name
+            dataset_type: Filter ingestion to specific dataset type (e.g., 'match', 'lineups')
+            auto_ingest_config: Configuration for auto-discovery of ingestion plans
+            async_yield_events: If True, run ingestion in background and yield domain events
+            **selector_filters: Additional selector criteria (e.g., competition_id=43)
+
+        Returns:
+            Iterator of DomainEvent objects if async_yield_events=True, None otherwise
+
+        Examples:
+            # Standard synchronous ingestion
+            engine.load(dataset_type="match", competition_id=43)
+
+            # Real-time event streaming during ingestion
+            for event in engine.load(async_yield_events=True):
+                if isinstance(event, DatasetSkipped):
+                    print(f"Dataset ready: {event.dataset.name}")
+        """
+
+        def do_load():
+            self.loader.collect_and_run(
+                dry_run=dry_run,
+                provider=provider,
+                source=source,
+                dataset_type=dataset_type,
+                auto_ingest_config=auto_ingest_config or {},
+                **selector_filters,
+            )
+
+        if async_yield_events:
+            queue = Queue()
+
+            def load_in_background():
+                unregister = self.store.event_bus.register_queue(queue)
+                do_load()
+                unregister()
+
+                # Done.
+                queue.put(None)
+
+            thread = threading.Thread(target=load_in_background)
+            thread.start()
+
+            def _iter():
+                # Required to prevent this function to be a generator. Otherwise,
+                # do_load won't be called when async_yield_events=False
+                while True:
+                    event = queue.get()
+                    if event is None:
+                        break
+
+                    yield event
+
+            return _iter()
+        else:
+            do_load()
 
     def list_datasets(self, as_count: bool = False):
         """Consider moving this to DataStore"""
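When `async_yield_events=True`, the loader runs on a background thread, a `Queue` registered on the event bus collects domain events, and iteration stops once the `None` sentinel arrives. A consumption sketch, assuming an `IngestionEngine` built from an ingestify configuration (not part of this diff):

```python
from ingestify.domain.models.dataset.events import DatasetSkipped, RevisionAdded

for event in engine.load(
    provider="statsbomb",
    dataset_type="match",
    async_yield_events=True,
):
    if isinstance(event, RevisionAdded):
        print(f"new revision stored for {event.dataset.dataset_id}")
    elif isinstance(event, DatasetSkipped):
        print(f"already up to date: {event.dataset.dataset_id}")
```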
@@ -52,16 +133,157 @@ class IngestionEngine:
             print(f" {dataset_type}:")
             for dataset in datasets_per_type:
                 print(
-                    f" {dataset.identifier}
+                    f" {dataset.identifier}\t{dataset.dataset_id}\t{dataset.name} / {dataset.state}"
                 )
                 # print(dataset.dataset_id)
 
     def destroy_dataset(
         self, dataset_id: Optional[str] = None, **selector
     ) -> List[str]:
-
+        dataset_collection = self.store.iter_dataset_collection_batches(
+            dataset_id=dataset_id,
+            **selector,
+        )
         dataset_ids = []
-        for dataset in
+        for dataset in dataset_collection:
             self.store.destroy_dataset(dataset)
             dataset_ids.append(dataset.dataset_id)
         return dataset_ids
+
+    def iter_datasets(
+        self,
+        auto_ingest: Union[bool, AutoIngestConfig] = False,
+        dataset_type: Optional[str] = None,
+        provider: Optional[str] = None,
+        dataset_id: Optional[str] = None,
+        batch_size: int = 1000,
+        yield_dataset_collection: bool = False,
+        dataset_state: Optional[Any] = None,
+        **selector_filters,
+    ) -> Iterator[Dataset]:
+        """
+        Iterate over datasets with optional auto-ingestion.
+
+        This method combines dataset discovery, ingestion, and retrieval in a single
+        streaming interface. When auto_ingest=True, it will first run the ingestion
+        pipeline to discover and ingest matching datasets before yielding results.
+
+        Examples:
+            # Basic iteration over existing datasets
+            for dataset in engine.iter_datasets(provider="statsbomb"):
+                process(dataset)
+
+            # Auto-ingest new data matching criteria before iteration
+            for dataset in engine.iter_datasets(
+                auto_ingest=True,
+                provider="statsbomb",
+                competition_id=11
+            ):
+                process(dataset)  # Includes newly ingested datasets
+
+            # Real-time streaming with open data auto-discovery
+            for dataset in engine.iter_datasets(
+                auto_ingest={
+                    "streaming": True,
+                    "use_open_data": True
+                },
+                dataset_type="match"
+            ):
+                process(dataset)  # Yields datasets as they become available
+
+        Args:
+            auto_ingest: Enable auto-ingestion before yielding datasets.
+                Can be True/False or AutoIngestConfig dict with options:
+                - enabled: bool (default: True)
+                - streaming: bool (default: False) - enables real-time event streaming
+                - use_open_data: bool (default: False) - auto-discover open data sources
+            dataset_type: Filter by dataset type (e.g., "match", "competition")
+            provider: Filter by data provider (e.g., "statsbomb", "wyscout")
+            dataset_id: Filter by specific dataset ID
+            batch_size: Number of datasets to fetch per batch for pagination
+            yield_dataset_collection: If True, yield DatasetCollection objects instead of individual datasets
+            dataset_state: Filter by dataset state (e.g., "COMPLETE", "PARTIAL")
+            **selector_filters: Additional selector criteria (competition_id, season_id, match_id, etc.)
+
+        Yields:
+            Dataset objects matching the specified criteria. If auto_ingest=True,
+            includes both existing datasets and newly ingested ones.
+
+        Note:
+            Auto-ingestion will only discover datasets that match configured
+            IngestionPlans. Requests outside the scope of existing plans will
+            not trigger ingestion.
+        """
+        # Parse auto_ingest config
+        if isinstance(auto_ingest, dict):
+            auto_ingest_enabled = auto_ingest.get("enabled", True)
+            auto_ingest_config = auto_ingest
+        else:
+            auto_ingest_enabled = bool(auto_ingest)
+            auto_ingest_config = {}
+
+        # Run auto-ingestion if enabled
+        if auto_ingest_enabled:
+            if auto_ingest_config.get("streaming", False):
+                if yield_dataset_collection:
+                    raise ValueError(
+                        "Cannot yield_dataset_collection when "
+                        "auto_ingest_enabled. In case of streaming mode"
+                    )
+
+                # Start background loading immediately - don't return a generator
+                event_iter = self.load(
+                    provider=provider,
+                    dataset_type=dataset_type,
+                    auto_ingest_config=auto_ingest_config,
+                    async_yield_events=True,
+                    **selector_filters,
+                )
+
+                for event in event_iter:
+                    if isinstance(event, DatasetSkipped):
+                        yield event.dataset
+                    elif isinstance(event, RevisionAdded):
+                        yield event.dataset
+                    elif isinstance(event, SelectorSkipped):
+                        yield from self.store.iter_dataset_collection_batches(
+                            dataset_type=dataset_type,
+                            provider=provider,
+                            batch_size=batch_size,
+                            # We can't yield Dataset (from DatasetSkipped and RevisionAdded) and
+                            # DatasetCollection in the same run
+                            yield_dataset_collection=False,
+                            dataset_state=dataset_state,
+                            **event.selector.filtered_attributes,
+                        )
+                return
+            else:
+                self.load(
+                    provider=provider,
+                    dataset_type=dataset_type,
+                    auto_ingest_config=auto_ingest_config,
+                    **selector_filters,
+                )
+
+        yield from self.store.iter_dataset_collection_batches(
+            dataset_type=dataset_type,
+            provider=provider,
+            dataset_id=dataset_id,
+            batch_size=batch_size,
+            yield_dataset_collection=yield_dataset_collection,
+            dataset_state=dataset_state,
+            **selector_filters,
+        )
+
+    def load_dataset_with_kloppy(self, dataset: Dataset, **kwargs):
+        """
+        Load a dataset using kloppy.
+
+        Args:
+            dataset: The dataset to load
+            **kwargs: Additional arguments passed to kloppy's load function
+
+        Returns:
+            Kloppy dataset object
+        """
+        return self.store.load_with_kloppy(dataset, **kwargs)