ingestify 0.6.4__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. ingestify/__init__.py +2 -1
  2. ingestify/application/dataset_store.py +228 -11
  3. ingestify/application/ingestion_engine.py +232 -7
  4. ingestify/application/loader.py +163 -28
  5. ingestify/cmdline.py +0 -48
  6. ingestify/domain/models/__init__.py +2 -0
  7. ingestify/domain/models/dataset/collection.py +0 -9
  8. ingestify/domain/models/dataset/dataset_repository.py +4 -0
  9. ingestify/domain/models/dataset/dataset_state.py +5 -0
  10. ingestify/domain/models/dataset/events.py +13 -0
  11. ingestify/domain/models/dataset/file.py +7 -1
  12. ingestify/domain/models/dataset/selector.py +8 -1
  13. ingestify/domain/models/event/event_bus.py +16 -1
  14. ingestify/domain/models/ingestion/ingestion_job.py +23 -4
  15. ingestify/domain/models/resources/dataset_resource.py +0 -1
  16. ingestify/infra/source/statsbomb/base.py +36 -0
  17. ingestify/infra/source/statsbomb/match.py +137 -0
  18. ingestify/infra/source/statsbomb_github.py +46 -44
  19. ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
  20. ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
  21. ingestify/main.py +190 -10
  22. ingestify/utils.py +2 -32
  23. ingestify-0.8.0.dist-info/METADATA +257 -0
  24. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/RECORD +28 -36
  25. ingestify/infra/source/wyscout.py +0 -175
  26. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
  27. ingestify/static/templates/statsbomb_github/database/README.md +0 -1
  28. ingestify/static/templates/statsbomb_github/query.py +0 -14
  29. ingestify/static/templates/wyscout/.env +0 -5
  30. ingestify/static/templates/wyscout/.gitignore +0 -2
  31. ingestify/static/templates/wyscout/README.md +0 -0
  32. ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
  33. ingestify/static/templates/wyscout/database/README.md +0 -1
  34. ingestify/static/templates/wyscout/query.py +0 -14
  35. ingestify-0.6.4.dist-info/METADATA +0 -266
  36. /ingestify/{static/templates/statsbomb_github/README.md → infra/source/statsbomb/__init__.py} +0 -0
  37. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/WHEEL +0 -0
  38. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/entry_points.txt +0 -0
  39. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/top_level.txt +0 -0
ingestify/__init__.py CHANGED
@@ -7,5 +7,6 @@ except NameError:
 if not __INGESTIFY_SETUP__:
     from .infra import retrieve_http
     from .source_base import Source, DatasetResource
+    from .main import debug_source
 
-__version__ = "0.6.4"
+__version__ = "0.8.0"
ingestify/application/dataset_store.py CHANGED
@@ -1,13 +1,22 @@
 import gzip
-import hashlib
 import logging
-import mimetypes
 import os
 import shutil
-from dataclasses import asdict
+from contextlib import contextmanager
+import threading
 from io import BytesIO
 
-from typing import Dict, List, Optional, Union, Callable, BinaryIO, Awaitable
+from typing import (
+    Dict,
+    List,
+    Optional,
+    Union,
+    Callable,
+    BinaryIO,
+    Awaitable,
+    NewType,
+    Iterable,
+)
 
 from ingestify.domain.models.dataset.dataset import DatasetState
 from ingestify.domain.models.dataset.events import RevisionAdded, MetadataUpdated
@@ -19,7 +28,6 @@ from ingestify.domain.models import (
     Dataset,
     DatasetCollection,
     DatasetRepository,
-    DatasetResource,
     DraftFile,
     File,
     LoadedFile,
@@ -34,6 +42,67 @@ from ingestify.utils import utcnow
 
 logger = logging.getLogger(__name__)
 
+# Type definition for dataset state parameters that can be strings or DatasetState objects
+DatasetStateParam = NewType(
+    "DatasetStateParam", Union[str, Iterable[str], DatasetState, Iterable[DatasetState]]
+)
+
+
+def normalize_dataset_state(
+    dataset_state: Optional[DatasetStateParam],
+) -> Optional[List[DatasetState]]:
+    """
+    Normalize dataset_state parameter to a list of DatasetState objects.
+
+    Args:
+        dataset_state: Can be None, a string, a DatasetState enum,
+            or a list of strings or DatasetState enums
+
+    Returns:
+        None if input is None, otherwise a list of DatasetState objects
+
+    Raises:
+        ValueError: If an invalid state value is provided
+        TypeError: If dataset_state contains elements of invalid types
+        Warning: If an empty list is provided
+    """
+    if dataset_state is None:
+        return None
+
+    # Check for empty list
+    if isinstance(dataset_state, list) and len(dataset_state) == 0:
+        logger.warning(
+            "Empty list provided for dataset_state, this will not filter any states"
+        )
+        return None
+
+    normalized_states = []
+    states_to_process = (
+        [dataset_state] if not isinstance(dataset_state, list) else dataset_state
+    )
+
+    for state in states_to_process:
+        if isinstance(state, str):
+            # Handle case-insensitive string matching
+            try:
+                # Try to match the string to a DatasetState enum value
+                normalized_state = DatasetState(state.upper())
+                normalized_states.append(normalized_state)
+            except ValueError:
+                valid_states = ", ".join([s.value for s in DatasetState])
+                raise ValueError(
+                    f"Invalid dataset state: '{state}'. Valid states are: {valid_states}"
+                )
+        elif isinstance(state, DatasetState):
+            # Already a DatasetState enum, just add it
+            normalized_states.append(state)
+        else:
+            raise TypeError(
+                f"Dataset state must be a string or DatasetState enum, got {type(state).__name__}"
+            )
+
+    return normalized_states
+
 
 class DatasetStore:
     def __init__(
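For orientation, a minimal sketch of how the new normalizer behaves for the accepted input shapes. It assumes the DatasetState members are named after their upper-case string values (e.g. DatasetState.COMPLETE with value "COMPLETE", as the docstrings elsewhere in this diff suggest):

```python
from ingestify.application.dataset_store import normalize_dataset_state
from ingestify.domain.models.dataset.dataset import DatasetState

# None means "don't filter on state"; an empty list logs a warning and also returns None.
assert normalize_dataset_state(None) is None

# Strings are upper-cased before being matched against the enum values.
assert normalize_dataset_state("complete") == [DatasetState.COMPLETE]

# Enum members and mixed lists come back as a flat list of DatasetState members.
assert normalize_dataset_state([DatasetState.COMPLETE, "partial"]) == [
    DatasetState.COMPLETE,
    DatasetState.PARTIAL,
]
```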
@@ -48,8 +117,26 @@ class DatasetStore:
         self.bucket = bucket
         self.event_bus: Optional[EventBus] = None
 
-    # def __getstate__(self):
-    #     return {"file_repository": self.file_repository, "bucket": self.bucket}
+        # Pass current version to repository for validation/migration
+        from ingestify import __version__
+
+        self.dataset_repository.ensure_compatible_version(__version__)
+
+    @property
+    def _thread_local(self):
+        if not hasattr(self, "_thread_local_"):
+            self._thread_local_ = threading.local()
+        return self._thread_local_
+
+    def __getstate__(self):
+        """When pickling this instance, don't pass EventBus. EventBus can contain all
+        kind of dispatchers, which may, or may not can be pickled."""
+        return {
+            "dataset_repository": self.dataset_repository,
+            "storage_compression_method": self.storage_compression_method,
+            "file_repository": self.file_repository,
+            "bucket": self.bucket,
+        }
 
     def set_event_bus(self, event_bus: EventBus):
         self.event_bus = event_bus
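Because `__getstate__` now excludes the event bus, a `DatasetStore` can be pickled (for example when handed to worker processes) and the bus re-attached afterwards. A minimal sketch, assuming the configured repositories are themselves picklable; `store` and `event_bus` stand in for objects built elsewhere:

```python
import pickle

payload = pickle.dumps(store)        # event_bus is deliberately left out of the state
store_copy = pickle.loads(payload)

store_copy.set_event_bus(event_bus)  # re-attach a bus before the copy dispatches events
```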
@@ -58,6 +145,34 @@ class DatasetStore:
         if self.event_bus:
             self.event_bus.dispatch(event)
 
+    @contextmanager
+    def with_file_cache(self):
+        """Context manager to enable file caching during its scope.
+
+        Files loaded within this context will be cached and reused,
+        avoiding multiple downloads of the same file.
+
+        Example:
+            # Without caching (loads files twice)
+            analyzer1 = StatsAnalyzer(store, dataset)
+            analyzer2 = VisualizationTool(store, dataset)
+
+            # With caching (files are loaded once and shared)
+            with store.with_file_cache():
+                analyzer1 = StatsAnalyzer(store, dataset)
+                analyzer2 = VisualizationTool(store, dataset)
+        """
+        # Enable caching for this thread
+        self._thread_local.use_file_cache = True
+        self._thread_local.file_cache = {}
+
+        try:
+            yield
+        finally:
+            # Disable caching for this thread
+            self._thread_local.use_file_cache = False
+            self._thread_local.file_cache = {}
+
     def save_ingestion_job_summary(self, ingestion_job_summary):
         self.dataset_repository.save_ingestion_job_summary(ingestion_job_summary)
 
@@ -67,6 +182,9 @@
         provider: Optional[str] = None,
         dataset_id: Optional[str] = None,
         metadata_only: Optional[bool] = False,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        dataset_state: Optional[DatasetStateParam] = None,
         **selector,
     ) -> DatasetCollection:
         if "selector" in selector:
@@ -82,6 +200,9 @@
         # Convert all selector dicts to Selectors
         selector = [Selector(_) for _ in selector]
 
+        # Normalize dataset_state to a list of DatasetState objects
+        normalized_dataset_state = normalize_dataset_state(dataset_state)
+
         dataset_collection = self.dataset_repository.get_dataset_collection(
             bucket=self.bucket,
             dataset_type=dataset_type,
@@ -89,9 +210,89 @@
             provider=provider,
             metadata_only=metadata_only,
             selector=selector,
+            dataset_state=normalized_dataset_state,
+            page=page,
+            page_size=page_size,
         )
         return dataset_collection
 
+    def iter_dataset_collection_batches(
+        self,
+        dataset_type: Optional[str] = None,
+        provider: Optional[str] = None,
+        dataset_id: Optional[str] = None,
+        batch_size: int = 1000,
+        yield_dataset_collection: bool = False,
+        dataset_state: Optional[DatasetStateParam] = None,
+        **selector,
+    ):
+        """
+        Iterate through all datasets matching the criteria with automatic pagination.
+
+        Examples:
+            ```
+            # Iterate through individual datasets
+            for dataset in store.iter_dataset_collection_batches(dataset_type="match", provider="statsbomb"):
+                process(dataset)
+
+            # Iterate through DatasetCollection objects (pages)
+            for collection in store.iter_dataset_collection(
+                dataset_type="match",
+                provider="statsbomb",
+                yield_dataset_collection=True
+            ):
+                process_collection(collection)
+
+            # Filter by dataset state
+            for dataset in store.iter_dataset_collection(
+                dataset_type="match",
+                dataset_state="COMPLETE"  # Can also use DatasetState.COMPLETE or ["COMPLETE", "PARTIAL"]
+            ):
+                process_completed_dataset(dataset)
+            ```
+
+        Args:
+            dataset_type: Optional dataset type filter
+            provider: Optional provider filter
+            dataset_id: Optional dataset ID filter
+            batch_size: Number of datasets to fetch per batch
+            yield_dataset_collection: If True, yields entire DatasetCollection objects
+                instead of individual Dataset objects
+            dataset_state: Optional filter for dataset state. Can be a string, DatasetState enum,
+                or a list of strings or DatasetState enums
+            **selector: Additional selector criteria
+
+        Yields:
+            If yield_dataset_collection is False (default): Dataset objects one by one
+            If yield_dataset_collection is True: DatasetCollection objects (pages)
+        """
+        page = 1
+        while True:
+            collection = self.get_dataset_collection(
+                dataset_type=dataset_type,
+                provider=provider,
+                dataset_id=dataset_id,
+                page=page,
+                page_size=batch_size,
+                dataset_state=dataset_state,
+                **selector,
+            )
+
+            if not collection or len(collection) == 0:
+                break
+
+            if yield_dataset_collection:
+                yield collection
+            else:
+                for dataset in collection:
+                    yield dataset
+
+            # If we got fewer results than page_size, we've reached the end
+            if len(collection) < batch_size:
+                break
+
+            page += 1
+
     #
     # def destroy_dataset(self, dataset_id: str):
     #     dataset = self.dataset_repository.
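A hedged usage sketch of the new batched iteration, using only names that appear in this diff; `store`, `process`, and the filter values are illustrative placeholders:

```python
# Stream individual datasets; page/page_size pagination is handled internally.
for dataset in store.iter_dataset_collection_batches(
    dataset_type="match",
    provider="statsbomb",
    dataset_state="COMPLETE",  # a string, a DatasetState member, or a list of either
    batch_size=500,
):
    process(dataset)

# Or consume whole pages as DatasetCollection objects.
for page in store.iter_dataset_collection_batches(
    dataset_type="match",
    yield_dataset_collection=True,
):
    print(f"Fetched a page of {len(page)} datasets")
```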
@@ -311,10 +512,21 @@
                 self.file_repository.load_content(storage_path=file_.storage_path)
             )
 
-            loaded_file = LoadedFile(
-                stream_=get_stream if lazy else get_stream(file),
-                **file.model_dump(),
-            )
+            def make_loaded_file():
+                return LoadedFile(
+                    stream_=get_stream if lazy else get_stream(file),
+                    **file.model_dump(),
+                )
+
+            # Using getattr with a default value of False - simple one-liner
+            if getattr(self._thread_local, "use_file_cache", False):
+                key = (dataset.dataset_id, current_revision.revision_id, file.file_id)
+                if key not in self._thread_local.file_cache:
+                    self._thread_local.file_cache[key] = make_loaded_file()
+                loaded_file = self._thread_local.file_cache[key]
+            else:
+                loaded_file = make_loaded_file()
+
             files[file.file_id] = loaded_file
         return FileCollection(files, auto_rewind=auto_rewind)
 
@@ -324,9 +536,14 @@
         from kloppy import statsbomb
 
         try:
+            three_sixty_data = None
+            if tmp_file := files.get_file("360-frames"):
+                three_sixty_data = tmp_file.stream
+
             return statsbomb.load(
                 event_data=(files.get_file("events")).stream,
                 lineup_data=(files.get_file("lineups")).stream,
+                three_sixty_data=three_sixty_data,
                 **kwargs,
             )
         except Exception as e:
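Combined with the `with_file_cache()` context manager added earlier, repeated loads against the same dataset revision should be able to reuse already-downloaded files, provided both loads go through the store's file-loading path shown above. A minimal sketch, assuming a StatsBomb `dataset` obtained from the store; `load_with_kloppy` is the store method whose body is patched in this hunk:

```python
with store.with_file_cache():
    # Both calls read the same underlying files; the second one should hit the
    # per-thread cache instead of downloading them again.
    kloppy_dataset = store.load_with_kloppy(dataset)
    kloppy_dataset_again = store.load_with_kloppy(dataset)
```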
ingestify/application/ingestion_engine.py CHANGED
@@ -1,15 +1,31 @@
 import itertools
 import logging
-from typing import Optional, List
-
+import threading
+from queue import Queue
+from typing import Optional, List, Union, Any, Iterator, TypedDict
 
 from .loader import Loader
 from .dataset_store import DatasetStore
 from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
+from ingestify.domain.models import Dataset
+from ..domain.models.dataset.events import (
+    DatasetSkipped,
+    RevisionAdded,
+    SelectorSkipped,
+)
+from ..domain.models.event import DomainEvent
 
 logger = logging.getLogger(__name__)
 
 
+class AutoIngestConfig(TypedDict, total=False):
+    """Configuration options for auto-ingestion feature."""
+
+    enabled: bool
+    streaming: bool
+    use_open_data: bool
+
+
 class IngestionEngine:
     def __init__(self, store: DatasetStore):
 
@@ -26,8 +42,76 @@
         dry_run: bool = False,
         provider: Optional[str] = None,
         source: Optional[str] = None,
-    ):
-        self.loader.collect_and_run(dry_run=dry_run, provider=provider, source=source)
+        dataset_type: Optional[str] = None,
+        auto_ingest_config: Optional[AutoIngestConfig] = None,
+        async_yield_events: bool = False,
+        **selector_filters,
+    ) -> Optional[Iterator[DomainEvent]]:
+        """
+        Execute data ingestion from configured sources.
+
+        Args:
+            dry_run: If True, perform validation but skip actual data ingestion
+            provider: Filter ingestion to specific data provider
+            source: Filter ingestion to specific source name
+            dataset_type: Filter ingestion to specific dataset type (e.g., 'match', 'lineups')
+            auto_ingest_config: Configuration for auto-discovery of ingestion plans
+            async_yield_events: If True, run ingestion in background and yield domain events
+            **selector_filters: Additional selector criteria (e.g., competition_id=43)
+
+        Returns:
+            Iterator of DomainEvent objects if async_yield_events=True, None otherwise
+
+        Examples:
+            # Standard synchronous ingestion
+            engine.load(dataset_type="match", competition_id=43)
+
+            # Real-time event streaming during ingestion
+            for event in engine.load(async_yield_events=True):
+                if isinstance(event, DatasetSkipped):
+                    print(f"Dataset ready: {event.dataset.name}")
+        """
+
+        def do_load():
+            self.loader.collect_and_run(
+                dry_run=dry_run,
+                provider=provider,
+                source=source,
+                dataset_type=dataset_type,
+                auto_ingest_config=auto_ingest_config or {},
+                **selector_filters,
+            )
+
+        if async_yield_events:
+            queue = Queue()
+
+            def load_in_background():
+                unregister = self.store.event_bus.register_queue(queue)
+                do_load()
+                unregister()
+
+                # Done.
+                queue.put(None)
+
+            thread = threading.Thread(target=load_in_background)
+            thread.start()
+
+            def _iter():
+                # Required to prevent this function to be a generator. Otherwise,
+                # do_load won't be called when async_yield_events=False
+                while True:
+                    event = queue.get()
+                    if event is None:
+                        break
+
+                    yield event
+
+            return _iter()
+        else:
+            do_load()
+
+    # Alias for load() - more intuitive name for running ingestion
+    run = load
 
     def list_datasets(self, as_count: bool = False):
         """Consider moving this to DataStore"""
@@ -52,16 +136,157 @@
             print(f" {dataset_type}:")
             for dataset in datasets_per_type:
                 print(
-                    f" {dataset.identifier}: {dataset.name} / {dataset.state} {dataset.dataset_id}"
+                    f" {dataset.identifier}\t{dataset.dataset_id}\t{dataset.name} / {dataset.state}"
                 )
                 # print(dataset.dataset_id)
 
     def destroy_dataset(
         self, dataset_id: Optional[str] = None, **selector
     ) -> List[str]:
-        datasets = self.store.get_dataset_collection(dataset_id=dataset_id, **selector)
+        dataset_collection = self.store.iter_dataset_collection_batches(
+            dataset_id=dataset_id,
+            **selector,
+        )
         dataset_ids = []
-        for dataset in datasets:
+        for dataset in dataset_collection:
             self.store.destroy_dataset(dataset)
             dataset_ids.append(dataset.dataset_id)
         return dataset_ids
+
+    def iter_datasets(
+        self,
+        auto_ingest: Union[bool, AutoIngestConfig] = False,
+        dataset_type: Optional[str] = None,
+        provider: Optional[str] = None,
+        dataset_id: Optional[str] = None,
+        batch_size: int = 1000,
+        yield_dataset_collection: bool = False,
+        dataset_state: Optional[Any] = None,
+        **selector_filters,
+    ) -> Iterator[Dataset]:
+        """
+        Iterate over datasets with optional auto-ingestion.
+
+        This method combines dataset discovery, ingestion, and retrieval in a single
+        streaming interface. When auto_ingest=True, it will first run the ingestion
+        pipeline to discover and ingest matching datasets before yielding results.
+
+        Examples:
+            # Basic iteration over existing datasets
+            for dataset in engine.iter_datasets(provider="statsbomb"):
+                process(dataset)
+
+            # Auto-ingest new data matching criteria before iteration
+            for dataset in engine.iter_datasets(
+                auto_ingest=True,
+                provider="statsbomb",
+                competition_id=11
+            ):
+                process(dataset)  # Includes newly ingested datasets
+
+            # Real-time streaming with open data auto-discovery
+            for dataset in engine.iter_datasets(
+                auto_ingest={
+                    "streaming": True,
+                    "use_open_data": True
+                },
+                dataset_type="match"
+            ):
+                process(dataset)  # Yields datasets as they become available
+
+        Args:
+            auto_ingest: Enable auto-ingestion before yielding datasets.
+                Can be True/False or AutoIngestConfig dict with options:
+                - enabled: bool (default: True)
+                - streaming: bool (default: False) - enables real-time event streaming
+                - use_open_data: bool (default: False) - auto-discover open data sources
+            dataset_type: Filter by dataset type (e.g., "match", "competition")
+            provider: Filter by data provider (e.g., "statsbomb", "wyscout")
+            dataset_id: Filter by specific dataset ID
+            batch_size: Number of datasets to fetch per batch for pagination
+            yield_dataset_collection: If True, yield DatasetCollection objects instead of individual datasets
+            dataset_state: Filter by dataset state (e.g., "COMPLETE", "PARTIAL")
+            **selector_filters: Additional selector criteria (competition_id, season_id, match_id, etc.)
+
+        Yields:
+            Dataset objects matching the specified criteria. If auto_ingest=True,
+            includes both existing datasets and newly ingested ones.
+
+        Note:
+            Auto-ingestion will only discover datasets that match configured
+            IngestionPlans. Requests outside the scope of existing plans will
+            not trigger ingestion.
+        """
+        # Parse auto_ingest config
+        if isinstance(auto_ingest, dict):
+            auto_ingest_enabled = auto_ingest.get("enabled", True)
+            auto_ingest_config = auto_ingest
+        else:
+            auto_ingest_enabled = bool(auto_ingest)
+            auto_ingest_config = {}
+
+        # Run auto-ingestion if enabled
+        if auto_ingest_enabled:
+            if auto_ingest_config.get("streaming", False):
+                if yield_dataset_collection:
+                    raise ValueError(
+                        "Cannot yield_dataset_collection when "
+                        "auto_ingest_enabled. In case of streaming mode"
+                    )
+
+                # Start background loading immediately - don't return a generator
+                event_iter = self.load(
+                    provider=provider,
+                    dataset_type=dataset_type,
+                    auto_ingest_config=auto_ingest_config,
+                    async_yield_events=True,
+                    **selector_filters,
+                )
+
+                for event in event_iter:
+                    if isinstance(event, DatasetSkipped):
+                        yield event.dataset
+                    elif isinstance(event, RevisionAdded):
+                        yield event.dataset
+                    elif isinstance(event, SelectorSkipped):
+                        yield from self.store.iter_dataset_collection_batches(
+                            dataset_type=dataset_type,
+                            provider=provider,
+                            batch_size=batch_size,
+                            # We can't yield Dataset (from DatasetSkipped and RevisionAdded) and
+                            # DatasetCollection in the same run
+                            yield_dataset_collection=False,
+                            dataset_state=dataset_state,
+                            **event.selector.filtered_attributes,
+                        )
+                return
+            else:
+                self.load(
+                    provider=provider,
+                    dataset_type=dataset_type,
+                    auto_ingest_config=auto_ingest_config,
+                    **selector_filters,
+                )
+
+        yield from self.store.iter_dataset_collection_batches(
+            dataset_type=dataset_type,
+            provider=provider,
+            dataset_id=dataset_id,
+            batch_size=batch_size,
+            yield_dataset_collection=yield_dataset_collection,
+            dataset_state=dataset_state,
+            **selector_filters,
+        )
+
+    def load_dataset_with_kloppy(self, dataset: Dataset, **kwargs):
+        """
+        Load a dataset using kloppy.
+
+        Args:
+            dataset: The dataset to load
+            **kwargs: Additional arguments passed to kloppy's load function
+
+        Returns:
+            Kloppy dataset object
+        """
+        return self.store.load_with_kloppy(dataset, **kwargs)
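To round off the engine-level additions, a hedged end-to-end sketch; engine construction from a config is omitted, and the selector and state values mirror the docstring examples above:

```python
# Ingest anything new that matches the selector, then iterate what is stored.
for dataset in engine.iter_datasets(
    auto_ingest=True,
    provider="statsbomb",
    dataset_type="match",
    competition_id=11,
    dataset_state="COMPLETE",
):
    kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
    # ... work with the kloppy dataset ...
```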