ingestify 0.6.4__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. ingestify/__init__.py +2 -1
  2. ingestify/application/dataset_store.py +228 -11
  3. ingestify/application/ingestion_engine.py +232 -7
  4. ingestify/application/loader.py +163 -28
  5. ingestify/cmdline.py +0 -48
  6. ingestify/domain/models/__init__.py +2 -0
  7. ingestify/domain/models/dataset/collection.py +0 -9
  8. ingestify/domain/models/dataset/dataset_repository.py +4 -0
  9. ingestify/domain/models/dataset/dataset_state.py +5 -0
  10. ingestify/domain/models/dataset/events.py +13 -0
  11. ingestify/domain/models/dataset/file.py +7 -1
  12. ingestify/domain/models/dataset/selector.py +8 -1
  13. ingestify/domain/models/event/event_bus.py +16 -1
  14. ingestify/domain/models/ingestion/ingestion_job.py +23 -4
  15. ingestify/domain/models/resources/dataset_resource.py +0 -1
  16. ingestify/infra/source/statsbomb/base.py +36 -0
  17. ingestify/infra/source/statsbomb/match.py +137 -0
  18. ingestify/infra/source/statsbomb_github.py +46 -44
  19. ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
  20. ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
  21. ingestify/main.py +190 -10
  22. ingestify/utils.py +2 -32
  23. ingestify-0.8.0.dist-info/METADATA +257 -0
  24. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/RECORD +28 -36
  25. ingestify/infra/source/wyscout.py +0 -175
  26. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
  27. ingestify/static/templates/statsbomb_github/database/README.md +0 -1
  28. ingestify/static/templates/statsbomb_github/query.py +0 -14
  29. ingestify/static/templates/wyscout/.env +0 -5
  30. ingestify/static/templates/wyscout/.gitignore +0 -2
  31. ingestify/static/templates/wyscout/README.md +0 -0
  32. ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
  33. ingestify/static/templates/wyscout/database/README.md +0 -1
  34. ingestify/static/templates/wyscout/query.py +0 -14
  35. ingestify-0.6.4.dist-info/METADATA +0 -266
  36. /ingestify/{static/templates/statsbomb_github/README.md → infra/source/statsbomb/__init__.py} +0 -0
  37. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/WHEEL +0 -0
  38. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/entry_points.txt +0 -0
  39. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/top_level.txt +0 -0
ingestify/infra/source/statsbomb/match.py ADDED
@@ -0,0 +1,137 @@
+ from datetime import datetime
+
+ from ingestify import DatasetResource
+ from ingestify.domain.models.dataset.dataset import DatasetState
+
+ from .base import StatsBombBaseAPI
+
+
+ class StatsBombMatchAPI(StatsBombBaseAPI):
+     def discover_selectors(self, dataset_type: str):
+         assert dataset_type == "match"
+
+         competitions = self.get(data_spec_version="v4", path="competitions")
+
+         def get_last_modified(competition):
+             if not competition["match_updated"]:
+                 return None
+
+             last_modified = datetime.fromisoformat(
+                 competition["match_updated"] + "+00:00"
+             )
+             if competition["match_updated_360"]:
+                 last_modified = max(
+                     last_modified,
+                     datetime.fromisoformat(competition["match_updated_360"] + "+00:00"),
+                 )
+             return last_modified
+
+         return [
+             dict(
+                 competition_id=competition["competition_id"],
+                 season_id=competition["season_id"],
+                 # Passing the LastModified for an entire competition allows Ingestify to entirely skip
+                 # this Selector based on a datetime based check. Dataset comparison won't happen. When the
+                 # DataSpecVersion is changed, but LastModified isn't changed on the Source, new files ARE NOT ingested!
+                 _last_modified=get_last_modified(competition),
+             )
+             for competition in competitions
+         ]
+
+     def find_datasets(
+         self,
+         dataset_type: str,
+         competition_id: str,
+         season_id: str,
+         match_id: str = None,
+         data_spec_versions=None,
+         dataset_collection_metadata=None,
+     ):
+         assert dataset_type == "match"
+
+         match_data_spec_version = data_spec_versions.get_version("match")
+
+         matches = self.get(
+             path=f"competitions/{competition_id}/seasons/{season_id}/matches",
+             data_spec_version=match_data_spec_version,
+         )
+
+         for match in matches:
+             if match_id:
+                 if match["match_id"] != match_id:
+                     continue
+
+             last_modified = datetime.fromisoformat(match["last_updated"] + "+00:00")
+
+             if match["collection_status"] == "Complete":
+                 if match["match_status"] == "available":
+                     state = DatasetState.COMPLETE
+                 else:
+                     # This could be "processing"
+                     state = DatasetState.PARTIAL
+             else:
+                 state = DatasetState.SCHEDULED
+
+             name = (
+                 f"{match['match_date']} / "
+                 f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+             )
+
+             dataset_resource = DatasetResource(
+                 dataset_resource_id=dict(
+                     competition_id=competition_id,
+                     season_id=season_id,
+                     match_id=match["match_id"],
+                 ),
+                 dataset_type=dataset_type,
+                 provider=self.provider,
+                 name=name,
+                 metadata=match,
+                 state=state,
+             )
+
+             dataset_resource.add_file(
+                 last_modified=last_modified,
+                 data_feed_key="match",
+                 data_spec_version=match_data_spec_version,
+                 json_content=match,
+             )
+
+             if state.is_complete:
+                 name += f" / {match['home_score']}-{match['away_score']}"
+
+             for data_feed_key in ["lineups", "events"]:
+                 for data_spec_version in data_spec_versions[data_feed_key]:
+                     dataset_resource.add_file(
+                         # Note: the LastModified value can be incorrect when only match Metadata (match file)
+                         # is changed. Use it anyway for indication. Ingestify will also use the
+                         # Dataset.last_modified_at value to determine if a file should be refetched
+                         last_modified=last_modified,
+                         data_feed_key=data_feed_key,
+                         data_spec_version=data_spec_version,
+                         url=self.get_url(
+                             data_feed_key, data_spec_version, match["match_id"]
+                         ),
+                         http_options=dict(auth=(self.username, self.password)),
+                         data_serialization_format="json",
+                     )
+
+             if (
+                 match["last_updated_360"]
+                 and match["match_status_360"] == "available"
+             ):
+                 for data_spec_version in data_spec_versions.get("360-frames", []):
+                     dataset_resource.add_file(
+                         last_modified=datetime.fromisoformat(
+                             match["last_updated_360"] + "+00:00"
+                         ),
+                         data_feed_key="360-frames",
+                         data_spec_version=data_spec_version,
+                         url=self.get_url(
+                             "360-frames", data_spec_version, match["match_id"]
+                         ),
+                         http_options=dict(auth=(self.username, self.password)),
+                         data_serialization_format="json",
+                     )
+
+             yield dataset_resource
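The new source splits the work cleanly: discover_selectors() emits one selector per competition/season and attaches _last_modified so Ingestify can skip untouched competitions without comparing datasets, while find_datasets() yields a DatasetResource per match with match, lineups, events and (when available) 360-frames files. A quick way to exercise it is the debug_source() helper added to ingestify/main.py in this same release; in the sketch below the StatsBombMatchAPI constructor arguments are assumptions (StatsBombBaseAPI is not shown in this diff) and the data spec versions are illustrative only.

# Hedged sketch: run the new StatsBomb match source against a throwaway local store.
from ingestify.infra.source.statsbomb.match import StatsBombMatchAPI
from ingestify.main import debug_source

source = StatsBombMatchAPI(
    name="statsbomb",        # assumed constructor kwargs; the base class is not part of this hunk
    username="<api-user>",
    password="<api-password>",
)

engine = debug_source(
    source,
    dataset_type="match",
    data_spec_versions={"match": "v6", "lineups": "v4", "events": "v8"},  # illustrative versions
    competition_id=46,       # optional filter applied to the discovered selectors
)
print(f"Ingested {len(engine.store.get_dataset_collection())} dataset(s)")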
ingestify/infra/source/statsbomb_github.py CHANGED
@@ -21,6 +21,7 @@ class StatsbombGithub(Source):
              dict(
                  competition_id=competition["competition_id"],
                  season_id=competition["season_id"],
+                 _name=f"{competition['competition_name']} - {competition['season_name']}",
              )
              for competition in competitions
          ]
@@ -53,53 +54,54 @@ class StatsbombGithub(Source):
              name = (
                  f"{match['match_date']} / "
                  f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+                 f" / {match['home_score']}-{match['away_score']}"
              )

-             dataset_resource = DatasetResource(
-                 dataset_resource_id=dict(
-                     competition_id=competition_id,
-                     season_id=season_id,
-                     match_id=match["match_id"],
-                 ),
-                 dataset_type=dataset_type,
-                 provider=self.provider,
-                 name=name,
-                 metadata=match,
-                 state=state,
+             dataset_resource = (
+                 DatasetResource(
+                     dataset_resource_id=dict(
+                         competition_id=competition_id,
+                         season_id=season_id,
+                         match_id=match["match_id"],
+                     ),
+                     dataset_type=dataset_type,
+                     provider=self.provider,
+                     name=name,
+                     metadata=match,
+                     state=state,
+                 )
+                 .add_file(
+                     last_modified=last_modified,
+                     data_feed_key="match",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     json_content=match,
+                 )
+                 .add_file(
+                     last_modified=last_modified,
+                     data_feed_key="lineups",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     url=f"{BASE_URL}/lineups/{match['match_id']}.json",
+                     data_serialization_format="json",
+                 )
+                 .add_file(
+                     last_modified=last_modified,
+                     data_feed_key="events",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     url=f"{BASE_URL}/events/{match['match_id']}.json",
+                     data_serialization_format="json",
+                 )
              )

-             dataset_resource.add_file(
-                 last_modified=last_modified,
-                 data_feed_key="match",
-                 data_spec_version=DATA_SPEC_VERSION,
-                 json_content=match,
-             )
-
-             if state.is_complete:
-                 name += f" / {match['home_score']}-{match['away_score']}"
-
-             for data_feed_key in ["lineups", "events"]:
-                 dataset_resource.add_file(
-                     last_modified=last_modified,
-                     data_feed_key=data_feed_key,
-                     data_spec_version=DATA_SPEC_VERSION,
-                     url=f"{BASE_URL}/{data_feed_key}/{match['match_id']}.json",
-                     data_serialization_format="json",
-                 )
-
-             if (
-                 match["last_updated_360"]
-                 and match["match_status_360"] == "available"
-             ):
-                 dataset_resource.add_file(
-                     last_modified=datetime.fromisoformat(
-                         match["last_updated_360"] + "+00:00"
-                     ),
-                     data_feed_key="360-frames",
-                     data_spec_version=DATA_SPEC_VERSION,
-                     url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
-                     data_serialization_format="json",
-                     http_options={"ignore_not_found": True},
-                 )
+             if match["last_updated_360"] and match["match_status_360"] == "available":
+                 dataset_resource.add_file(
+                     last_modified=datetime.fromisoformat(
+                         match["last_updated_360"] + "+00:00"
+                     ),
+                     data_feed_key="360-frames",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
+                     data_serialization_format="json",
+                     http_options={"ignore_not_found": True},
+                 )

              yield dataset_resource
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED
@@ -29,6 +29,7 @@ from ingestify.domain.models import (
      Dataset,
      DatasetCollection,
      DatasetRepository,
+     DatasetState,
      Selector,
  )
  from ingestify.domain.models.dataset.collection_metadata import (
@@ -46,6 +47,7 @@ from .tables import (
      revision_table,
      ingestion_job_summary_table,
      task_summary_table,
+     store_version_table,
  )

  logger = logging.getLogger(__name__)
@@ -159,6 +161,10 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          entities: list[dict],
          immutable_rows: bool = False,
      ):
+         if not entities:
+             # Nothing to do
+             return
+
          dialect = self.dialect.name
          if dialect == "mysql":
              from sqlalchemy.dialects.mysql import insert
@@ -230,6 +236,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          provider: Optional[str] = None,
          dataset_id: Optional[Union[str, List[str]]] = None,
          selector: Optional[Union[Selector, List[Selector]]] = None,
+         dataset_state: Optional[List[DatasetState]] = None,
      ):
          if dataset_id is not None:
              if isinstance(dataset_id, list):
@@ -306,6 +313,8 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
              query = query.filter(dataset_table.c.dataset_type == dataset_type)
          if provider:
              query = query.filter(dataset_table.c.provider == provider)
+         if dataset_state:
+             query = query.filter(dataset_table.c.state.in_(dataset_state))

          return query

@@ -395,6 +404,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          dataset_id: Optional[Union[str, List[str]]] = None,
          selector: Optional[Union[Selector, List[Selector]]] = None,
          metadata_only: bool = False,
+         page: Optional[int] = None,
+         page_size: Optional[int] = None,
+         dataset_state: Optional[List[DatasetState]] = None,
      ) -> DatasetCollection:
          def apply_query_filter(query):
              return self._filter_query(
@@ -404,15 +416,23 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                  provider=provider,
                  dataset_id=dataset_id,
                  selector=selector,
+                 dataset_state=dataset_state,
              )

          with self.session:
              # Use a contextmanager to make sure it's closed afterwards

              if not metadata_only:
+                 # Apply sorting by created_at in ascending order
                  dataset_query = apply_query_filter(
                      self.session.query(dataset_table.c.dataset_id)
-                 )
+                 ).order_by(dataset_table.c.created_at.asc())
+
+                 # Apply pagination if both page and page_size are provided
+                 if page is not None and page_size is not None:
+                     offset = (page - 1) * page_size
+                     dataset_query = dataset_query.offset(offset).limit(page_size)
+
                  self._debug_query(dataset_query)
                  dataset_ids = [row.dataset_id for row in dataset_query]
                  datasets = self._load_datasets(dataset_ids)
@@ -503,19 +523,25 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          with self.connect() as connection:
              try:
                  # Delete modified files related to the dataset
-                 file_table.delete().where(
-                     file_table.c.dataset_id == dataset.dataset_id
-                 ).execute()
+                 connection.execute(
+                     file_table.delete().where(
+                         file_table.c.dataset_id == dataset.dataset_id
+                     )
+                 )

                  # Delete revisions related to the dataset
-                 revision_table.delete().where(
-                     revision_table.c.dataset_id == dataset.dataset_id
-                 ).execute()
+                 connection.execute(
+                     revision_table.delete().where(
+                         revision_table.c.dataset_id == dataset.dataset_id
+                     )
+                 )

                  # Delete the dataset itself
-                 dataset_table.delete().where(
-                     dataset_table.c.dataset_id == dataset.dataset_id
-                 ).execute()
+                 connection.execute(
+                     dataset_table.delete().where(
+                         dataset_table.c.dataset_id == dataset.dataset_id
+                     )
+                 )

                  connection.commit()
              except Exception:
@@ -606,3 +632,44 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                  )
              )
          return ingestion_job_summaries
+
+     def get_store_version(self) -> Optional[str]:
+         """Get the current Ingestify version stored for this store."""
+         with self.session:
+             row = self.session.query(store_version_table.c.ingestify_version).first()
+             return row.ingestify_version if row else None
+
+     def set_store_version(self, version: str):
+         """Set the Ingestify version for this store."""
+         from ingestify.utils import utcnow
+
+         now = utcnow()
+         entity = {
+             "id": 1,
+             "ingestify_version": version,
+             "created_at": now,
+             "updated_at": now,
+         }
+
+         with self.connect() as connection:
+             try:
+                 self._upsert(connection, store_version_table, [entity])
+                 connection.commit()
+             except Exception:
+                 connection.rollback()
+                 raise
+
+     def ensure_compatible_version(self, current_version: str):
+         """Ensure the store is compatible with the current Ingestify version."""
+         stored_version = self.get_store_version()
+
+         if stored_version is None:
+             # First time setup - store the current version
+             self.set_store_version(current_version)
+             logger.info(f"Initialized store with Ingestify version {current_version}")
+         elif stored_version != current_version:
+             # Version mismatch - for now just log, future: trigger migration
+             logger.warning(
+                 f"Store version mismatch: stored={stored_version}, current={current_version}. "
+                 f"Future versions may require migration."
+             )
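Together with the store_version table added to tables.py below, these methods give the repository simple schema-version bookkeeping: the first call records the running Ingestify version, and later calls log a warning on mismatch. A minimal sketch of how a caller might wire this up (the actual call site is not part of this diff, and the helper name below is hypothetical):

# Hedged sketch: guard a repository against version drift before using it.
from importlib.metadata import version

def open_repository(repository):  # hypothetical helper, not from this diff
    # Records the running Ingestify version on first use; afterwards
    # ensure_compatible_version() only warns when the stored version differs.
    repository.ensure_compatible_version(version("ingestify"))
    return repository

The same section also adds created_at ordering plus optional page/page_size arguments to get_dataset_collection(), e.g. repository.get_dataset_collection(page=1, page_size=100), and a dataset_state filter backed by the new DatasetState import.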
ingestify/infra/store/dataset/sqlalchemy/tables.py CHANGED
@@ -84,6 +84,7 @@ class PathString(TypeDecorator):


  class DatasetStateString(TypeDecorator):
+     cache_ok = True
      impl = String(255)

      def process_bind_param(self, value: DatasetState, dialect):
@@ -318,6 +319,15 @@ task_summary_table = Table(
      # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
      # Column("source", JSONType()),
  )
+
+ store_version_table = Table(
+     "store_version",
+     metadata,
+     Column("id", Integer, primary_key=True, default=1),
+     Column("ingestify_version", String(255), nullable=False),
+     Column("created_at", TZDateTime(6), nullable=False),
+     Column("updated_at", TZDateTime(6), nullable=False),
+ )
  #
  #
  # mapper_registry = registry()
ingestify/main.py CHANGED
@@ -138,12 +138,16 @@ def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:

  def get_source_cls(key: str) -> Type[Source]:
      if key.startswith("ingestify."):
-         _, type_ = key.split(".")
+         _, type_ = key.split(".", maxsplit=1)
          if type_ == "wyscout":
              from ingestify.infra.source.wyscout import Wyscout

              return Wyscout

+         elif type_ == "statsbomb.match":
+             from ingestify.infra.source.statsbomb.match import StatsBombMatchAPI
+
+             return StatsBombMatchAPI
          elif type_ == "statsbomb_github":
              from ingestify.infra.source.statsbomb_github import StatsbombGithub

@@ -183,15 +187,36 @@ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:


  def get_engine(
-     config_file, bucket: Optional[str] = None, disable_events: bool = False
+     config_file: Optional[str] = None,
+     bucket: Optional[str] = None,
+     disable_events: bool = False,
+     metadata_url: Optional[str] = None,
+     file_url: Optional[str] = None,
  ) -> IngestionEngine:
-     config = parse_config(config_file, default_value="")
-
-     logger.info("Initializing sources")
      sources = {}
-     sys.path.append(os.path.dirname(config_file))
-     for name, source_args in config["sources"].items():
-         sources[name] = build_source(name=name, source_args=source_args)
+
+     if not config_file:
+         if not metadata_url or not file_url:
+             raise ValueError(
+                 f"You must specify metadata_url and file_url in case you don't use a config_file"
+             )
+
+         config = {
+             "main": {
+                 "metadata_url": metadata_url,
+                 "file_url": file_url,
+                 "default_bucket": bucket or "main",
+             }
+         }
+     elif not config_file:
+         raise ValueError("You must specify a config file")
+     else:
+         config = parse_config(config_file, default_value="")
+
+     logger.info("Initializing sources")
+     sys.path.append(os.path.dirname(config_file))
+     for name, source_args in config.get("sources", {}).items():
+         sources[name] = build_source(name=name, source_args=source_args)

      logger.info("Initializing IngestionEngine")
      store = get_dataset_store_by_urls(
@@ -244,13 +269,168 @@
          # but makes it easier later one where we loop over selectors.
          selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]

-         ingestion_plan = IngestionPlan(
+         ingestion_plan_ = IngestionPlan(
              source=sources[ingestion_plan["source"]],
              dataset_type=ingestion_plan["dataset_type"],
              selectors=selectors,
              fetch_policy=fetch_policy,
              data_spec_versions=data_spec_versions,
          )
-         ingestion_engine.add_ingestion_plan(ingestion_plan)
+         ingestion_engine.add_ingestion_plan(ingestion_plan_)

      return ingestion_engine
+
+
+ def get_dev_engine(
+     source: Source,
+     dataset_type: str,
+     data_spec_versions: dict,
+     ephemeral: bool = True,
+     configure_logging: bool = True,
+     dev_dir: Optional[str] = None,
+ ) -> IngestionEngine:
+     """
+     Quick development helper - creates an engine with minimal setup.
+
+     Args:
+         source: The source to test
+         dataset_type: Dataset type to ingest
+         data_spec_versions: Dict like {"hops": "v1"}
+         ephemeral: If True, uses temp dir that gets cleaned. If False, uses persistent /tmp storage.
+         configure_logging: If True, configures basic logging (default: True)
+         dev_dir: Optional custom directory for data storage (overrides ephemeral)
+
+     Returns:
+         IngestionEngine configured for development
+
+     Example:
+         >>> source = MySource(name="test", ...)
+         >>> engine = get_dev_engine(source, "hops", {"hops": "v1"})
+         >>> engine.run()
+         >>>
+         >>> # Access the datasets
+         >>> datasets = engine.store.get_dataset_collection()
+         >>> print(f"Ingested {len(datasets)} datasets")
+     """
+     import tempfile
+     from pathlib import Path
+
+     if configure_logging:
+         logging.basicConfig(
+             level=logging.INFO,
+             format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+         )
+
+     if dev_dir:
+         # Use provided directory
+         dev_dir = Path(dev_dir)
+     elif ephemeral:
+         # Use temp directory that will be cleaned up
+         import uuid
+
+         dev_dir = Path(tempfile.gettempdir()) / f"ingestify-dev-{uuid.uuid4().hex[:8]}"
+     else:
+         # Use persistent directory
+         dev_dir = Path(tempfile.gettempdir()) / "ingestify-dev"
+
+     dev_dir.mkdir(parents=True, exist_ok=True)
+     metadata_url = f"sqlite:///{dev_dir / 'database.db'}"
+     file_url = f"file://{dev_dir}"
+
+     logger.info(f"Dev mode: storing data in {dev_dir}")
+
+     engine = get_engine(
+         metadata_url=metadata_url,
+         file_url=file_url,
+         bucket="main",
+         disable_events=True,
+     )
+
+     data_spec_versions_obj = DataSpecVersionCollection.from_dict(data_spec_versions)
+
+     engine.add_ingestion_plan(
+         IngestionPlan(
+             source=source,
+             dataset_type=dataset_type,
+             selectors=[Selector.build({}, data_spec_versions=data_spec_versions_obj)],
+             fetch_policy=FetchPolicy(),
+             data_spec_versions=data_spec_versions_obj,
+         )
+     )
+
+     return engine
+
+
+ def debug_source(
+     source: Source,
+     *,
+     dataset_type: str,
+     data_spec_versions: dict,
+     ephemeral: bool = True,
+     configure_logging: bool = True,
+     dev_dir: Optional[str] = None,
+     **kwargs,
+ ) -> IngestionEngine:
+     """
+     Debug helper - creates a dev engine, runs ingestion, and shows results.
+
+     This is a convenience wrapper around get_dev_engine() that does everything:
+     creates the engine, runs ingestion, and displays results.
+
+     Args:
+         source: The source to debug
+         dataset_type: Dataset type (e.g., "match")
+         data_spec_versions: Dict like {"match": "v1"} - explicit, no defaults!
+         ephemeral: If True, uses temp dir. If False, uses persistent /tmp storage.
+         configure_logging: If True, configures basic logging (default: True)
+         dev_dir: Optional custom directory for data storage (overrides ephemeral)
+         **kwargs: Selector arguments. For sources with discover_selectors(), these
+             filter discovered selectors. Otherwise passed to find_datasets().
+
+     Returns:
+         IngestionEngine: The engine used for ingestion (for further inspection)
+
+     Example:
+         >>> # Simple source without discover_selectors
+         >>> source = StatsBombHOPSS3(name="test", s3_bucket="my-bucket", s3_prefix="HOPS")
+         >>> engine = debug_source(source, dataset_type="hops", data_spec_versions={"hops": "v1"})
+
+         >>> # Source with discover_selectors - discovers all competitions
+         >>> source = StatsBombMatchAPI(name="test", ...)
+         >>> engine = debug_source(
+         ...     source,
+         ...     dataset_type="match",
+         ...     data_spec_versions={"match": "v6"}
+         ... )
+
+         >>> # Filter discovered selectors
+         >>> engine = debug_source(
+         ...     source,
+         ...     dataset_type="match",
+         ...     data_spec_versions={"match": "v6"},
+         ...     competition_id=46  # Filters to specific competition
+         ... )
+     """
+     logger.info(f"Debug mode for source: {source.name}")
+
+     engine = get_dev_engine(
+         source=source,
+         dataset_type=dataset_type,
+         data_spec_versions=data_spec_versions,
+         ephemeral=ephemeral,
+         configure_logging=configure_logging,
+         dev_dir=dev_dir,
+     )
+
+     # Run ingestion
+     # Empty selector {} automatically triggers discover_selectors() if available
+     # kwargs filter discovered selectors or are passed to find_datasets()
+     engine.run(**kwargs)
+
+     # Show results
+     datasets = engine.store.get_dataset_collection()
+     logger.info("=" * 60)
+     logger.info(f"✓ Ingestion complete: {len(datasets)} dataset(s)")
+     logger.info("=" * 60)
+
+     return engine
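Beyond the dev helpers, get_engine() itself can now be constructed without a YAML config by passing metadata_url and file_url directly, which is exactly what get_dev_engine() does internally. A minimal sketch of that intended usage, with placeholder URLs:

# Hedged sketch of the config-less get_engine() path; both URLs are placeholders.
from ingestify.main import get_engine

engine = get_engine(
    metadata_url="sqlite:///ingestify.db",     # placeholder metadata database DSN
    file_url="file:///tmp/ingestify-files",    # placeholder file store location
    bucket="main",
    disable_events=True,
)
# Without a config file no sources are registered; add IngestionPlans
# programmatically (as get_dev_engine() does above) before calling engine.run().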