ingestify 0.6.4__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +2 -1
- ingestify/application/dataset_store.py +228 -11
- ingestify/application/ingestion_engine.py +232 -7
- ingestify/application/loader.py +163 -28
- ingestify/cmdline.py +0 -48
- ingestify/domain/models/__init__.py +2 -0
- ingestify/domain/models/dataset/collection.py +0 -9
- ingestify/domain/models/dataset/dataset_repository.py +4 -0
- ingestify/domain/models/dataset/dataset_state.py +5 -0
- ingestify/domain/models/dataset/events.py +13 -0
- ingestify/domain/models/dataset/file.py +7 -1
- ingestify/domain/models/dataset/selector.py +8 -1
- ingestify/domain/models/event/event_bus.py +16 -1
- ingestify/domain/models/ingestion/ingestion_job.py +23 -4
- ingestify/domain/models/resources/dataset_resource.py +0 -1
- ingestify/infra/source/statsbomb/base.py +36 -0
- ingestify/infra/source/statsbomb/match.py +137 -0
- ingestify/infra/source/statsbomb_github.py +46 -44
- ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
- ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
- ingestify/main.py +190 -10
- ingestify/utils.py +2 -32
- ingestify-0.8.0.dist-info/METADATA +257 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/RECORD +28 -36
- ingestify/infra/source/wyscout.py +0 -175
- ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
- ingestify/static/templates/statsbomb_github/database/README.md +0 -1
- ingestify/static/templates/statsbomb_github/query.py +0 -14
- ingestify/static/templates/wyscout/.env +0 -5
- ingestify/static/templates/wyscout/.gitignore +0 -2
- ingestify/static/templates/wyscout/README.md +0 -0
- ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
- ingestify/static/templates/wyscout/database/README.md +0 -1
- ingestify/static/templates/wyscout/query.py +0 -14
- ingestify-0.6.4.dist-info/METADATA +0 -266
- /ingestify/{static/templates/statsbomb_github/README.md → infra/source/statsbomb/__init__.py} +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/WHEEL +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/top_level.txt +0 -0
ingestify/infra/source/statsbomb/match.py
ADDED
@@ -0,0 +1,137 @@
+from datetime import datetime
+
+from ingestify import DatasetResource
+from ingestify.domain.models.dataset.dataset import DatasetState
+
+from .base import StatsBombBaseAPI
+
+
+class StatsBombMatchAPI(StatsBombBaseAPI):
+    def discover_selectors(self, dataset_type: str):
+        assert dataset_type == "match"
+
+        competitions = self.get(data_spec_version="v4", path="competitions")
+
+        def get_last_modified(competition):
+            if not competition["match_updated"]:
+                return None
+
+            last_modified = datetime.fromisoformat(
+                competition["match_updated"] + "+00:00"
+            )
+            if competition["match_updated_360"]:
+                last_modified = max(
+                    last_modified,
+                    datetime.fromisoformat(competition["match_updated_360"] + "+00:00"),
+                )
+            return last_modified
+
+        return [
+            dict(
+                competition_id=competition["competition_id"],
+                season_id=competition["season_id"],
+                # Passing the LastModified for an entire competition allows Ingestify to entirely skip
+                # this Selector based on a datetime based check. Dataset comparison won't happen. When the
+                # DataSpecVersion is changed, but LastModified isn't changed on the Source, new files ARE NOT ingested!
+                _last_modified=get_last_modified(competition),
+            )
+            for competition in competitions
+        ]
+
+    def find_datasets(
+        self,
+        dataset_type: str,
+        competition_id: str,
+        season_id: str,
+        match_id: str = None,
+        data_spec_versions=None,
+        dataset_collection_metadata=None,
+    ):
+        assert dataset_type == "match"
+
+        match_data_spec_version = data_spec_versions.get_version("match")
+
+        matches = self.get(
+            path=f"competitions/{competition_id}/seasons/{season_id}/matches",
+            data_spec_version=match_data_spec_version,
+        )
+
+        for match in matches:
+            if match_id:
+                if match["match_id"] != match_id:
+                    continue
+
+            last_modified = datetime.fromisoformat(match["last_updated"] + "+00:00")
+
+            if match["collection_status"] == "Complete":
+                if match["match_status"] == "available":
+                    state = DatasetState.COMPLETE
+                else:
+                    # This could be "processing"
+                    state = DatasetState.PARTIAL
+            else:
+                state = DatasetState.SCHEDULED
+
+            name = (
+                f"{match['match_date']} / "
+                f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+            )
+
+            dataset_resource = DatasetResource(
+                dataset_resource_id=dict(
+                    competition_id=competition_id,
+                    season_id=season_id,
+                    match_id=match["match_id"],
+                ),
+                dataset_type=dataset_type,
+                provider=self.provider,
+                name=name,
+                metadata=match,
+                state=state,
+            )
+
+            dataset_resource.add_file(
+                last_modified=last_modified,
+                data_feed_key="match",
+                data_spec_version=match_data_spec_version,
+                json_content=match,
+            )
+
+            if state.is_complete:
+                name += f" / {match['home_score']}-{match['away_score']}"
+
+                for data_feed_key in ["lineups", "events"]:
+                    for data_spec_version in data_spec_versions[data_feed_key]:
+                        dataset_resource.add_file(
+                            # Note: the LastModified value can be incorrect when only match Metadata (match file)
+                            # is changed. Use it anyway for indication. Ingestify will also use the
+                            # Dataset.last_modified_at value to determine if a file should be refetched
+                            last_modified=last_modified,
+                            data_feed_key=data_feed_key,
+                            data_spec_version=data_spec_version,
+                            url=self.get_url(
+                                data_feed_key, data_spec_version, match["match_id"]
+                            ),
+                            http_options=dict(auth=(self.username, self.password)),
+                            data_serialization_format="json",
+                        )
+
+                if (
+                    match["last_updated_360"]
+                    and match["match_status_360"] == "available"
+                ):
+                    for data_spec_version in data_spec_versions.get("360-frames", []):
+                        dataset_resource.add_file(
+                            last_modified=datetime.fromisoformat(
+                                match["last_updated_360"] + "+00:00"
+                            ),
+                            data_feed_key="360-frames",
+                            data_spec_version=data_spec_version,
+                            url=self.get_url(
+                                "360-frames", data_spec_version, match["match_id"]
+                            ),
+                            http_options=dict(auth=(self.username, self.password)),
+                            data_serialization_format="json",
+                        )
+
+            yield dataset_resource
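The new find_datasets() derives a DatasetState from two StatsBomb status fields (collection_status and match_status). A minimal standalone sketch of that mapping, using the states named in the hunk above (the resolve_state helper is hypothetical, not part of the package):

    from enum import Enum

    class DatasetState(Enum):
        SCHEDULED = "scheduled"
        PARTIAL = "partial"
        COMPLETE = "complete"

    def resolve_state(match: dict) -> DatasetState:
        # Mirrors the branching in StatsBombMatchAPI.find_datasets above
        if match["collection_status"] == "Complete":
            if match["match_status"] == "available":
                return DatasetState.COMPLETE
            return DatasetState.PARTIAL  # e.g. still "processing"
        return DatasetState.SCHEDULED

    assert resolve_state(
        {"collection_status": "Complete", "match_status": "available"}
    ) is DatasetState.COMPLETE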
ingestify/infra/source/statsbomb_github.py
CHANGED
@@ -21,6 +21,7 @@ class StatsbombGithub(Source):
             dict(
                 competition_id=competition["competition_id"],
                 season_id=competition["season_id"],
+                _name=f"{competition['competition_name']} - {competition['season_name']}",
             )
             for competition in competitions
         ]
@@ -53,53 +54,54 @@ class StatsbombGithub(Source):
             name = (
                 f"{match['match_date']} / "
                 f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+                f" / {match['home_score']}-{match['away_score']}"
             )

-            dataset_resource = DatasetResource(
-                dataset_resource_id=dict(
-                    competition_id=competition_id,
-                    season_id=season_id,
-                    match_id=match["match_id"],
-                ),
-                dataset_type=dataset_type,
-                provider=self.provider,
-                name=name,
-                metadata=match,
-                state=state,
+            dataset_resource = (
+                DatasetResource(
+                    dataset_resource_id=dict(
+                        competition_id=competition_id,
+                        season_id=season_id,
+                        match_id=match["match_id"],
+                    ),
+                    dataset_type=dataset_type,
+                    provider=self.provider,
+                    name=name,
+                    metadata=match,
+                    state=state,
+                )
+                .add_file(
+                    last_modified=last_modified,
+                    data_feed_key="match",
+                    data_spec_version=DATA_SPEC_VERSION,
+                    json_content=match,
+                )
+                .add_file(
+                    last_modified=last_modified,
+                    data_feed_key="lineups",
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/lineups/{match['match_id']}.json",
+                    data_serialization_format="json",
+                )
+                .add_file(
+                    last_modified=last_modified,
+                    data_feed_key="events",
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/events/{match['match_id']}.json",
+                    data_serialization_format="json",
+                )
             )

-
-
-
-
-
-
-
-
-
-
-
-                dataset_resource.add_file(
-                    last_modified=last_modified,
-                    data_feed_key=data_feed_key,
-                    data_spec_version=DATA_SPEC_VERSION,
-                    url=f"{BASE_URL}/{data_feed_key}/{match['match_id']}.json",
-                    data_serialization_format="json",
-                )
-
-            if (
-                match["last_updated_360"]
-                and match["match_status_360"] == "available"
-            ):
-                dataset_resource.add_file(
-                    last_modified=datetime.fromisoformat(
-                        match["last_updated_360"] + "+00:00"
-                    ),
-                    data_feed_key="360-frames",
-                    data_spec_version=DATA_SPEC_VERSION,
-                    url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
-                    data_serialization_format="json",
-                    http_options={"ignore_not_found": True},
-                )
+            if match["last_updated_360"] and match["match_status_360"] == "available":
+                dataset_resource.add_file(
+                    last_modified=datetime.fromisoformat(
+                        match["last_updated_360"] + "+00:00"
+                    ),
+                    data_feed_key="360-frames",
+                    data_spec_version=DATA_SPEC_VERSION,
+                    url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
+                    data_serialization_format="json",
+                    http_options={"ignore_not_found": True},
+                )

             yield dataset_resource
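The chained construction above only works if add_file() returns the resource it was called on. A self-contained toy version of that fluent pattern (the Resource class is illustrative only, not ingestify's actual DatasetResource):

    class Resource:
        def __init__(self, name: str):
            self.name = name
            self.files = []

        def add_file(self, **kwargs) -> "Resource":
            self.files.append(kwargs)
            return self  # returning self is what enables chaining

    resource = (
        Resource("match-12345")
        .add_file(data_feed_key="match")
        .add_file(data_feed_key="lineups")
        .add_file(data_feed_key="events")
    )
    assert len(resource.files) == 3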
ingestify/infra/store/dataset/sqlalchemy/repository.py
CHANGED
@@ -29,6 +29,7 @@ from ingestify.domain.models import (
     Dataset,
     DatasetCollection,
     DatasetRepository,
+    DatasetState,
     Selector,
 )
 from ingestify.domain.models.dataset.collection_metadata import (
@@ -46,6 +47,7 @@ from .tables import (
     revision_table,
     ingestion_job_summary_table,
     task_summary_table,
+    store_version_table,
 )

 logger = logging.getLogger(__name__)
@@ -159,6 +161,10 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         entities: list[dict],
         immutable_rows: bool = False,
     ):
+        if not entities:
+            # Nothing to do
+            return
+
         dialect = self.dialect.name
         if dialect == "mysql":
             from sqlalchemy.dialects.mysql import insert
@@ -230,6 +236,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         provider: Optional[str] = None,
         dataset_id: Optional[Union[str, List[str]]] = None,
         selector: Optional[Union[Selector, List[Selector]]] = None,
+        dataset_state: Optional[List[DatasetState]] = None,
     ):
         if dataset_id is not None:
             if isinstance(dataset_id, list):
@@ -306,6 +313,8 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             query = query.filter(dataset_table.c.dataset_type == dataset_type)
         if provider:
             query = query.filter(dataset_table.c.provider == provider)
+        if dataset_state:
+            query = query.filter(dataset_table.c.state.in_(dataset_state))

         return query

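The new dataset_state filter relies on SQLAlchemy's Column.in_() construct. A minimal runnable sketch against an in-memory SQLite database (the schema and plain-string states are a stand-in for ingestify's real dataset table and DatasetState type):

    from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, select

    metadata = MetaData()
    datasets = Table(
        "dataset",
        metadata,
        Column("dataset_id", Integer, primary_key=True),
        Column("state", String(32)),
    )

    engine = create_engine("sqlite:///:memory:")
    metadata.create_all(engine)
    with engine.begin() as conn:
        conn.execute(
            datasets.insert(),
            [{"dataset_id": 1, "state": "COMPLETE"}, {"dataset_id": 2, "state": "SCHEDULED"}],
        )
        # Same shape as: query.filter(dataset_table.c.state.in_(dataset_state))
        query = select(datasets.c.dataset_id).where(
            datasets.c.state.in_(["COMPLETE", "PARTIAL"])
        )
        assert [row.dataset_id for row in conn.execute(query)] == [1]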
@@ -395,6 +404,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         dataset_id: Optional[Union[str, List[str]]] = None,
         selector: Optional[Union[Selector, List[Selector]]] = None,
         metadata_only: bool = False,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        dataset_state: Optional[List[DatasetState]] = None,
     ) -> DatasetCollection:
         def apply_query_filter(query):
             return self._filter_query(
@@ -404,15 +416,23 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                 provider=provider,
                 dataset_id=dataset_id,
                 selector=selector,
+                dataset_state=dataset_state,
             )

         with self.session:
             # Use a contextmanager to make sure it's closed afterwards

             if not metadata_only:
+                # Apply sorting by created_at in ascending order
                 dataset_query = apply_query_filter(
                     self.session.query(dataset_table.c.dataset_id)
-                )
+                ).order_by(dataset_table.c.created_at.asc())
+
+                # Apply pagination if both page and page_size are provided
+                if page is not None and page_size is not None:
+                    offset = (page - 1) * page_size
+                    dataset_query = dataset_query.offset(offset).limit(page_size)
+
                 self._debug_query(dataset_query)
                 dataset_ids = [row.dataset_id for row in dataset_query]
                 datasets = self._load_datasets(dataset_ids)
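The pagination added above is the standard 1-based page to OFFSET/LIMIT translation. In isolation:

    def page_window(page: int, page_size: int) -> tuple:
        offset = (page - 1) * page_size  # same arithmetic as the hunk above
        return offset, page_size         # maps to .offset(...).limit(...)

    assert page_window(1, 50) == (0, 50)    # page 1 starts at row 0
    assert page_window(3, 50) == (100, 50)  # page 3 skips two full pages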
@@ -503,19 +523,25 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         with self.connect() as connection:
             try:
                 # Delete modified files related to the dataset
-
-                file_table.
-
+                connection.execute(
+                    file_table.delete().where(
+                        file_table.c.dataset_id == dataset.dataset_id
+                    )
+                )

                 # Delete revisions related to the dataset
-
-                revision_table.
-
+                connection.execute(
+                    revision_table.delete().where(
+                        revision_table.c.dataset_id == dataset.dataset_id
+                    )
+                )

                 # Delete the dataset itself
-
-                dataset_table.
-
+                connection.execute(
+                    dataset_table.delete().where(
+                        dataset_table.c.dataset_id == dataset.dataset_id
+                    )
+                )

                 connection.commit()
             except Exception:
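The deletes above follow the usual child-rows-first order: files and revisions go before the dataset row itself. A runnable SQLAlchemy Core sketch of the same table.delete().where(...) pattern, with a stand-in schema:

    from sqlalchemy import Column, Integer, MetaData, Table, create_engine

    metadata = MetaData()
    dataset_t = Table("dataset", metadata, Column("dataset_id", Integer, primary_key=True))
    file_t = Table(
        "file",
        metadata,
        Column("id", Integer, primary_key=True),
        Column("dataset_id", Integer),
    )

    engine = create_engine("sqlite:///:memory:")
    metadata.create_all(engine)
    with engine.begin() as conn:
        conn.execute(dataset_t.insert(), [{"dataset_id": 1}])
        conn.execute(file_t.insert(), [{"id": 10, "dataset_id": 1}])
        # Children first, then the parent row, as in the diff
        conn.execute(file_t.delete().where(file_t.c.dataset_id == 1))
        conn.execute(dataset_t.delete().where(dataset_t.c.dataset_id == 1))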
@@ -606,3 +632,44 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             )
         )
         return ingestion_job_summaries
+
+    def get_store_version(self) -> Optional[str]:
+        """Get the current Ingestify version stored for this store."""
+        with self.session:
+            row = self.session.query(store_version_table.c.ingestify_version).first()
+            return row.ingestify_version if row else None
+
+    def set_store_version(self, version: str):
+        """Set the Ingestify version for this store."""
+        from ingestify.utils import utcnow
+
+        now = utcnow()
+        entity = {
+            "id": 1,
+            "ingestify_version": version,
+            "created_at": now,
+            "updated_at": now,
+        }
+
+        with self.connect() as connection:
+            try:
+                self._upsert(connection, store_version_table, [entity])
+                connection.commit()
+            except Exception:
+                connection.rollback()
+                raise
+
+    def ensure_compatible_version(self, current_version: str):
+        """Ensure the store is compatible with the current Ingestify version."""
+        stored_version = self.get_store_version()
+
+        if stored_version is None:
+            # First time setup - store the current version
+            self.set_store_version(current_version)
+            logger.info(f"Initialized store with Ingestify version {current_version}")
+        elif stored_version != current_version:
+            # Version mismatch - for now just log, future: trigger migration
+            logger.warning(
+                f"Store version mismatch: stored={stored_version}, current={current_version}. "
+                f"Future versions may require migration."
+            )
ingestify/infra/store/dataset/sqlalchemy/tables.py
CHANGED
@@ -84,6 +84,7 @@ class PathString(TypeDecorator):


 class DatasetStateString(TypeDecorator):
+    cache_ok = True
     impl = String(255)

     def process_bind_param(self, value: DatasetState, dialect):
@@ -318,6 +319,15 @@ task_summary_table = Table(
     # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
     # Column("source", JSONType()),
 )
+
+store_version_table = Table(
+    "store_version",
+    metadata,
+    Column("id", Integer, primary_key=True, default=1),
+    Column("ingestify_version", String(255), nullable=False),
+    Column("created_at", TZDateTime(6), nullable=False),
+    Column("updated_at", TZDateTime(6), nullable=False),
+)
 #
 #
 # mapper_registry = registry()
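store_version is a single-row table: a fixed primary key (id=1) combined with _upsert means repeated set_store_version() calls overwrite one row instead of appending. A sketch of the same idea using SQLAlchemy's SQLite upsert (_upsert selects a dialect-specific insert at runtime; only the mysql branch is visible in an earlier hunk, and timestamps are omitted here for brevity):

    from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine
    from sqlalchemy.dialects.sqlite import insert

    metadata = MetaData()
    store_version = Table(
        "store_version",
        metadata,
        Column("id", Integer, primary_key=True),
        Column("ingestify_version", String(255), nullable=False),
    )

    engine = create_engine("sqlite:///:memory:")
    metadata.create_all(engine)
    stmt = insert(store_version).values(id=1, ingestify_version="0.8.0")
    stmt = stmt.on_conflict_do_update(
        index_elements=["id"],
        set_={"ingestify_version": stmt.excluded.ingestify_version},
    )
    with engine.begin() as conn:
        conn.execute(stmt)  # first run inserts the row
        conn.execute(stmt)  # later runs overwrite it in place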
ingestify/main.py
CHANGED
@@ -138,12 +138,16 @@ def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:

 def get_source_cls(key: str) -> Type[Source]:
     if key.startswith("ingestify."):
-        _, type_ = key.split(".")
+        _, type_ = key.split(".", maxsplit=1)
         if type_ == "wyscout":
             from ingestify.infra.source.wyscout import Wyscout

             return Wyscout

+        elif type_ == "statsbomb.match":
+            from ingestify.infra.source.statsbomb.match import StatsBombMatchAPI
+
+            return StatsBombMatchAPI
         elif type_ == "statsbomb_github":
             from ingestify.infra.source.statsbomb_github import StatsbombGithub

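The maxsplit change is what makes the new dotted source key work; with the old split(".") the key would no longer unpack into two names:

    key = "ingestify.statsbomb.match"

    _, type_ = key.split(".", maxsplit=1)
    assert type_ == "statsbomb.match"

    # key.split(".") yields ["ingestify", "statsbomb", "match"], so the
    # two-target assignment would raise "too many values to unpack".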
@@ -183,15 +187,36 @@ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:


 def get_engine(
-    config_file
+    config_file: Optional[str] = None,
+    bucket: Optional[str] = None,
+    disable_events: bool = False,
+    metadata_url: Optional[str] = None,
+    file_url: Optional[str] = None,
 ) -> IngestionEngine:
-    config = parse_config(config_file, default_value="")
-
-    logger.info("Initializing sources")
     sources = {}
-
-
-
+
+    if not config_file:
+        if not metadata_url or not file_url:
+            raise ValueError(
+                f"You must specify metadata_url and file_url in case you don't use a config_file"
+            )
+
+        config = {
+            "main": {
+                "metadata_url": metadata_url,
+                "file_url": file_url,
+                "default_bucket": bucket or "main",
+            }
+        }
+    elif not config_file:
+        raise ValueError("You must specify a config file")
+    else:
+        config = parse_config(config_file, default_value="")
+
+    logger.info("Initializing sources")
+    sys.path.append(os.path.dirname(config_file))
+    for name, source_args in config.get("sources", {}).items():
+        sources[name] = build_source(name=name, source_args=source_args)

     logger.info("Initializing IngestionEngine")
     store = get_dataset_store_by_urls(
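Based on the signature added above, get_engine() can now be called without a YAML config by passing storage URLs directly. A hedged usage sketch (the URLs are placeholders; only the parameter names come from this diff):

    from ingestify.main import get_engine

    engine = get_engine(
        metadata_url="sqlite:///./database.db",   # SQLAlchemy URL for dataset metadata
        file_url="file:///tmp/ingestify-files",   # where fetched files are stored
        bucket="main",
        disable_events=True,
    )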
@@ -244,13 +269,168 @@ def get_engine(
         # but makes it easier later one where we loop over selectors.
         selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]

-
+        ingestion_plan_ = IngestionPlan(
             source=sources[ingestion_plan["source"]],
             dataset_type=ingestion_plan["dataset_type"],
             selectors=selectors,
             fetch_policy=fetch_policy,
             data_spec_versions=data_spec_versions,
         )
-        ingestion_engine.add_ingestion_plan(
+        ingestion_engine.add_ingestion_plan(ingestion_plan_)

     return ingestion_engine
+
+
+def get_dev_engine(
+    source: Source,
+    dataset_type: str,
+    data_spec_versions: dict,
+    ephemeral: bool = True,
+    configure_logging: bool = True,
+    dev_dir: Optional[str] = None,
+) -> IngestionEngine:
+    """
+    Quick development helper - creates an engine with minimal setup.
+
+    Args:
+        source: The source to test
+        dataset_type: Dataset type to ingest
+        data_spec_versions: Dict like {"hops": "v1"}
+        ephemeral: If True, uses temp dir that gets cleaned. If False, uses persistent /tmp storage.
+        configure_logging: If True, configures basic logging (default: True)
+        dev_dir: Optional custom directory for data storage (overrides ephemeral)
+
+    Returns:
+        IngestionEngine configured for development
+
+    Example:
+        >>> source = MySource(name="test", ...)
+        >>> engine = get_dev_engine(source, "hops", {"hops": "v1"})
+        >>> engine.run()
+        >>>
+        >>> # Access the datasets
+        >>> datasets = engine.store.get_dataset_collection()
+        >>> print(f"Ingested {len(datasets)} datasets")
+    """
+    import tempfile
+    from pathlib import Path
+
+    if configure_logging:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        )
+
+    if dev_dir:
+        # Use provided directory
+        dev_dir = Path(dev_dir)
+    elif ephemeral:
+        # Use temp directory that will be cleaned up
+        import uuid
+
+        dev_dir = Path(tempfile.gettempdir()) / f"ingestify-dev-{uuid.uuid4().hex[:8]}"
+    else:
+        # Use persistent directory
+        dev_dir = Path(tempfile.gettempdir()) / "ingestify-dev"
+
+    dev_dir.mkdir(parents=True, exist_ok=True)
+    metadata_url = f"sqlite:///{dev_dir / 'database.db'}"
+    file_url = f"file://{dev_dir}"
+
+    logger.info(f"Dev mode: storing data in {dev_dir}")
+
+    engine = get_engine(
+        metadata_url=metadata_url,
+        file_url=file_url,
+        bucket="main",
+        disable_events=True,
+    )
+
+    data_spec_versions_obj = DataSpecVersionCollection.from_dict(data_spec_versions)
+
+    engine.add_ingestion_plan(
+        IngestionPlan(
+            source=source,
+            dataset_type=dataset_type,
+            selectors=[Selector.build({}, data_spec_versions=data_spec_versions_obj)],
+            fetch_policy=FetchPolicy(),
+            data_spec_versions=data_spec_versions_obj,
+        )
+    )
+
+    return engine
+
+
+def debug_source(
+    source: Source,
+    *,
+    dataset_type: str,
+    data_spec_versions: dict,
+    ephemeral: bool = True,
+    configure_logging: bool = True,
+    dev_dir: Optional[str] = None,
+    **kwargs,
+) -> IngestionEngine:
+    """
+    Debug helper - creates a dev engine, runs ingestion, and shows results.
+
+    This is a convenience wrapper around get_dev_engine() that does everything:
+    creates the engine, runs ingestion, and displays results.
+
+    Args:
+        source: The source to debug
+        dataset_type: Dataset type (e.g., "match")
+        data_spec_versions: Dict like {"match": "v1"} - explicit, no defaults!
+        ephemeral: If True, uses temp dir. If False, uses persistent /tmp storage.
+        configure_logging: If True, configures basic logging (default: True)
+        dev_dir: Optional custom directory for data storage (overrides ephemeral)
+        **kwargs: Selector arguments. For sources with discover_selectors(), these
+            filter discovered selectors. Otherwise passed to find_datasets().
+
+    Returns:
+        IngestionEngine: The engine used for ingestion (for further inspection)
+
+    Example:
+        >>> # Simple source without discover_selectors
+        >>> source = StatsBombHOPSS3(name="test", s3_bucket="my-bucket", s3_prefix="HOPS")
+        >>> engine = debug_source(source, dataset_type="hops", data_spec_versions={"hops": "v1"})
+
+        >>> # Source with discover_selectors - discovers all competitions
+        >>> source = StatsBombMatchAPI(name="test", ...)
+        >>> engine = debug_source(
+        ...     source,
+        ...     dataset_type="match",
+        ...     data_spec_versions={"match": "v6"}
+        ... )

+        >>> # Filter discovered selectors
+        >>> engine = debug_source(
+        ...     source,
+        ...     dataset_type="match",
+        ...     data_spec_versions={"match": "v6"},
+        ...     competition_id=46  # Filters to specific competition
+        ... )
+    """
+    logger.info(f"Debug mode for source: {source.name}")
+
+    engine = get_dev_engine(
+        source=source,
+        dataset_type=dataset_type,
+        data_spec_versions=data_spec_versions,
+        ephemeral=ephemeral,
+        configure_logging=configure_logging,
+        dev_dir=dev_dir,
+    )
+
+    # Run ingestion
+    # Empty selector {} automatically triggers discover_selectors() if available
+    # kwargs filter discovered selectors or are passed to find_datasets()
+    engine.run(**kwargs)
+
+    # Show results
+    datasets = engine.store.get_dataset_collection()
+    logger.info("=" * 60)
+    logger.info(f"✓ Ingestion complete: {len(datasets)} dataset(s)")
+    logger.info("=" * 60)
+
+    return engine