ingestify 0.6.4__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify-0.7.0/PKG-INFO +202 -0
- ingestify-0.7.0/README.md +192 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/__init__.py +1 -1
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/application/dataset_store.py +228 -11
- ingestify-0.7.0/ingestify/application/ingestion_engine.py +289 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/application/loader.py +153 -28
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/cmdline.py +0 -48
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/__init__.py +2 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/collection.py +0 -9
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/dataset_repository.py +4 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/dataset_state.py +5 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/events.py +13 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/file.py +1 -1
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/selector.py +8 -1
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/event_bus.py +16 -1
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/ingestion/ingestion_job.py +23 -4
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/resources/dataset_resource.py +0 -1
- ingestify-0.7.0/ingestify/infra/source/statsbomb/base.py +36 -0
- ingestify-0.7.0/ingestify/infra/source/statsbomb/match.py +137 -0
- ingestify-0.7.0/ingestify/infra/source/statsbomb_github.py +107 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/main.py +35 -10
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/utils.py +2 -32
- ingestify-0.7.0/ingestify.egg-info/PKG-INFO +202 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify.egg-info/SOURCES.txt +4 -12
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify.egg-info/requires.txt +3 -6
- {ingestify-0.6.4 → ingestify-0.7.0}/setup.py +3 -10
- ingestify-0.6.4/PKG-INFO +0 -254
- ingestify-0.6.4/README.md +0 -244
- ingestify-0.6.4/ingestify/application/ingestion_engine.py +0 -67
- ingestify-0.6.4/ingestify/infra/source/statsbomb_github.py +0 -105
- ingestify-0.6.4/ingestify/infra/source/wyscout.py +0 -175
- ingestify-0.6.4/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
- ingestify-0.6.4/ingestify/static/templates/statsbomb_github/database/README.md +0 -1
- ingestify-0.6.4/ingestify/static/templates/statsbomb_github/query.py +0 -14
- ingestify-0.6.4/ingestify/static/templates/wyscout/.env +0 -5
- ingestify-0.6.4/ingestify/static/templates/wyscout/.gitignore +0 -2
- ingestify-0.6.4/ingestify/static/templates/wyscout/README.md +0 -0
- ingestify-0.6.4/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
- ingestify-0.6.4/ingestify/static/templates/wyscout/database/README.md +0 -1
- ingestify-0.6.4/ingestify/static/templates/wyscout/query.py +0 -14
- ingestify-0.6.4/ingestify.egg-info/PKG-INFO +0 -254
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/application/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/application/secrets_manager.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/base.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/dataset.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/identifier.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/revision.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/_old_event.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/dispatcher.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/domain_event.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/publisher.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/subscriber.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/fetch_policy.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/resources/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/sink.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/source.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/task/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/task/set.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/task/task.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/task/task_summary.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/timing.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/services/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/services/transformers/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/exceptions.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/fetch/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/fetch/http.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/serialization/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/sink/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/sink/postgresql.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/source/__init__.py +0 -0
- {ingestify-0.6.4/ingestify/infra/store/dataset → ingestify-0.7.0/ingestify/infra/source/statsbomb}/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/__init__.py +0 -0
- /ingestify-0.6.4/ingestify/static/templates/statsbomb_github/README.md → /ingestify-0.7.0/ingestify/infra/store/dataset/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/file/__init__.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/server.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/source_base.py +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify.egg-info/dependency_links.txt +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify.egg-info/entry_points.txt +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/ingestify.egg-info/top_level.txt +0 -0
- {ingestify-0.6.4 → ingestify-0.7.0}/setup.cfg +0 -0
ingestify-0.7.0/PKG-INFO
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ingestify
|
|
3
|
+
Version: 0.7.0
|
|
4
|
+
Summary: Data Ingestion Framework
|
|
5
|
+
Author: Koen Vossen
|
|
6
|
+
Author-email: info@koenvossen.nl
|
|
7
|
+
License: AGPL
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Provides-Extra: test
|
|
10
|
+
|
|
11
|
+
# Ingestify
|
|
12
|
+
|
|
13
|
+
_Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Why Ingestify?
|
|
18
|
+
|
|
19
|
+
Football‐data APIs are often **slow**, **rate‑limited** or just **down**. One parsing bug and you’re forced to pull tens of gigabytes again.
|
|
20
|
+
Ingestify fixes that by building **your own data lake** of untouched provider files and fetching only what’s new:
|
|
21
|
+
|
|
22
|
+
* **Own your lake** – The first time you ask for a match, Ingestify downloads the original files (metadata, line‑ups, events, tracking, video) and stores them untouched in local disk, S3, GCS… every later query hits *your* lake, not the provider.
|
|
23
|
+
* **Never re‑fetch the world** – A file‑level checksum / timestamp check moves only changed bundles across the wire.
|
|
24
|
+
* **Atomic, complete packages** – A *Dataset* is all‑or‑nothing:
|
|
25
|
+
|
|
26
|
+
| Dataset type | Always contains |
|
|
27
|
+
|--------------|-----------------|
|
|
28
|
+
| **Match Dataset** | metadata + line‑ups + events |
|
|
29
|
+
| **Tracking Dataset** | metadata + raw tracking frames |
|
|
30
|
+
|
|
31
|
+
You never analyse events v2 with lineups v1, or yesterday’s first half with today’s second half.
|
|
32
|
+
* **Query while ingesting** – Datasets stream out of the engine the moment their files land, so notebooks or downstream services can start before the full season is in.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## The Ingestify Workflow
|
|
37
|
+
<img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## What you gain
|
|
42
|
+
|
|
43
|
+
### For football‑analytics practitioners
|
|
44
|
+
|
|
45
|
+
| Pain | Ingestify fix |
|
|
46
|
+
|------|---------------|
|
|
47
|
+
| API slowness / downtime | One request → lake; retries and parallelism happen behind the scenes. |
|
|
48
|
+
| Full re‑ingest after a bug | File‑level deltas mean you fetch only the corrected bundles. |
|
|
49
|
+
| Partial / drifting data | Dataset is atomic, versioned, and validated before it becomes visible. |
|
|
50
|
+
| Waiting hours for a season to sync | Stream each Dataset as soon as it lands; analyse while you ingest. |
|
|
51
|
+
| Boilerplate joins | `engine.load_dataset_with_kloppy(dataset)` → analysis‑ready object. |
|
|
52
|
+
|
|
53
|
+
### For software engineers
|
|
54
|
+
|
|
55
|
+
| Need | How Ingestify helps |
|
|
56
|
+
|------|---------------------|
|
|
57
|
+
| **Domain‑Driven Design** | `Dataset`, `Revision`, `Selector` plus rich domain events read like the problem space. |
|
|
58
|
+
| **Event‑driven integrations** | Subscribe to `RevisionAdded` and push to Kafka, AWS Lambda, Airflow… |
|
|
59
|
+
| **Pluggable everything** | Swap `Source`, `FetchPolicy`, `DatasetStore` subclasses to add providers, change delta logic, or move storage back‑ends. |
|
|
60
|
+
| **Safety & speed** | Multiprocessing downloader with temp‑dir commits – no half‑written matches; near‑linear I/O speed‑ups. |
|
|
61
|
+
| **Any file type** | JSON, CSV, MP4, proprietary binaries – stored verbatim so you parse / transcode later under version control. |
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## Quick start
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Minimal `config.yaml`
|
|
72
|
+
|
|
73
|
+
```yaml
|
|
74
|
+
main:
|
|
75
|
+
metadata_url: sqlite:///database/catalog.db # where revision metadata lives
|
|
76
|
+
file_url: file://database/files/ # where raw files live
|
|
77
|
+
default_bucket: main
|
|
78
|
+
|
|
79
|
+
sources:
|
|
80
|
+
statsbomb:
|
|
81
|
+
type: ingestify.statsbomb_github # open‑data provider
|
|
82
|
+
|
|
83
|
+
ingestion_plans:
|
|
84
|
+
- source: statsbomb
|
|
85
|
+
dataset_type: match
|
|
86
|
+
# selectors can narrow the scope
|
|
87
|
+
# selectors:
|
|
88
|
+
# - competition_id: 11
|
|
89
|
+
# season_id: [90]
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### First ingest
|
|
93
|
+
|
|
94
|
+
When you configure event subscribers, all domain events are dispatched to them. Publishing the events to
|
|
95
|
+
Kafka, RabbitMQ or any other system becomes trivial.
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
mkdir -p database
|
|
99
|
+
pip install kloppy
|
|
100
|
+
|
|
101
|
+
ingestify run # fills your data lake
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Using the data
|
|
107
|
+
|
|
108
|
+
By default, Ingestify will search in your DatasetStore when you request data. You can pass several filters to only fetch what you need.
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from ingestify.main import get_engine
|
|
112
|
+
|
|
113
|
+
engine = get_engine("config.yaml")
|
|
114
|
+
|
|
115
|
+
for dataset in engine.iter_datasets(
|
|
116
|
+
dataset_state="complete",
|
|
117
|
+
provider="statsbomb",
|
|
118
|
+
dataset_type="match",
|
|
119
|
+
competition_id=11,
|
|
120
|
+
season_id=90):
|
|
121
|
+
df = (
|
|
122
|
+
engine
|
|
123
|
+
.load_dataset_with_kloppy(dataset)
|
|
124
|
+
.to_df(engine="polars")
|
|
125
|
+
)
|
|
126
|
+
df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
#### Auto Ingestion
|
|
130
|
+
|
|
131
|
+
When you don't want to use an event-driven architecture but just want to work with the latest data, Ingestify has you covered. With the `auto_ingest` option, Ingestify syncs the data in the background when you ask for it.
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from ingestify.main import get_engine
|
|
136
|
+
|
|
137
|
+
engine = get_engine("config.yaml")
|
|
138
|
+
|
|
139
|
+
for dataset in engine.iter_datasets(
|
|
140
|
+
# When set to True it will first do a full sync and then start yielding datasets
|
|
141
|
+
auto_ingest=True,
|
|
142
|
+
|
|
143
|
+
# With streaming enabled all Datasets are yielded when they are up-to-date (not changed, or refetched)
|
|
144
|
+
# auto_ingest={"streaming": True}
|
|
145
|
+
|
|
146
|
+
dataset_state="complete",
|
|
147
|
+
provider="statsbomb",
|
|
148
|
+
dataset_type="match",
|
|
149
|
+
competition_id=11,
|
|
150
|
+
season_id=90):
|
|
151
|
+
df = (
|
|
152
|
+
engine
|
|
153
|
+
.load_dataset_with_kloppy(dataset)
|
|
154
|
+
.to_df(engine="polars")
|
|
155
|
+
)
|
|
156
|
+
df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
#### Open data
|
|
160
|
+
|
|
161
|
+
Ingestify has built-in support for StatsBomb Open Data (more to come).
|
|
162
|
+
|
|
163
|
+
```shell
|
|
164
|
+
mkdir database_open_data
|
|
165
|
+
pip install kloppy
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
from ingestify.main import get_engine
|
|
170
|
+
|
|
171
|
+
engine = get_engine(
|
|
172
|
+
metadata_url="sqlite:///database_open_data/catalog.db",
|
|
173
|
+
file_url="file://database_open_data/files/"
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
dataset_iter = engine.iter_datasets(
|
|
177
|
+
# This will tell ingestify to look for an Open Data provider
|
|
178
|
+
auto_ingest={"use_open_data": True, "streaming": True},
|
|
179
|
+
|
|
180
|
+
provider="statsbomb",
|
|
181
|
+
dataset_type="match",
|
|
182
|
+
competition_id=43,
|
|
183
|
+
season_id=281
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
for dataset in dataset_iter:
|
|
187
|
+
kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
## Roadmap
|
|
194
|
+
|
|
195
|
+
* Workflow orchestration helpers (Airflow, Dagster, Prefect)
|
|
196
|
+
* Built‑in Kafka / Kinesis event emitters
|
|
197
|
+
* Streaming data ingestion
|
|
198
|
+
* Data quality hooks (Great Expectations)
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
**Stop refetching the world. Own your data lake, keep it version‑safe, and analyse football faster with Ingestify.**
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
# Ingestify
|
|
2
|
+
|
|
3
|
+
_Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Why Ingestify?
|
|
8
|
+
|
|
9
|
+
Football‐data APIs are often **slow**, **rate‑limited** or just **down**. One parsing bug and you’re forced to pull tens of gigabytes again.
|
|
10
|
+
Ingestify fixes that by building **your own data lake** of untouched provider files and fetching only what’s new:
|
|
11
|
+
|
|
12
|
+
* **Own your lake** – The first time you ask for a match, Ingestify downloads the original files (metadata, line‑ups, events, tracking, video) and stores them untouched in local disk, S3, GCS… every later query hits *your* lake, not the provider.
|
|
13
|
+
* **Never re‑fetch the world** – A file‑level checksum / timestamp check moves only changed bundles across the wire.
|
|
14
|
+
* **Atomic, complete packages** – A *Dataset* is all‑or‑nothing:
|
|
15
|
+
|
|
16
|
+
| Dataset type | Always contains |
|
|
17
|
+
|--------------|-----------------|
|
|
18
|
+
| **Match Dataset** | metadata + line‑ups + events |
|
|
19
|
+
| **Tracking Dataset** | metadata + raw tracking frames |
|
|
20
|
+
|
|
21
|
+
You never analyse events v2 with lineups v1, or yesterday’s first half with today’s second half.
|
|
22
|
+
* **Query while ingesting** – Datasets stream out of the engine the moment their files land, so notebooks or downstream services can start before the full season is in.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## The Ingestify Workflow
|
|
27
|
+
<img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## What you gain
|
|
32
|
+
|
|
33
|
+
### For football‑analytics practitioners
|
|
34
|
+
|
|
35
|
+
| Pain | Ingestify fix |
|
|
36
|
+
|------|---------------|
|
|
37
|
+
| API slowness / downtime | One request → lake; retries and parallelism happen behind the scenes. |
|
|
38
|
+
| Full re‑ingest after a bug | File‑level deltas mean you fetch only the corrected bundles. |
|
|
39
|
+
| Partial / drifting data | Dataset is atomic, versioned, and validated before it becomes visible. |
|
|
40
|
+
| Waiting hours for a season to sync | Stream each Dataset as soon as it lands; analyse while you ingest. |
|
|
41
|
+
| Boilerplate joins | `engine.load_dataset_with_kloppy(dataset)` → analysis‑ready object. |
|
|
42
|
+
|
|
43
|
+
### For software engineers
|
|
44
|
+
|
|
45
|
+
| Need | How Ingestify helps |
|
|
46
|
+
|------|---------------------|
|
|
47
|
+
| **Domain‑Driven Design** | `Dataset`, `Revision`, `Selector` plus rich domain events read like the problem space. |
|
|
48
|
+
| **Event‑driven integrations** | Subscribe to `RevisionAdded` and push to Kafka, AWS Lambda, Airflow… |
|
|
49
|
+
| **Pluggable everything** | Swap `Source`, `FetchPolicy`, `DatasetStore` subclasses to add providers, change delta logic, or move storage back‑ends. |
|
|
50
|
+
| **Safety & speed** | Multiprocessing downloader with temp‑dir commits – no half‑written matches; near‑linear I/O speed‑ups. |
|
|
51
|
+
| **Any file type** | JSON, CSV, MP4, proprietary binaries – stored verbatim so you parse / transcode later under version control. |
|
|
52
|
+
|
|
53
|
+
---
|
|
54
|
+
|
|
55
|
+
## Quick start
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Minimal `config.yaml`
|
|
62
|
+
|
|
63
|
+
```yaml
|
|
64
|
+
main:
|
|
65
|
+
metadata_url: sqlite:///database/catalog.db # where revision metadata lives
|
|
66
|
+
file_url: file://database/files/ # where raw files live
|
|
67
|
+
default_bucket: main
|
|
68
|
+
|
|
69
|
+
sources:
|
|
70
|
+
statsbomb:
|
|
71
|
+
type: ingestify.statsbomb_github # open‑data provider
|
|
72
|
+
|
|
73
|
+
ingestion_plans:
|
|
74
|
+
- source: statsbomb
|
|
75
|
+
dataset_type: match
|
|
76
|
+
# selectors can narrow the scope
|
|
77
|
+
# selectors:
|
|
78
|
+
# - competition_id: 11
|
|
79
|
+
# season_id: [90]
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### First ingest
|
|
83
|
+
|
|
84
|
+
When you configure event subscribers, all domain events are dispatched to them. Publishing the events to
|
|
85
|
+
Kafka, RabbitMQ or any other system becomes trivial.
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
mkdir -p database
|
|
89
|
+
pip install kloppy
|
|
90
|
+
|
|
91
|
+
ingestify run # fills your data lake
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Using the data
|
|
97
|
+
|
|
98
|
+
By default, Ingestify will search in your DatasetStore when you request data. You can pass several filters to only fetch what you need.
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from ingestify.main import get_engine
|
|
102
|
+
|
|
103
|
+
engine = get_engine("config.yaml")
|
|
104
|
+
|
|
105
|
+
for dataset in engine.iter_datasets(
|
|
106
|
+
dataset_state="complete",
|
|
107
|
+
provider="statsbomb",
|
|
108
|
+
dataset_type="match",
|
|
109
|
+
competition_id=11,
|
|
110
|
+
season_id=90):
|
|
111
|
+
df = (
|
|
112
|
+
engine
|
|
113
|
+
.load_dataset_with_kloppy(dataset)
|
|
114
|
+
.to_df(engine="polars")
|
|
115
|
+
)
|
|
116
|
+
df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
#### Auto Ingestion
|
|
120
|
+
|
|
121
|
+
When you don't want to use an event-driven architecture but just want to work with the latest data, Ingestify has you covered. With the `auto_ingest` option, Ingestify syncs the data in the background when you ask for it.
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from ingestify.main import get_engine
|
|
126
|
+
|
|
127
|
+
engine = get_engine("config.yaml")
|
|
128
|
+
|
|
129
|
+
for dataset in engine.iter_datasets(
|
|
130
|
+
# When set to True it will first do a full sync and then start yielding datasets
|
|
131
|
+
auto_ingest=True,
|
|
132
|
+
|
|
133
|
+
# With streaming enabled all Datasets are yielded when they are up-to-date (not changed, or refetched)
|
|
134
|
+
# auto_ingest={"streaming": True}
|
|
135
|
+
|
|
136
|
+
dataset_state="complete",
|
|
137
|
+
provider="statsbomb",
|
|
138
|
+
dataset_type="match",
|
|
139
|
+
competition_id=11,
|
|
140
|
+
season_id=90):
|
|
141
|
+
df = (
|
|
142
|
+
engine
|
|
143
|
+
.load_dataset_with_kloppy(dataset)
|
|
144
|
+
.to_df(engine="polars")
|
|
145
|
+
)
|
|
146
|
+
df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
#### Open data
|
|
150
|
+
|
|
151
|
+
Ingestify has built-in support for StatsBomb Open Data (more to come).
|
|
152
|
+
|
|
153
|
+
```shell
|
|
154
|
+
mkdir database_open_data
|
|
155
|
+
pip install kloppy
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
from ingestify.main import get_engine
|
|
160
|
+
|
|
161
|
+
engine = get_engine(
|
|
162
|
+
metadata_url="sqlite:///database_open_data/catalog.db",
|
|
163
|
+
file_url="file://database_open_data/files/"
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
dataset_iter = engine.iter_datasets(
|
|
167
|
+
# This will tell ingestify to look for an Open Data provider
|
|
168
|
+
auto_ingest={"use_open_data": True, "streaming": True},
|
|
169
|
+
|
|
170
|
+
provider="statsbomb",
|
|
171
|
+
dataset_type="match",
|
|
172
|
+
competition_id=43,
|
|
173
|
+
season_id=281
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
for dataset in dataset_iter:
|
|
177
|
+
kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## Roadmap
|
|
184
|
+
|
|
185
|
+
* Workflow orchestration helpers (Airflow, Dagster, Prefect)
|
|
186
|
+
* Built‑in Kafka / Kinesis event emitters
|
|
187
|
+
* Streaming data ingestion
|
|
188
|
+
* Data quality hooks (Great Expectations)
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
**Stop refetching the world. Own your data lake, keep it version‑safe, and analyse football faster with Ingestify.**
|