ingestify 0.6.4__tar.gz → 0.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. ingestify-0.8.0/PKG-INFO +248 -0
  2. ingestify-0.8.0/README.md +238 -0
  3. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/__init__.py +2 -1
  4. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/application/dataset_store.py +228 -11
  5. ingestify-0.8.0/ingestify/application/ingestion_engine.py +292 -0
  6. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/application/loader.py +163 -28
  7. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/cmdline.py +0 -48
  8. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/__init__.py +2 -0
  9. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/collection.py +0 -9
  10. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/dataset_repository.py +4 -0
  11. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/dataset_state.py +5 -0
  12. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/events.py +13 -0
  13. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/file.py +7 -1
  14. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/selector.py +8 -1
  15. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/event/event_bus.py +16 -1
  16. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/ingestion/ingestion_job.py +23 -4
  17. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/resources/dataset_resource.py +0 -1
  18. ingestify-0.8.0/ingestify/infra/source/statsbomb/base.py +36 -0
  19. ingestify-0.8.0/ingestify/infra/source/statsbomb/match.py +137 -0
  20. ingestify-0.8.0/ingestify/infra/source/statsbomb_github.py +107 -0
  21. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
  22. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
  23. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/main.py +190 -10
  24. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/utils.py +2 -32
  25. ingestify-0.8.0/ingestify.egg-info/PKG-INFO +248 -0
  26. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify.egg-info/SOURCES.txt +4 -12
  27. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify.egg-info/requires.txt +3 -6
  28. {ingestify-0.6.4 → ingestify-0.8.0}/setup.py +3 -10
  29. ingestify-0.6.4/PKG-INFO +0 -254
  30. ingestify-0.6.4/README.md +0 -244
  31. ingestify-0.6.4/ingestify/application/ingestion_engine.py +0 -67
  32. ingestify-0.6.4/ingestify/infra/source/statsbomb_github.py +0 -105
  33. ingestify-0.6.4/ingestify/infra/source/wyscout.py +0 -175
  34. ingestify-0.6.4/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
  35. ingestify-0.6.4/ingestify/static/templates/statsbomb_github/database/README.md +0 -1
  36. ingestify-0.6.4/ingestify/static/templates/statsbomb_github/query.py +0 -14
  37. ingestify-0.6.4/ingestify/static/templates/wyscout/.env +0 -5
  38. ingestify-0.6.4/ingestify/static/templates/wyscout/.gitignore +0 -2
  39. ingestify-0.6.4/ingestify/static/templates/wyscout/README.md +0 -0
  40. ingestify-0.6.4/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
  41. ingestify-0.6.4/ingestify/static/templates/wyscout/database/README.md +0 -1
  42. ingestify-0.6.4/ingestify/static/templates/wyscout/query.py +0 -14
  43. ingestify-0.6.4/ingestify.egg-info/PKG-INFO +0 -254
  44. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/application/__init__.py +0 -0
  45. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/application/secrets_manager.py +0 -0
  46. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/__init__.py +0 -0
  47. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/base.py +0 -0
  48. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  49. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/__init__.py +0 -0
  50. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  51. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/dataset.py +0 -0
  52. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
  53. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
  54. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  55. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/dataset/revision.py +0 -0
  56. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/event/__init__.py +0 -0
  57. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/event/_old_event.py +0 -0
  58. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  59. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/event/domain_event.py +0 -0
  60. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/event/publisher.py +0 -0
  61. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/event/subscriber.py +0 -0
  62. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/fetch_policy.py +0 -0
  63. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
  64. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  65. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  66. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/resources/__init__.py +0 -0
  67. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/sink.py +0 -0
  68. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/source.py +0 -0
  69. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/task/__init__.py +0 -0
  70. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/task/set.py +0 -0
  71. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/task/task.py +0 -0
  72. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/task/task_summary.py +0 -0
  73. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/models/timing.py +0 -0
  74. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/services/__init__.py +0 -0
  75. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  76. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/services/transformers/__init__.py +0 -0
  77. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  78. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/exceptions.py +0 -0
  79. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/__init__.py +0 -0
  80. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/fetch/__init__.py +0 -0
  81. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/fetch/http.py +0 -0
  82. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/serialization/__init__.py +0 -0
  83. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/sink/__init__.py +0 -0
  84. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/sink/postgresql.py +0 -0
  85. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/source/__init__.py +0 -0
  86. {ingestify-0.6.4/ingestify/infra/store/dataset → ingestify-0.8.0/ingestify/infra/source/statsbomb}/__init__.py +0 -0
  87. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/store/__init__.py +0 -0
  88. /ingestify-0.6.4/ingestify/static/templates/statsbomb_github/README.md → /ingestify-0.8.0/ingestify/infra/store/dataset/__init__.py +0 -0
  89. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  90. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/store/file/__init__.py +0 -0
  91. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  92. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  93. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  94. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/server.py +0 -0
  95. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify/source_base.py +0 -0
  96. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify.egg-info/dependency_links.txt +0 -0
  97. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify.egg-info/entry_points.txt +0 -0
  98. {ingestify-0.6.4 → ingestify-0.8.0}/ingestify.egg-info/top_level.txt +0 -0
  99. {ingestify-0.6.4 → ingestify-0.8.0}/setup.cfg +0 -0
@@ -0,0 +1,248 @@
1
+ Metadata-Version: 2.1
2
+ Name: ingestify
3
+ Version: 0.8.0
4
+ Summary: Data Ingestion Framework
5
+ Author: Koen Vossen
6
+ Author-email: info@koenvossen.nl
7
+ License: AGPL
8
+ Description-Content-Type: text/markdown
9
+ Provides-Extra: test
10
+
11
+ # Ingestify
12
+
13
+ _Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
14
+
15
+ ---
16
+
17
+ ## Why Ingestify?
18
+
19
+ Football‐data APIs are often **slow**, **rate‑limited** or just **down**. One parsing bug and you’re forced to pull tens of gigabytes again.
20
+ Ingestify fixes that by building **your own data lake** of untouched provider files and fetching only what’s new:
21
+
22
+ * **Own your lake** – The first time you ask for a match, Ingestify downloads the original files (metadata, line‑ups, events, tracking, video) and stores them untouched in local disk, S3, GCS… every later query hits *your* lake, not the provider.
23
+ * **Never re‑fetch the world** – A file‑level checksum / timestamp check moves only changed bundles across the wire.
24
+ * **Atomic, complete packages** – A *Dataset* is all‑or‑nothing:
25
+
26
+ | Dataset type | Always contains |
27
+ |--------------|-----------------|
28
+ | **Match Dataset** | metadata + line‑ups + events |
29
+ | **Tracking Dataset** | metadata + raw tracking frames |
30
+
31
+ You never analyse events v2 with lineups v1, or yesterday’s first half with today’s second half.
32
+ * **Query while ingesting** – Datasets stream out of the engine the moment their files land, so notebooks or downstream services can start before the full season is in.
33
+
34
+ ---
35
+
36
+ ## The Ingestify Workflow
37
+ <img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
38
+
39
+ ---
40
+
41
+ ## What you gain
42
+
43
+ ### For football‑analytics practitioners
44
+
45
+ | Pain | Ingestify fix |
46
+ |------|---------------|
47
+ | API slowness / downtime | One request → lake; retries and parallelism happen behind the scenes. |
48
+ | Full re‑ingest after a bug | File‑level deltas mean you fetch only the corrected bundles. |
49
+ | Partial / drifting data | Dataset is atomic, versioned, and validated before it becomes visible. |
50
+ | Waiting hours for a season to sync | Stream each Dataset as soon as it lands; analyse while you ingest. |
51
+ | Boilerplate joins | `engine.load_dataset_with_kloppy(dataset)` → analysis‑ready object. |
52
+
53
+ ### For software engineers
54
+
55
+ | Need | How Ingestify helps |
56
+ |------|---------------------|
57
+ | **Domain‑Driven Design** | `Dataset`, `Revision`, `Selector` plus rich domain events read like the problem space. |
58
+ | **Event‑driven integrations** | Subscribe to `RevisionAdded` and push to Kafka, AWS Lambda, Airflow… |
59
+ | **Pluggable everything** | Swap `Source`, `FetchPolicy`, `DatasetStore` subclasses to add providers, change delta logic, or move storage back‑ends. |
60
+ | **Safety & speed** | Multiprocessing downloader with temp‑dir commits – no half‑written matches; near‑linear I/O speed‑ups. |
61
+ | **Any file type** | JSON, CSV, MP4, proprietary binaries – stored verbatim so you parse / transcode later under version control. |
62
+
63
+ ---
64
+
65
+ ## Quick start
66
+
67
+ ```bash
68
+ pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
69
+ ```
70
+
71
+ ### Developing a new Source
72
+
73
+ When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
74
+
75
+ ```python
76
+ from ingestify import Source, debug_source
77
+
78
+ class MyCustomSource(Source):
79
+ provider = "my_provider"
80
+
81
+ def __init__(self, name: str, api_key: str):
82
+ super().__init__(name)
83
+ self.api_key = api_key
84
+
85
+ def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
86
+ # Your source implementation
87
+ ...
88
+
89
+ # Quick debug - runs full ingestion with temp storage
90
+ if __name__ == "__main__":
91
+ source = MyCustomSource(name="test", api_key="...")
92
+
93
+ debug_source(
94
+ source,
95
+ dataset_type="match",
96
+ data_spec_versions={"events": "v1"},
97
+ )
98
+ ```
99
+
100
+ The `debug_source()` helper:
101
+ - ✅ Creates an ephemeral dev engine with temp storage
102
+ - ✅ Configures logging automatically
103
+ - ✅ Runs the full ingestion cycle
104
+ - ✅ Shows storage location and results
105
+
106
+ Perfect for testing your source before adding it to production config!
107
+
108
+ ### Minimal `config.yaml`
109
+
110
+ ```yaml
111
+ main:
112
+ metadata_url: sqlite:///database/catalog.db # where revision metadata lives
113
+ file_url: file://database/files/ # where raw files live
114
+ default_bucket: main
115
+
116
+ sources:
117
+ statsbomb:
118
+ type: ingestify.statsbomb_github # open‑data provider
119
+
120
+ ingestion_plans:
121
+ - source: statsbomb
122
+ dataset_type: match
123
+ # selectors can narrow the scope
124
+ # selectors:
125
+ # - competition_id: 11
126
+ # season_id: [90]
127
+ ```
128
+
129
+ ### First ingest
130
+
131
+ When you configured event subscribers, all domain events are dispatched to the subscriber. Publishing the events to
132
+ Kafka, RabbitMQ or any other system becomes trivial.
133
+
134
+ ```bash
135
+ mkdir -p database
136
+ pip install kloppy
137
+
138
+ ingestify run # fills your data lake
139
+ ```
140
+
141
+ ---
142
+
143
+ ## Using the data
144
+
145
+ By default, Ingestify will search in your DatasetStore when you request data. You can pass several filters to only fetch what you need.
146
+
147
+ ```python
148
+ from ingestify.main import get_engine
149
+
150
+ engine = get_engine("config.yaml")
151
+
152
+ for dataset in engine.iter_datasets(
153
+ dataset_state="complete",
154
+ provider="statsbomb",
155
+ dataset_type="match",
156
+ competition_id=11,
157
+ season_id=90):
158
+ df = (
159
+ engine
160
+ .load_dataset_with_kloppy(dataset)
161
+ .to_df(engine="polars")
162
+ )
163
+ df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
164
+ ```
165
+
166
+ #### Auto Ingestion
167
+
168
+ When you don't want to use an event-driven architecture but just want to work with the latest data, ingestify has you covered. With the `auto_ingest` option, ingestify syncs the data in the background when you ask for the data.
169
+
170
+
171
+ ```python
172
+ from ingestify.main import get_engine
173
+
174
+ engine = get_engine("config.yaml")
175
+
176
+ for dataset in engine.iter_datasets(
177
+ # When set to True it will first do a full sync and then start yielding datasets
178
+ auto_ingest=True,
179
+
180
+ # With streaming enabled all Datasets are yielded when they are up-to-date (not changed, or refetched)
181
+ # auto_ingest={"streaming": True}
182
+
183
+ dataset_state="complete",
184
+ provider="statsbomb",
185
+ dataset_type="match",
186
+ competition_id=11,
187
+ season_id=90):
188
+ df = (
189
+ engine
190
+ .load_dataset_with_kloppy(dataset)
191
+ .to_df(engine="polars")
192
+ )
193
+ df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
194
+ ```
195
+
196
+ #### Open data
197
+
198
+ Ingestify has built-in support for StatsBomb Open Data (more to come).
199
+
200
+ ```shell
201
+ mkdir database_open_data
202
+ pip install kloppy
203
+ ```
204
+
205
+ ```python
206
+ import logging, sys
207
+
208
+ from ingestify.main import get_engine
209
+
210
+ logging.basicConfig(
211
+ level=logging.INFO,
212
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
213
+ stream=sys.stderr,
214
+ )
215
+
216
+ engine = get_engine(
217
+ metadata_url="sqlite:///database_open_data/catalog.db",
218
+ file_url="file://database_open_data/files/"
219
+ )
220
+
221
+ dataset_iter = engine.iter_datasets(
222
+ # This will tell ingestify to look for an Open Data provider
223
+ auto_ingest={"use_open_data": True, "streaming": True},
224
+
225
+ provider="statsbomb",
226
+ dataset_type="match",
227
+ competition_id=43, # "FIFA World Cup"
228
+ #season_id=281
229
+ )
230
+
231
+ for dataset in dataset_iter:
232
+ kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
233
+ logging.info(f"Loaded {kloppy_dataset}")
234
+ ```
235
+
236
+
237
+ ---
238
+
239
+ ## Roadmap
240
+
241
+ * Workflow orchestration helpers (Airflow, Dagster, Prefect)
242
+ * Built‑in Kafka / Kinesis event emitters
243
+ * Streaming data ingestion
244
+ * Data quality hooks (Great Expectations)
245
+
246
+ ---
247
+
248
+ **Stop refetching the world. Own your data lake, keep it version‑safe, and analyse football faster with Ingestify.**
@@ -0,0 +1,238 @@
1
+ # Ingestify
2
+
3
+ _Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
4
+
5
+ ---
6
+
7
+ ## Why Ingestify?
8
+
9
+ Football‐data APIs are often **slow**, **rate‑limited** or just **down**. One parsing bug and you’re forced to pull tens of gigabytes again.
10
+ Ingestify fixes that by building **your own data lake** of untouched provider files and fetching only what’s new:
11
+
12
+ * **Own your lake** – The first time you ask for a match, Ingestify downloads the original files (metadata, line‑ups, events, tracking, video) and stores them untouched in local disk, S3, GCS… every later query hits *your* lake, not the provider.
13
+ * **Never re‑fetch the world** – A file‑level checksum / timestamp check moves only changed bundles across the wire.
14
+ * **Atomic, complete packages** – A *Dataset* is all‑or‑nothing:
15
+
16
+ | Dataset type | Always contains |
17
+ |--------------|-----------------|
18
+ | **Match Dataset** | metadata + line‑ups + events |
19
+ | **Tracking Dataset** | metadata + raw tracking frames |
20
+
21
+ You never analyse events v2 with lineups v1, or yesterday’s first half with today’s second half.
22
+ * **Query while ingesting** – Datasets stream out of the engine the moment their files land, so notebooks or downstream services can start before the full season is in.
23
+
24
+ ---
25
+
26
+ ## The Ingestify Workflow
27
+ <img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
28
+
29
+ ---
30
+
31
+ ## What you gain
32
+
33
+ ### For football‑analytics practitioners
34
+
35
+ | Pain | Ingestify fix |
36
+ |------|---------------|
37
+ | API slowness / downtime | One request → lake; retries and parallelism happen behind the scenes. |
38
+ | Full re‑ingest after a bug | File‑level deltas mean you fetch only the corrected bundles. |
39
+ | Partial / drifting data | Dataset is atomic, versioned, and validated before it becomes visible. |
40
+ | Waiting hours for a season to sync | Stream each Dataset as soon as it lands; analyse while you ingest. |
41
+ | Boilerplate joins | `engine.load_dataset_with_kloppy(dataset)` → analysis‑ready object. |
42
+
43
+ ### For software engineers
44
+
45
+ | Need | How Ingestify helps |
46
+ |------|---------------------|
47
+ | **Domain‑Driven Design** | `Dataset`, `Revision`, `Selector` plus rich domain events read like the problem space. |
48
+ | **Event‑driven integrations** | Subscribe to `RevisionAdded` and push to Kafka, AWS Lambda, Airflow… |
49
+ | **Pluggable everything** | Swap `Source`, `FetchPolicy`, `DatasetStore` subclasses to add providers, change delta logic, or move storage back‑ends. |
50
+ | **Safety & speed** | Multiprocessing downloader with temp‑dir commits – no half‑written matches; near‑linear I/O speed‑ups. |
51
+ | **Any file type** | JSON, CSV, MP4, proprietary binaries – stored verbatim so you parse / transcode later under version control. |
52
+
53
+ ---
54
+
55
+ ## Quick start
56
+
57
+ ```bash
58
+ pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
59
+ ```
60
+
61
+ ### Developing a new Source
62
+
63
+ When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
64
+
65
+ ```python
66
+ from ingestify import Source, debug_source
67
+
68
+ class MyCustomSource(Source):
69
+ provider = "my_provider"
70
+
71
+ def __init__(self, name: str, api_key: str):
72
+ super().__init__(name)
73
+ self.api_key = api_key
74
+
75
+ def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
76
+ # Your source implementation
77
+ ...
78
+
79
+ # Quick debug - runs full ingestion with temp storage
80
+ if __name__ == "__main__":
81
+ source = MyCustomSource(name="test", api_key="...")
82
+
83
+ debug_source(
84
+ source,
85
+ dataset_type="match",
86
+ data_spec_versions={"events": "v1"},
87
+ )
88
+ ```
89
+
90
+ The `debug_source()` helper:
91
+ - ✅ Creates an ephemeral dev engine with temp storage
92
+ - ✅ Configures logging automatically
93
+ - ✅ Runs the full ingestion cycle
94
+ - ✅ Shows storage location and results
95
+
96
+ Perfect for testing your source before adding it to production config!
97
+
98
+ ### Minimal `config.yaml`
99
+
100
+ ```yaml
101
+ main:
102
+ metadata_url: sqlite:///database/catalog.db # where revision metadata lives
103
+ file_url: file://database/files/ # where raw files live
104
+ default_bucket: main
105
+
106
+ sources:
107
+ statsbomb:
108
+ type: ingestify.statsbomb_github # open‑data provider
109
+
110
+ ingestion_plans:
111
+ - source: statsbomb
112
+ dataset_type: match
113
+ # selectors can narrow the scope
114
+ # selectors:
115
+ # - competition_id: 11
116
+ # season_id: [90]
117
+ ```
118
+
119
+ ### First ingest
120
+
121
+ When you configured event subscribers, all domain events are dispatched to the subscriber. Publishing the events to
122
+ Kafka, RabbitMQ or any other system becomes trivial.
123
+
124
+ ```bash
125
+ mkdir -p database
126
+ pip install kloppy
127
+
128
+ ingestify run # fills your data lake
129
+ ```
130
+
131
+ ---
132
+
133
+ ## Using the data
134
+
135
+ By default, Ingestify will search in your DatasetStore when you request data. You can pass several filters to only fetch what you need.
136
+
137
+ ```python
138
+ from ingestify.main import get_engine
139
+
140
+ engine = get_engine("config.yaml")
141
+
142
+ for dataset in engine.iter_datasets(
143
+ dataset_state="complete",
144
+ provider="statsbomb",
145
+ dataset_type="match",
146
+ competition_id=11,
147
+ season_id=90):
148
+ df = (
149
+ engine
150
+ .load_dataset_with_kloppy(dataset)
151
+ .to_df(engine="polars")
152
+ )
153
+ df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
154
+ ```
155
+
156
+ #### Auto Ingestion
157
+
158
+ When you don't want to use an event-driven architecture but just want to work with the latest data, ingestify has you covered. With the `auto_ingest` option, ingestify syncs the data in the background when you ask for the data.
159
+
160
+
161
+ ```python
162
+ from ingestify.main import get_engine
163
+
164
+ engine = get_engine("config.yaml")
165
+
166
+ for dataset in engine.iter_datasets(
167
+ # When set to True it will first do a full sync and then start yielding datasets
168
+ auto_ingest=True,
169
+
170
+ # With streaming enabled all Datasets are yielded when they are up-to-date (not changed, or refetched)
171
+ # auto_ingest={"streaming": True}
172
+
173
+ dataset_state="complete",
174
+ provider="statsbomb",
175
+ dataset_type="match",
176
+ competition_id=11,
177
+ season_id=90):
178
+ df = (
179
+ engine
180
+ .load_dataset_with_kloppy(dataset)
181
+ .to_df(engine="polars")
182
+ )
183
+ df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
184
+ ```
185
+
186
+ #### Open data
187
+
188
+ Ingestify has built-in support for StatsBomb Open Data (more to come).
189
+
190
+ ```shell
191
+ mkdir database_open_data
192
+ pip install kloppy
193
+ ```
194
+
195
+ ```python
196
+ import logging, sys
197
+
198
+ from ingestify.main import get_engine
199
+
200
+ logging.basicConfig(
201
+ level=logging.INFO,
202
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
203
+ stream=sys.stderr,
204
+ )
205
+
206
+ engine = get_engine(
207
+ metadata_url="sqlite:///database_open_data/catalog.db",
208
+ file_url="file://database_open_data/files/"
209
+ )
210
+
211
+ dataset_iter = engine.iter_datasets(
212
+ # This will tell ingestify to look for an Open Data provider
213
+ auto_ingest={"use_open_data": True, "streaming": True},
214
+
215
+ provider="statsbomb",
216
+ dataset_type="match",
217
+ competition_id=43, # "FIFA World Cup"
218
+ #season_id=281
219
+ )
220
+
221
+ for dataset in dataset_iter:
222
+ kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
223
+ logging.info(f"Loaded {kloppy_dataset}")
224
+ ```
225
+
226
+
227
+ ---
228
+
229
+ ## Roadmap
230
+
231
+ * Workflow orchestration helpers (Airflow, Dagster, Prefect)
232
+ * Built‑in Kafka / Kinesis event emitters
233
+ * Streaming data ingestion
234
+ * Data quality hooks (Great Expectations)
235
+
236
+ ---
237
+
238
+ **Stop refetching the world. Own your data lake, keep it version‑safe, and analyse football faster with Ingestify.**
@@ -7,5 +7,6 @@ except NameError:
7
7
  if not __INGESTIFY_SETUP__:
8
8
  from .infra import retrieve_http
9
9
  from .source_base import Source, DatasetResource
10
+ from .main import debug_source
10
11
 
11
- __version__ = "0.6.4"
12
+ __version__ = "0.8.0"