ingestify 0.6.4__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. ingestify-0.7.0/PKG-INFO +202 -0
  2. ingestify-0.7.0/README.md +192 -0
  3. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/__init__.py +1 -1
  4. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/application/dataset_store.py +228 -11
  5. ingestify-0.7.0/ingestify/application/ingestion_engine.py +289 -0
  6. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/application/loader.py +153 -28
  7. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/cmdline.py +0 -48
  8. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/__init__.py +2 -0
  9. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/collection.py +0 -9
  10. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/dataset_repository.py +4 -0
  11. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/dataset_state.py +5 -0
  12. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/events.py +13 -0
  13. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/file.py +1 -1
  14. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/selector.py +8 -1
  15. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/event_bus.py +16 -1
  16. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/ingestion/ingestion_job.py +23 -4
  17. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/resources/dataset_resource.py +0 -1
  18. ingestify-0.7.0/ingestify/infra/source/statsbomb/base.py +36 -0
  19. ingestify-0.7.0/ingestify/infra/source/statsbomb/match.py +137 -0
  20. ingestify-0.7.0/ingestify/infra/source/statsbomb_github.py +107 -0
  21. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
  22. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
  23. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/main.py +35 -10
  24. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/utils.py +2 -32
  25. ingestify-0.7.0/ingestify.egg-info/PKG-INFO +202 -0
  26. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify.egg-info/SOURCES.txt +4 -12
  27. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify.egg-info/requires.txt +3 -6
  28. {ingestify-0.6.4 → ingestify-0.7.0}/setup.py +3 -10
  29. ingestify-0.6.4/PKG-INFO +0 -254
  30. ingestify-0.6.4/README.md +0 -244
  31. ingestify-0.6.4/ingestify/application/ingestion_engine.py +0 -67
  32. ingestify-0.6.4/ingestify/infra/source/statsbomb_github.py +0 -105
  33. ingestify-0.6.4/ingestify/infra/source/wyscout.py +0 -175
  34. ingestify-0.6.4/ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
  35. ingestify-0.6.4/ingestify/static/templates/statsbomb_github/database/README.md +0 -1
  36. ingestify-0.6.4/ingestify/static/templates/statsbomb_github/query.py +0 -14
  37. ingestify-0.6.4/ingestify/static/templates/wyscout/.env +0 -5
  38. ingestify-0.6.4/ingestify/static/templates/wyscout/.gitignore +0 -2
  39. ingestify-0.6.4/ingestify/static/templates/wyscout/README.md +0 -0
  40. ingestify-0.6.4/ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
  41. ingestify-0.6.4/ingestify/static/templates/wyscout/database/README.md +0 -1
  42. ingestify-0.6.4/ingestify/static/templates/wyscout/query.py +0 -14
  43. ingestify-0.6.4/ingestify.egg-info/PKG-INFO +0 -254
  44. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/application/__init__.py +0 -0
  45. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/application/secrets_manager.py +0 -0
  46. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/__init__.py +0 -0
  47. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/base.py +0 -0
  48. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  49. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/__init__.py +0 -0
  50. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  51. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/dataset.py +0 -0
  52. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
  53. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
  54. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  55. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/dataset/revision.py +0 -0
  56. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/__init__.py +0 -0
  57. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/_old_event.py +0 -0
  58. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  59. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/domain_event.py +0 -0
  60. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/publisher.py +0 -0
  61. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/event/subscriber.py +0 -0
  62. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/fetch_policy.py +0 -0
  63. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
  64. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  65. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  66. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/resources/__init__.py +0 -0
  67. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/sink.py +0 -0
  68. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/source.py +0 -0
  69. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/task/__init__.py +0 -0
  70. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/task/set.py +0 -0
  71. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/task/task.py +0 -0
  72. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/task/task_summary.py +0 -0
  73. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/models/timing.py +0 -0
  74. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/services/__init__.py +0 -0
  75. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  76. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/services/transformers/__init__.py +0 -0
  77. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  78. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/exceptions.py +0 -0
  79. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/__init__.py +0 -0
  80. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/fetch/__init__.py +0 -0
  81. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/fetch/http.py +0 -0
  82. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/serialization/__init__.py +0 -0
  83. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/sink/__init__.py +0 -0
  84. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/sink/postgresql.py +0 -0
  85. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/source/__init__.py +0 -0
  86. {ingestify-0.6.4/ingestify/infra/store/dataset → ingestify-0.7.0/ingestify/infra/source/statsbomb}/__init__.py +0 -0
  87. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/__init__.py +0 -0
  88. /ingestify-0.6.4/ingestify/static/templates/statsbomb_github/README.md → /ingestify-0.7.0/ingestify/infra/store/dataset/__init__.py +0 -0
  89. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  90. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/file/__init__.py +0 -0
  91. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  92. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  93. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  94. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/server.py +0 -0
  95. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify/source_base.py +0 -0
  96. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify.egg-info/dependency_links.txt +0 -0
  97. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify.egg-info/entry_points.txt +0 -0
  98. {ingestify-0.6.4 → ingestify-0.7.0}/ingestify.egg-info/top_level.txt +0 -0
  99. {ingestify-0.6.4 → ingestify-0.7.0}/setup.cfg +0 -0
@@ -0,0 +1,202 @@
1
+ Metadata-Version: 2.1
2
+ Name: ingestify
3
+ Version: 0.7.0
4
+ Summary: Data Ingestion Framework
5
+ Author: Koen Vossen
6
+ Author-email: info@koenvossen.nl
7
+ License: AGPL
8
+ Description-Content-Type: text/markdown
9
+ Provides-Extra: test
10
+
11
+ # Ingestify
12
+
13
+ _Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
14
+
15
+ ---
16
+
17
+ ## Why Ingestify?
18
+
19
+ Football‐data APIs are often **slow**, **rate‑limited** or just **down**. One parsing bug and you’re forced to pull tens of gigabytes again.
20
+ Ingestify fixes that by building **your own data lake** of untouched provider files and fetching only what’s new:
21
+
22
+ * **Own your lake** – The first time you ask for a match, Ingestify downloads the original files (metadata, line‑ups, events, tracking, video) and stores them untouched in local disk, S3, GCS… every later query hits *your* lake, not the provider.
23
+ * **Never re‑fetch the world** – A file‑level checksum / timestamp check moves only changed bundles across the wire.
24
+ * **Atomic, complete packages** – A *Dataset* is all‑or‑nothing:
25
+
26
+ | Dataset type | Always contains |
27
+ |--------------|-----------------|
28
+ | **Match Dataset** | metadata + line‑ups + events |
29
+ | **Tracking Dataset** | metadata + raw tracking frames |
30
+
31
+ You never analyse events v2 with lineups v1, or yesterday’s first half with today’s second half.
32
+ * **Query while ingesting** – Datasets stream out of the engine the moment their files land, so notebooks or downstream services can start before the full season is in.
33
+
34
+ ---
35
+
36
+ ## The Ingestify Workflow
37
+ <img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
38
+
39
+ ---
40
+
41
+ ## What you gain
42
+
43
+ ### For football‑analytics practitioners
44
+
45
+ | Pain | Ingestify fix |
46
+ |------|---------------|
47
+ | API slowness / downtime | One request → lake; retries and parallelism happen behind the scenes. |
48
+ | Full re‑ingest after a bug | File‑level deltas mean you fetch only the corrected bundles. |
49
+ | Partial / drifting data | Dataset is atomic, versioned, and validated before it becomes visible. |
50
+ | Waiting hours for a season to sync | Stream each Dataset as soon as it lands; analyse while you ingest. |
51
+ | Boilerplate joins | `engine.load_dataset_with_kloppy(dataset)` → analysis‑ready object. |
52
+
53
+ ### For software engineers
54
+
55
+ | Need | How Ingestify helps |
56
+ |------|---------------------|
57
+ | **Domain‑Driven Design** | `Dataset`, `Revision`, `Selector` plus rich domain events read like the problem space. |
58
+ | **Event‑driven integrations** | Subscribe to `RevisionAdded` and push to Kafka, AWS Lambda, Airflow… |
59
+ | **Pluggable everything** | Swap `Source`, `FetchPolicy`, `DatasetStore` subclasses to add providers, change delta logic, or move storage back‑ends. |
60
+ | **Safety & speed** | Multiprocessing downloader with temp‑dir commits – no half‑written matches; near‑linear I/O speed‑ups. |
61
+ | **Any file type** | JSON, CSV, MP4, proprietary binaries – stored verbatim so you parse / transcode later under version control. |
62
+
63
+ ---
64
+
65
+ ## Quick start
66
+
67
+ ```bash
68
+ pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
69
+ ```
70
+
71
+ ### Minimal `config.yaml`
72
+
73
+ ```yaml
74
+ main:
75
+ metadata_url: sqlite:///database/catalog.db # where revision metadata lives
76
+ file_url: file://database/files/ # where raw files live
77
+ default_bucket: main
78
+
79
+ sources:
80
+ statsbomb:
81
+ type: ingestify.statsbomb_github # open‑data provider
82
+
83
+ ingestion_plans:
84
+ - source: statsbomb
85
+ dataset_type: match
86
+ # selectors can narrow the scope
87
+ # selectors:
88
+ # - competition_id: 11
89
+ # season_id: [90]
90
+ ```
91
+
92
+ ### First ingest
93
+
94
+ When you have configured event subscribers, all domain events are dispatched to the subscribers. Publishing the events to
95
+ Kafka, RabbitMQ or any other system becomes trivial.
96
+
97
+ ```bash
98
+ mkdir -p database
99
+ pip install kloppy
100
+
101
+ ingestify run # fills your data lake
102
+ ```
103
+
104
+ ---
105
+
106
+ ## Using the data
107
+
108
+ By default, Ingestify will search in your DatasetStore when you request data. You can pass several filters to only fetch what you need.
109
+
110
+ ```python
111
+ from ingestify.main import get_engine
112
+
113
+ engine = get_engine("config.yaml")
114
+
115
+ for dataset in engine.iter_datasets(
116
+ dataset_state="complete",
117
+ provider="statsbomb",
118
+ dataset_type="match",
119
+ competition_id=11,
120
+ season_id=90):
121
+ df = (
122
+ engine
123
+ .load_dataset_with_kloppy(dataset)
124
+ .to_df(engine="polars")
125
+ )
126
+ df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
127
+ ```
128
+
129
+ #### Auto Ingestion
130
+
131
+ When you don't want to use an event-driven architecture but just want to work with the latest data, Ingestify has you covered. With the `auto_ingest` option, Ingestify syncs the data in the background when you ask for the data.
132
+
133
+
134
+ ```python
135
+ from ingestify.main import get_engine
136
+
137
+ engine = get_engine("config.yaml")
138
+
139
+ for dataset in engine.iter_datasets(
140
+ # When set to True it will first do a full sync and then start yielding datasets
141
+ auto_ingest=True,
142
+
143
+ # With streaming enabled all Datasets are yielded when they are up-to-date (not changed, or refetched)
144
+ # auto_ingest={"streaming": True}
145
+
146
+ dataset_state="complete",
147
+ provider="statsbomb",
148
+ dataset_type="match",
149
+ competition_id=11,
150
+ season_id=90):
151
+ df = (
152
+ engine
153
+ .load_dataset_with_kloppy(dataset)
154
+ .to_df(engine="polars")
155
+ )
156
+ df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
157
+ ```
158
+
159
+ #### Open data
160
+
161
+ Ingestify has built-in support for StatsBomb Open Data (more to come).
162
+
163
+ ```shell
164
+ mkdir database_open_data
165
+ pip install kloppy
166
+ ```
167
+
168
+ ```python
169
+ from ingestify.main import get_engine
170
+
171
+ engine = get_engine(
172
+ metadata_url="sqlite:///database_open_data/catalog.db",
173
+ file_url="file://database_open_data/files/"
174
+ )
175
+
176
+ dataset_iter = engine.iter_datasets(
177
+ # This will tell ingestify to look for an Open Data provider
178
+ auto_ingest={"use_open_data": True, "streaming": True},
179
+
180
+ provider="statsbomb",
181
+ dataset_type="match",
182
+ competition_id=43,
183
+ season_id=281
184
+ )
185
+
186
+ for dataset in dataset_iter:
187
+ kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
188
+ ```
189
+
190
+
191
+ ---
192
+
193
+ ## Roadmap
194
+
195
+ * Workflow orchestration helpers (Airflow, Dagster, Prefect)
196
+ * Built‑in Kafka / Kinesis event emitters
197
+ * Streaming data ingestion
198
+ * Data quality hooks (Great Expectations)
199
+
200
+ ---
201
+
202
+ **Stop refetching the world. Own your data lake, keep it version‑safe, and analyse football faster with Ingestify.**
@@ -0,0 +1,192 @@
1
+ # Ingestify
2
+
3
+ _Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
4
+
5
+ ---
6
+
7
+ ## Why Ingestify?
8
+
9
+ Football‐data APIs are often **slow**, **rate‑limited** or just **down**. One parsing bug and you’re forced to pull tens of gigabytes again.
10
+ Ingestify fixes that by building **your own data lake** of untouched provider files and fetching only what’s new:
11
+
12
+ * **Own your lake** – The first time you ask for a match, Ingestify downloads the original files (metadata, line‑ups, events, tracking, video) and stores them untouched in local disk, S3, GCS… every later query hits *your* lake, not the provider.
13
+ * **Never re‑fetch the world** – A file‑level checksum / timestamp check moves only changed bundles across the wire.
14
+ * **Atomic, complete packages** – A *Dataset* is all‑or‑nothing:
15
+
16
+ | Dataset type | Always contains |
17
+ |--------------|-----------------|
18
+ | **Match Dataset** | metadata + line‑ups + events |
19
+ | **Tracking Dataset** | metadata + raw tracking frames |
20
+
21
+ You never analyse events v2 with lineups v1, or yesterday’s first half with today’s second half.
22
+ * **Query while ingesting** – Datasets stream out of the engine the moment their files land, so notebooks or downstream services can start before the full season is in.
23
+
24
+ ---
25
+
26
+ ## The Ingestify Workflow
27
+ <img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
28
+
29
+ ---
30
+
31
+ ## What you gain
32
+
33
+ ### For football‑analytics practitioners
34
+
35
+ | Pain | Ingestify fix |
36
+ |------|---------------|
37
+ | API slowness / downtime | One request → lake; retries and parallelism happen behind the scenes. |
38
+ | Full re‑ingest after a bug | File‑level deltas mean you fetch only the corrected bundles. |
39
+ | Partial / drifting data | Dataset is atomic, versioned, and validated before it becomes visible. |
40
+ | Waiting hours for a season to sync | Stream each Dataset as soon as it lands; analyse while you ingest. |
41
+ | Boilerplate joins | `engine.load_dataset_with_kloppy(dataset)` → analysis‑ready object. |
42
+
43
+ ### For software engineers
44
+
45
+ | Need | How Ingestify helps |
46
+ |------|---------------------|
47
+ | **Domain‑Driven Design** | `Dataset`, `Revision`, `Selector` plus rich domain events read like the problem space. |
48
+ | **Event‑driven integrations** | Subscribe to `RevisionAdded` and push to Kafka, AWS Lambda, Airflow… |
49
+ | **Pluggable everything** | Swap `Source`, `FetchPolicy`, `DatasetStore` subclasses to add providers, change delta logic, or move storage back‑ends. |
50
+ | **Safety & speed** | Multiprocessing downloader with temp‑dir commits – no half‑written matches; near‑linear I/O speed‑ups. |
51
+ | **Any file type** | JSON, CSV, MP4, proprietary binaries – stored verbatim so you parse / transcode later under version control. |
52
+
53
+ ---
54
+
55
+ ## Quick start
56
+
57
+ ```bash
58
+ pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
59
+ ```
60
+
61
+ ### Minimal `config.yaml`
62
+
63
+ ```yaml
64
+ main:
65
+ metadata_url: sqlite:///database/catalog.db # where revision metadata lives
66
+ file_url: file://database/files/ # where raw files live
67
+ default_bucket: main
68
+
69
+ sources:
70
+ statsbomb:
71
+ type: ingestify.statsbomb_github # open‑data provider
72
+
73
+ ingestion_plans:
74
+ - source: statsbomb
75
+ dataset_type: match
76
+ # selectors can narrow the scope
77
+ # selectors:
78
+ # - competition_id: 11
79
+ # season_id: [90]
80
+ ```
81
+
82
+ ### First ingest
83
+
84
+ When you have configured event subscribers, all domain events are dispatched to the subscribers. Publishing the events to
85
+ Kafka, RabbitMQ or any other system becomes trivial.
86
+
87
+ ```bash
88
+ mkdir -p database
89
+ pip install kloppy
90
+
91
+ ingestify run # fills your data lake
92
+ ```
93
+
94
+ ---
95
+
96
+ ## Using the data
97
+
98
+ By default, Ingestify will search in your DatasetStore when you request data. You can pass several filters to only fetch what you need.
99
+
100
+ ```python
101
+ from ingestify.main import get_engine
102
+
103
+ engine = get_engine("config.yaml")
104
+
105
+ for dataset in engine.iter_datasets(
106
+ dataset_state="complete",
107
+ provider="statsbomb",
108
+ dataset_type="match",
109
+ competition_id=11,
110
+ season_id=90):
111
+ df = (
112
+ engine
113
+ .load_dataset_with_kloppy(dataset)
114
+ .to_df(engine="polars")
115
+ )
116
+ df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
117
+ ```
118
+
119
+ #### Auto Ingestion
120
+
121
+ When you don't want to use an event-driven architecture but just want to work with the latest data, Ingestify has you covered. With the `auto_ingest` option, Ingestify syncs the data in the background when you ask for the data.
122
+
123
+
124
+ ```python
125
+ from ingestify.main import get_engine
126
+
127
+ engine = get_engine("config.yaml")
128
+
129
+ for dataset in engine.iter_datasets(
130
+ # When set to True it will first do a full sync and then start yielding datasets
131
+ auto_ingest=True,
132
+
133
+ # With streaming enabled all Datasets are yielded when they are up-to-date (not changed, or refetched)
134
+ # auto_ingest={"streaming": True}
135
+
136
+ dataset_state="complete",
137
+ provider="statsbomb",
138
+ dataset_type="match",
139
+ competition_id=11,
140
+ season_id=90):
141
+ df = (
142
+ engine
143
+ .load_dataset_with_kloppy(dataset)
144
+ .to_df(engine="polars")
145
+ )
146
+ df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
147
+ ```
148
+
149
+ #### Open data
150
+
151
+ Ingestify has built-in support for StatsBomb Open Data (more to come).
152
+
153
+ ```shell
154
+ mkdir database_open_data
155
+ pip install kloppy
156
+ ```
157
+
158
+ ```python
159
+ from ingestify.main import get_engine
160
+
161
+ engine = get_engine(
162
+ metadata_url="sqlite:///database_open_data/catalog.db",
163
+ file_url="file://database_open_data/files/"
164
+ )
165
+
166
+ dataset_iter = engine.iter_datasets(
167
+ # This will tell ingestify to look for an Open Data provider
168
+ auto_ingest={"use_open_data": True, "streaming": True},
169
+
170
+ provider="statsbomb",
171
+ dataset_type="match",
172
+ competition_id=43,
173
+ season_id=281
174
+ )
175
+
176
+ for dataset in dataset_iter:
177
+ kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
178
+ ```
179
+
180
+
181
+ ---
182
+
183
+ ## Roadmap
184
+
185
+ * Workflow orchestration helpers (Airflow, Dagster, Prefect)
186
+ * Built‑in Kafka / Kinesis event emitters
187
+ * Streaming data ingestion
188
+ * Data quality hooks (Great Expectations)
189
+
190
+ ---
191
+
192
+ **Stop refetching the world. Own your data lake, keep it version‑safe, and analyse football faster with Ingestify.**
@@ -8,4 +8,4 @@ if not __INGESTIFY_SETUP__:
8
8
  from .infra import retrieve_http
9
9
  from .source_base import Source, DatasetResource
10
10
 
11
- __version__ = "0.6.4"
11
+ __version__ = "0.7.0"