ingestify 0.6.4__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +2 -1
- ingestify/application/dataset_store.py +228 -11
- ingestify/application/ingestion_engine.py +232 -7
- ingestify/application/loader.py +163 -28
- ingestify/cmdline.py +0 -48
- ingestify/domain/models/__init__.py +2 -0
- ingestify/domain/models/dataset/collection.py +0 -9
- ingestify/domain/models/dataset/dataset_repository.py +4 -0
- ingestify/domain/models/dataset/dataset_state.py +5 -0
- ingestify/domain/models/dataset/events.py +13 -0
- ingestify/domain/models/dataset/file.py +7 -1
- ingestify/domain/models/dataset/selector.py +8 -1
- ingestify/domain/models/event/event_bus.py +16 -1
- ingestify/domain/models/ingestion/ingestion_job.py +23 -4
- ingestify/domain/models/resources/dataset_resource.py +0 -1
- ingestify/infra/source/statsbomb/base.py +36 -0
- ingestify/infra/source/statsbomb/match.py +137 -0
- ingestify/infra/source/statsbomb_github.py +46 -44
- ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
- ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
- ingestify/main.py +190 -10
- ingestify/utils.py +2 -32
- ingestify-0.8.0.dist-info/METADATA +257 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/RECORD +28 -36
- ingestify/infra/source/wyscout.py +0 -175
- ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
- ingestify/static/templates/statsbomb_github/database/README.md +0 -1
- ingestify/static/templates/statsbomb_github/query.py +0 -14
- ingestify/static/templates/wyscout/.env +0 -5
- ingestify/static/templates/wyscout/.gitignore +0 -2
- ingestify/static/templates/wyscout/README.md +0 -0
- ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
- ingestify/static/templates/wyscout/database/README.md +0 -1
- ingestify/static/templates/wyscout/query.py +0 -14
- ingestify-0.6.4.dist-info/METADATA +0 -266
- /ingestify/{static/templates/statsbomb_github/README.md → infra/source/statsbomb/__init__.py} +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/WHEEL +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/top_level.txt +0 -0
ingestify/utils.py
CHANGED
@@ -5,13 +5,11 @@ import re
 import traceback
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
-from multiprocessing import get_context, cpu_count, get_all_start_methods

 from datetime import datetime, timezone
 from string import Template
 from typing import Dict, Tuple, Optional, Any, List

-import cloudpickle
 from pydantic import Field
 from typing_extensions import Self

@@ -75,8 +73,8 @@ class AttributeBag:
         return Template(string).substitute(**self.attributes)

     def matches(self, attributes: Dict) -> bool:
-        for k, v in
-        if attributes.
+        for k, v in attributes.items():
+            if k in self.attributes and self.attributes[k] != v:
                 return False
         return True

@@ -110,34 +108,6 @@ class AttributeBag:
         )


-def cloud_unpack_and_call(args):
-    f_pickled, org_args = args
-
-    f = cloudpickle.loads(f_pickled)
-    return f(org_args)
-
-
-def map_in_pool(func, iterable, processes=0):
-    # TODO: move to cmdline
-    if os.environ.get("INGESTIFY_RUN_EAGER") == "true":
-        return list(map(func, iterable))
-
-    if not processes:
-        processes = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
-
-    if "fork" in get_all_start_methods():
-        ctx = get_context("fork")
-    else:
-        ctx = get_context("spawn")
-
-    wrapped_fn = cloudpickle.dumps(func)
-
-    with ctx.Pool(processes or cpu_count()) as pool:
-        return pool.map(
-            cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
-        )
-
-
 class SyncExecutor:
     def map(self, func, iterable):
         return [func(item) for item in iterable]
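The corrected `matches()` above is easy to misread, so here is a minimal standalone sketch (an editor's illustration, not part of the package diff) of the same logic: a key only disqualifies a match when it is present on both sides with conflicting values, while keys absent from the bag are ignored.

```python
from typing import Dict

class Bag:
    """Stand-in for ingestify's AttributeBag, reduced to the matches() logic."""

    def __init__(self, **attributes):
        self.attributes = attributes

    def matches(self, attributes: Dict) -> bool:
        # Only a key present on both sides with different values disqualifies.
        for k, v in attributes.items():
            if k in self.attributes and self.attributes[k] != v:
                return False
        return True

bag = Bag(competition_id=11, season_id=90)
assert bag.matches({"competition_id": 11})      # same value -> True
assert not bag.matches({"competition_id": 12})  # conflicting value -> False
assert bag.matches({"match_id": 123})           # key not on the bag -> True
```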
ingestify-0.8.0.dist-info/METADATA
ADDED
@@ -0,0 +1,257 @@
+Metadata-Version: 2.1
+Name: ingestify
+Version: 0.8.0
+Summary: Data Ingestion Framework
+Author: Koen Vossen
+Author-email: info@koenvossen.nl
+License: AGPL
+Description-Content-Type: text/markdown
+Requires-Dist: requests<3,>=2.0.0
+Requires-Dist: SQLAlchemy<3,>=2
+Requires-Dist: click>=8
+Requires-Dist: python-dotenv
+Requires-Dist: pyaml-env
+Requires-Dist: boto3
+Requires-Dist: pydantic>=2.0.0
+Provides-Extra: test
+Requires-Dist: pytest<7,>=6.2.5; extra == "test"
+Requires-Dist: pytz; extra == "test"
+
+# Ingestify
+
+_Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
+
+---
+
+## Why Ingestify?
+
+Football‑data APIs are often **slow**, **rate‑limited** or just **down**. One parsing bug and you're forced to pull tens of gigabytes again.
+Ingestify fixes that by building **your own data lake** of untouched provider files and fetching only what's new:
+
+* **Own your lake** – The first time you ask for a match, Ingestify downloads the original files (metadata, line‑ups, events, tracking, video) and stores them untouched on local disk, S3, GCS… every later query hits *your* lake, not the provider.
+* **Never re‑fetch the world** – A file‑level checksum / timestamp check moves only changed bundles across the wire.
+* **Atomic, complete packages** – A *Dataset* is all‑or‑nothing:
+
+| Dataset type | Always contains |
+|--------------|-----------------|
+| **Match Dataset** | metadata + line‑ups + events |
+| **Tracking Dataset** | metadata + raw tracking frames |
+
+You never analyse events v2 with lineups v1, or yesterday's first half with today's second half.
+* **Query while ingesting** – Datasets stream out of the engine the moment their files land, so notebooks or downstream services can start before the full season is in.
+
+---
+
+## The Ingestify Workflow
+<img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
+
+---
+
+## What you gain
+
+### For football‑analytics practitioners
+
+| Pain | Ingestify fix |
+|------|---------------|
+| API slowness / downtime | One request → lake; retries and parallelism happen behind the scenes. |
+| Full re‑ingest after a bug | File‑level deltas mean you fetch only the corrected bundles. |
+| Partial / drifting data | A Dataset is atomic, versioned, and validated before it becomes visible. |
+| Waiting hours for a season to sync | Stream each Dataset as soon as it lands; analyse while you ingest. |
+| Boilerplate joins | `engine.load_dataset_with_kloppy(dataset)` → analysis‑ready object. |
+
+### For software engineers
+
+| Need | How Ingestify helps |
+|------|---------------------|
+| **Domain‑Driven Design** | `Dataset`, `Revision`, `Selector` plus rich domain events read like the problem space. |
+| **Event‑driven integrations** | Subscribe to `RevisionAdded` and push to Kafka, AWS Lambda, Airflow… |
+| **Pluggable everything** | Swap `Source`, `FetchPolicy`, `DatasetStore` subclasses to add providers, change delta logic, or move storage back‑ends. |
+| **Safety & speed** | Multiprocessing downloader with temp‑dir commits – no half‑written matches; near‑linear I/O speed‑ups. |
+| **Any file type** | JSON, CSV, MP4, proprietary binaries – stored verbatim so you parse / transcode later under version control. |
+
+---
+
+## Quick start
+
+```bash
+pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
+```
+
+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to a production config!
+
+### Minimal `config.yaml`
+
+```yaml
+main:
+  metadata_url: sqlite:///database/catalog.db # where revision metadata lives
+  file_url: file://database/files/ # where raw files live
+  default_bucket: main
+
+sources:
+  statsbomb:
+    type: ingestify.statsbomb_github # open‑data provider
+
+ingestion_plans:
+  - source: statsbomb
+    dataset_type: match
+    # selectors can narrow the scope
+    # selectors:
+    #   - competition_id: 11
+    #     season_id: [90]
+```
+
+### First ingest
+
+When you have configured event subscribers, all domain events are dispatched to them. Publishing the events to
+Kafka, RabbitMQ or any other system becomes trivial.
+
+```bash
+mkdir -p database
+pip install kloppy
+
+ingestify run # fills your data lake
+```
+
+---
+
+## Using the data
+
+By default, Ingestify searches your DatasetStore when you request data. You can pass several filters to fetch only what you need.
+
+```python
+from ingestify.main import get_engine
+
+engine = get_engine("config.yaml")
+
+for dataset in engine.iter_datasets(
+    dataset_state="complete",
+    provider="statsbomb",
+    dataset_type="match",
+    competition_id=11,
+    season_id=90):
+    df = (
+        engine
+        .load_dataset_with_kloppy(dataset)
+        .to_df(engine="polars")
+    )
+    df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
+```
+
+#### Auto Ingestion
+
+When you don't want to use an event-driven architecture but just want to work with the latest data, Ingestify has you covered. With the `auto_ingest` option, Ingestify syncs the data in the background when you ask for the data.
+
+
+```python
+from ingestify.main import get_engine
+
+engine = get_engine("config.yaml")
+
+for dataset in engine.iter_datasets(
+    # When set to True it will first do a full sync and then start yielding datasets
+    auto_ingest=True,
+
+    # With streaming enabled all Datasets are yielded when they are up-to-date (not changed, or refetched)
+    # auto_ingest={"streaming": True}
+
+    dataset_state="complete",
+    provider="statsbomb",
+    dataset_type="match",
+    competition_id=11,
+    season_id=90):
+    df = (
+        engine
+        .load_dataset_with_kloppy(dataset)
+        .to_df(engine="polars")
+    )
+    df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
+```
+
+#### Open data
+
+Ingestify has built-in support for StatsBomb Open Data (more to come).
+
+```shell
+mkdir database_open_data
+pip install kloppy
+```
+
+```python
+import logging, sys
+
+from ingestify.main import get_engine
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
+engine = get_engine(
+    metadata_url="sqlite:///database_open_data/catalog.db",
+    file_url="file://database_open_data/files/"
+)
+
+dataset_iter = engine.iter_datasets(
+    # This will tell Ingestify to look for an Open Data provider
+    auto_ingest={"use_open_data": True, "streaming": True},
+
+    provider="statsbomb",
+    dataset_type="match",
+    competition_id=43,  # "FIFA World Cup"
+    # season_id=281
+)
+
+for dataset in dataset_iter:
+    kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
+```
+
+
+---
+
+## Roadmap
+
+* Workflow orchestration helpers (Airflow, Dagster, Prefect)
+* Built‑in Kafka / Kinesis event emitters
+* Streaming data ingestion
+* Data quality hooks (Great Expectations)
+
+---
+
+**Stop refetching the world. Own your data lake, keep it version‑safe, and analyse football faster with Ingestify.**
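The README above says domain events are dispatched to configured subscribers (e.g. `RevisionAdded` pushed to Kafka or RabbitMQ), but this diff never shows the subscriber interface itself. The sketch below is therefore only an editor's illustration of the pattern under assumed names: `EventBus`, its methods, and the event fields are stand-ins, not ingestify's real API (only the event name `RevisionAdded` and the existence of `event_bus.py` / `subscriber.py` are confirmed by the diff).

```python
from collections import defaultdict
from dataclasses import dataclass

@dataclass
class RevisionAdded:  # real event name from the README; these fields are illustrative
    dataset_id: str
    revision_id: int

class EventBus:  # stand-in for ingestify's event bus, not its real interface
    def __init__(self):
        self._subscribers = defaultdict(list)  # event type -> list of handlers

    def register(self, event_type, handler):
        self._subscribers[event_type].append(handler)

    def dispatch(self, event):
        # Every handler registered for this event type receives the event.
        for handler in self._subscribers[type(event)]:
            handler(event)

def push_to_queue(event):
    # A real subscriber would call a Kafka/RabbitMQ producer here.
    print(f"publish: revision {event.revision_id} added to {event.dataset_id}")

bus = EventBus()
bus.register(RevisionAdded, push_to_queue)
bus.dispatch(RevisionAdded(dataset_id="match/1234", revision_id=2))
```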
{ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
-ingestify/__init__.py,sha256=
-ingestify/cmdline.py,sha256=
+ingestify/__init__.py,sha256=FeK7pau-iTc6ooJiPelblIhkrPLojVHKpTHXIrkdpq8,336
+ingestify/cmdline.py,sha256=Rs1_lSKSIJrcygH5fvtOGicOl_e0sZYW7deqp4_jGbY,6233
 ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
-ingestify/main.py,sha256=
+ingestify/main.py,sha256=WjhcsT21F7dOibrg_S7wRiui6Ytj5ScsWqMCGuv9fs8,14938
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=
+ingestify/utils.py,sha256=tsoo-GgeSrwK161WCqW793BAm5bjvnGwI8yGgLTJ1lk,6486
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/application/dataset_store.py,sha256=
-ingestify/application/ingestion_engine.py,sha256=
-ingestify/application/loader.py,sha256=
+ingestify/application/dataset_store.py,sha256=GP6wGjVirefEn6hlqWIkOBqdELad9L_mmTpdHdzj18M,20353
+ingestify/application/ingestion_engine.py,sha256=we16yiDS9QGOlAUiP1vidDycihjWK3B2jo64uqKmrXE,11246
+ingestify/application/loader.py,sha256=K99ZJuHMEJFO6CIlxoyHKGSQtXw63JgOYu3moUD6sR0,13400
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
-ingestify/domain/models/__init__.py,sha256=
+ingestify/domain/models/__init__.py,sha256=WuKS34uiR1EwyczKujBHYGupqseJP-U2P5IQS4kpsA8,838
 ingestify/domain/models/base.py,sha256=4gKbREajxJHlS-VwKoosNtHVupZ4eDLKMqnJ4ib0aS8,184
 ingestify/domain/models/data_spec_version_collection.py,sha256=x5BvBnVI9QAfqhjCrUK19HKAiujdU1m8PkbQZwDheFU,1338
 ingestify/domain/models/fetch_policy.py,sha256=I-DnIHI_0bYlD0vpKJ58Z6he85pXvjdqXkVQA8axJ8Y,1461
@@ -19,31 +19,31 @@ ingestify/domain/models/sink.py,sha256=OBVfFMpB7puJmHg4q2KYx4qgoAnlmX8xKWYnPi8a9
 ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvfaYg4,973
 ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0TRHlA,388
 ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
-ingestify/domain/models/dataset/collection.py,sha256=
+ingestify/domain/models/dataset/collection.py,sha256=YKGQv6hqm88MYlNp2c47CoWysyNZWCeZwTkwCVpQcaE,1055
 ingestify/domain/models/dataset/collection_metadata.py,sha256=aWY6O3_JLj_jKfVfUTjmi3-E4heBmmmtqX81vhdzr0I,498
 ingestify/domain/models/dataset/dataset.py,sha256=OiP03nY0-m06y2GTrs_m-RiZE8HwypIHRwSqoM_DNnQ,4049
-ingestify/domain/models/dataset/dataset_repository.py,sha256=
-ingestify/domain/models/dataset/dataset_state.py,sha256=
-ingestify/domain/models/dataset/events.py,sha256=
-ingestify/domain/models/dataset/file.py,sha256=
+ingestify/domain/models/dataset/dataset_repository.py,sha256=bf3F_1cKw0CvUberD3FMROE8iowAmYefnD4L6aPB39k,989
+ingestify/domain/models/dataset/dataset_state.py,sha256=IaYG02WzgooGaM_AuwRhZgljs-9NhCF_LpBZXkl5ELY,324
+ingestify/domain/models/dataset/events.py,sha256=M8jrHWCm9iXapAy3xjvZZtiiOxXDnfefBixiMwkas24,786
+ingestify/domain/models/dataset/file.py,sha256=cXDjSw19HRMCGFpVN4u1oejxE1V8SMQptfNVDVixj6o,4464
 ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
 ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
 ingestify/domain/models/dataset/revision.py,sha256=e8-NsRS8AILrNjWqCxqANF55oY091CN3fBmIiVS9wz0,2049
-ingestify/domain/models/dataset/selector.py,sha256=
+ingestify/domain/models/dataset/selector.py,sha256=qGRA22gDAHhjDAhMWzOjZPz3Rrs1V-DZ32z75NARoTQ,1448
 ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
 ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
 ingestify/domain/models/event/dispatcher.py,sha256=5WnyUJ7Qzr612btAtl1dMG9JBXDPcsBLyLmW6H7Q1zk,154
 ingestify/domain/models/event/domain_event.py,sha256=OR6va417j2lisRr0gjQZ9rshAtlys5sVu7KU-W0r0xA,316
-ingestify/domain/models/event/event_bus.py,sha256=
+ingestify/domain/models/event/event_bus.py,sha256=feVXsbBcRNkbWYvXbmz-Yi9-3R690ymc9KkpejkfLxg,911
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256=
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=KaKpAu0XKvWV1YoWaTlOjbapcs-CCAvOHlSxUHZxZwI,15450
 ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=7dmkWEvE7lMSv1ILWcSvys1bUGuGe_s-YbOFC6eYMBI,4794
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
-ingestify/domain/models/resources/dataset_resource.py,sha256=
+ingestify/domain/models/resources/dataset_resource.py,sha256=Le_C4nPzPPTDq75_amKSNsR94QvVWdZ_ZkjYIKa6whM,3084
 ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
 ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
 ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
@@ -59,29 +59,21 @@ ingestify/infra/serialization/__init__.py,sha256=UqXWJmKTp7Mi58ZyDASGguPFlqdVWVU
 ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
 ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/infra/source/statsbomb_github.py,sha256=
-ingestify/infra/source/
+ingestify/infra/source/statsbomb_github.py,sha256=KHpl3Ojw2ZEkMdyLh1VOLIqrz6blHWldTLpsSgXyf-M,3773
+ingestify/infra/source/statsbomb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestify/infra/source/statsbomb/base.py,sha256=f486brtGdK_zPipHAtmVpnp7gcYdPSV28iTUqsBxldA,1155
+ingestify/infra/source/statsbomb/match.py,sha256=8Zpdys6-bB_ral2AmjGKhF4BnXW3F0Y0C5aWnhxcWAY,5525
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=
-ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=cMmhXqUNp_HUg_IgsUlJ439VXX_H67pnivaToUlqlA4,22552
+ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=ffHop9DQeVE9JrCMLJ2EvF7MD7j8thfjVwv2xcsbJtY,10954
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
 ingestify/infra/store/file/s3_file_repository.py,sha256=tz_EZ_gun7W2qJMlI3j_R03iKBZlJSDcG7AUJ1JkdpE,1501
-ingestify/
-ingestify
-ingestify/
-ingestify/
-ingestify/
-ingestify/static/templates/wyscout/.gitignore,sha256=db0A2IjIeZf5fLLwXKD-bLmC4pETofxm848bljymnNs,13
-ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
-ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
-ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.6.4.dist-info/METADATA,sha256=g34iFogx4pOE2FYe2wbNZg9TwH_ufGSBOSrodty--NU,18854
-ingestify-0.6.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-ingestify-0.6.4.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
-ingestify-0.6.4.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
-ingestify-0.6.4.dist-info/RECORD,,
+ingestify-0.8.0.dist-info/METADATA,sha256=rpC2ALX0e4Ii-XzhJWmRKfW7YoBgl6gEpP2cUGFlQp4,8089
+ingestify-0.8.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.8.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.8.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.8.0.dist-info/RECORD,,
ingestify/infra/source/wyscout.py
DELETED
@@ -1,175 +0,0 @@
-import json
-from typing import Optional, Dict, List
-
-import requests
-
-from ingestify import Source, retrieve_http
-from ingestify.domain import DraftFile
-from ingestify.exceptions import ConfigurationError
-
-BASE_URL = "https://apirest.wyscout.com/v3"
-
-
-def wyscout_pager_fn(url, response):
-    if response["meta"]["page_current"] < response["meta"]["page_count"]:
-        return f"{url}&page={response['meta']['page_current'] + 1}"
-    else:
-        return None
-
-
-class Wyscout(Source):
-    def discover_selectors(self, dataset_type: str) -> List[Dict]:
-        raise NotImplementedError("Not implemented for Wyscout")
-
-    provider = "wyscout"
-
-    def __init__(self, name: str, username: str, password: str):
-        super().__init__(name)
-
-        self.username = username.strip()
-        self.password = password.strip()
-
-        if not self.username:
-            raise ConfigurationError(
-                f"Username of Wyscout source named '{self.name}' cannot be empty"
-            )
-
-        if not self.password:
-            raise ConfigurationError(
-                f"Username of Wyscout source named '{self.name}' cannot be empty"
-            )
-
-    def _get(self, path: str):
-        response = requests.get(
-            BASE_URL + path,
-            auth=(self.username, self.password),
-        )
-        if response.status_code == 400:
-            # What if the response isn't a json?
-            error = response.json()["error"]
-            raise ConfigurationError(
-                f"Check username/password of Wyscout source named '{self.name}'. API response "
-                f"was '{error['message']}' ({error['code']})."
-            )
-
-        response.raise_for_status()
-        return response.json()
-
-    def _get_paged(self, path: str, data_path: str):
-        data = []
-        current_page = 1
-        page_count = None
-        while page_count is None or current_page <= page_count:
-            page_data = self._get(path + f"?page={current_page}&limit=100")
-            page_count = page_data["meta"]["page_count"]
-
-            data.extend(page_data[data_path])
-            current_page += 1
-
-        return data
-
-    def discover_datasets(self, dataset_type: str, season_id: int):
-        matches = self._get(f"/seasons/{season_id}/matches")
-        datasets = []
-        for match in matches["matches"]:
-            dataset = dict(match_id=match["matchId"], version="v3", _metadata=match)
-            datasets.append(dataset)
-
-        return datasets
-
-    def fetch_dataset_files(
-        self, dataset_type, identifier, current_version
-    ) -> Dict[str, Optional[DraftFile]]:
-        current_files = current_version.modified_files_map if current_version else {}
-        files = {}
-
-        for filename, url in [
-            (
-                "events.json",
-                f"{BASE_URL}/matches/{identifier.match_id}/events?fetch=teams,players",
-            ),
-        ]:
-            files[filename] = retrieve_http(
-                url, current_files.get(filename), auth=(self.username, self.password)
-            )
-        return files
-
-
-#
-# class WyscoutEvent(Wyscout):
-#     dataset_type = "event"
-#
-#     def discover_datasets(self, season_id: int):
-#         matches = self._get(f"/seasons/{season_id}/matches")
-#         datasets = []
-#         for match in matches["matches"]:
-#             dataset = dict(match_id=match["matchId"], version="v3", _metadata=match)
-#             datasets.append(dataset)
-#
-#         return datasets
-#
-#     def fetch_dataset_files(
-#         self, identifier, current_version
-#     ) -> Dict[str, Optional[DraftFile]]:
-#         current_files = current_version.modified_files_map if current_version else {}
-#         files = {}
-#
-#         for filename, url in [
-#             (
-#                 "events.json",
-#                 f"{BASE_URL}/matches/{identifier.match_id}/events?fetch=teams,players",
-#             ),
-#         ]:
-#             files[filename] = retrieve_http(
-#                 url, current_files.get(filename), auth=(self.username, self.password)
-#             )
-#         return files
-#
-#
-# class WyscoutPlayer(Wyscout):
-#     dataset_type = "player"
-#
-#     def discover_datasets(self, season_id: int):
-#         return [
-#             dict(
-#                 version="v3",
-#             )
-#         ]
-#
-#     def fetch_dataset_files(
-#         self, identifier, current_version
-#     ) -> Dict[str, Optional[DraftFile]]:
-#         current_files = current_version.modified_files_map if current_version else {}
-#
-#         return {
-#             "players.json": retrieve_http(
-#                 f"{BASE_URL}/seasons/{identifier.season_id}/players?limit=100",
-#                 current_files.get("players.json"),
-#                 pager=("players", wyscout_pager_fn),
-#                 auth=(self.username, self.password),
-#             )
-#         }
-
-
-if __name__ == "__main__":
-    import dotenv, os
-
-    dotenv.load_dotenv()
-
-    kilmarnock_id = 8516
-    competition_id = 750
-    season_id = 188105
-    match_id = 5459107
-    player_id = 840543
-
-    data = requests.get(
-        f"{BASE_URL}/competitions/{competition_id}/players",
-        # f"{BASE_URL}/players/{player_id}/career",
-        # f"{BASE_URL}/matches/{match_id}/advancedstats/players",
-        # f"{BASE_URL}/competitions/{competition_id}/matches",  # teams/{kilmarnock_id}/advancedstats?compId={competition_id}",
-        # f"{BASE_URL}/teams/{kilmarnock_id}/squad", #teams/{kilmarnock_id}/advancedstats?compId={competition_id}",
-        auth=(os.environ["WYSCOUT_USERNAME"], os.environ["WYSCOUT_PASSWORD"]),
-    ).json()
-    from pprint import pprint
-
-    pprint(data)
ingestify/static/templates/statsbomb_github/config.yaml.jinja2
DELETED
@@ -1,19 +0,0 @@
-ingestify_version: {{ ingestify_version }}
-
-main:
-  dataset_url: sqlite:///database/catalog.db
-  file_url: file://database/files/
-  default_bucket: main
-
-sources:
-  statsbomb:
-    type: ingestify.statsbomb_github
-
-extract_jobs:
-  - source: statsbomb
-    selectors:
-      - competition_id: 11
-        season_id: [42, 90]
-
-    # passing an empty selector means: fetch everything
-    # -
ingestify/static/templates/statsbomb_github/database/README.md
DELETED
@@ -1 +0,0 @@
-# This will contain the database
ingestify/static/templates/statsbomb_github/query.py
DELETED
@@ -1,14 +0,0 @@
-from ingestify.main import get_datastore
-
-
-def main():
-    store = get_datastore("config.yaml")
-    dataset_collection = store.get_dataset_collection()
-
-    for dataset in dataset_collection:
-        kloppy_dataset = store.load_with_kloppy(dataset)
-        print(f"Loaded dataset with {len(kloppy_dataset.records)} events")
-
-
-if __name__ == "__main__":
-    main()
File without changes
ingestify/static/templates/wyscout/config.yaml.jinja2
DELETED
@@ -1,18 +0,0 @@
-ingestify_version: {{ ingestify_version }}
-
-main:
-  dataset_url: sqlite:///database/catalog.db
-  file_url: file://database/files/
-  default_bucket: main
-
-sources:
-  wyscout:
-    type: ingestify.wyscout
-    configuration:
-      username: !ENV ${WYSCOUT_USERNAME}
-      password: !ENV ${WYSCOUT_PASSWORD}
-
-extract_jobs:
-  - source: wyscout
-    selectors:
-      - season_id: 188105
ingestify/static/templates/wyscout/database/README.md
DELETED
@@ -1 +0,0 @@
-# This will contain the database