ingestify 0.6.4__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. ingestify/__init__.py +2 -1
  2. ingestify/application/dataset_store.py +228 -11
  3. ingestify/application/ingestion_engine.py +232 -7
  4. ingestify/application/loader.py +163 -28
  5. ingestify/cmdline.py +0 -48
  6. ingestify/domain/models/__init__.py +2 -0
  7. ingestify/domain/models/dataset/collection.py +0 -9
  8. ingestify/domain/models/dataset/dataset_repository.py +4 -0
  9. ingestify/domain/models/dataset/dataset_state.py +5 -0
  10. ingestify/domain/models/dataset/events.py +13 -0
  11. ingestify/domain/models/dataset/file.py +7 -1
  12. ingestify/domain/models/dataset/selector.py +8 -1
  13. ingestify/domain/models/event/event_bus.py +16 -1
  14. ingestify/domain/models/ingestion/ingestion_job.py +23 -4
  15. ingestify/domain/models/resources/dataset_resource.py +0 -1
  16. ingestify/infra/source/statsbomb/base.py +36 -0
  17. ingestify/infra/source/statsbomb/match.py +137 -0
  18. ingestify/infra/source/statsbomb_github.py +46 -44
  19. ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
  20. ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
  21. ingestify/main.py +190 -10
  22. ingestify/utils.py +2 -32
  23. ingestify-0.8.0.dist-info/METADATA +257 -0
  24. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/RECORD +28 -36
  25. ingestify/infra/source/wyscout.py +0 -175
  26. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
  27. ingestify/static/templates/statsbomb_github/database/README.md +0 -1
  28. ingestify/static/templates/statsbomb_github/query.py +0 -14
  29. ingestify/static/templates/wyscout/.env +0 -5
  30. ingestify/static/templates/wyscout/.gitignore +0 -2
  31. ingestify/static/templates/wyscout/README.md +0 -0
  32. ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
  33. ingestify/static/templates/wyscout/database/README.md +0 -1
  34. ingestify/static/templates/wyscout/query.py +0 -14
  35. ingestify-0.6.4.dist-info/METADATA +0 -266
  36. /ingestify/{static/templates/statsbomb_github/README.md → infra/source/statsbomb/__init__.py} +0 -0
  37. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/WHEEL +0 -0
  38. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/entry_points.txt +0 -0
  39. {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/top_level.txt +0 -0
ingestify/utils.py CHANGED
@@ -5,13 +5,11 @@ import re
5
5
  import traceback
6
6
  from concurrent.futures import ThreadPoolExecutor
7
7
  from contextlib import contextmanager
8
- from multiprocessing import get_context, cpu_count, get_all_start_methods
9
8
 
10
9
  from datetime import datetime, timezone
11
10
  from string import Template
12
11
  from typing import Dict, Tuple, Optional, Any, List
13
12
 
14
- import cloudpickle
15
13
  from pydantic import Field
16
14
  from typing_extensions import Self
17
15
 
@@ -75,8 +73,8 @@ class AttributeBag:
75
73
  return Template(string).substitute(**self.attributes)
76
74
 
77
75
  def matches(self, attributes: Dict) -> bool:
78
- for k, v in self.attributes.items():
79
- if attributes.get(k) != v:
76
+ for k, v in attributes.items():
77
+ if k in self.attributes and self.attributes[k] != v:
80
78
  return False
81
79
  return True
82
80
 
@@ -110,34 +108,6 @@ class AttributeBag:
110
108
  )
111
109
 
112
110
 
113
- def cloud_unpack_and_call(args):
114
- f_pickled, org_args = args
115
-
116
- f = cloudpickle.loads(f_pickled)
117
- return f(org_args)
118
-
119
-
120
- def map_in_pool(func, iterable, processes=0):
121
- # TODO: move to cmdline
122
- if os.environ.get("INGESTIFY_RUN_EAGER") == "true":
123
- return list(map(func, iterable))
124
-
125
- if not processes:
126
- processes = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
127
-
128
- if "fork" in get_all_start_methods():
129
- ctx = get_context("fork")
130
- else:
131
- ctx = get_context("spawn")
132
-
133
- wrapped_fn = cloudpickle.dumps(func)
134
-
135
- with ctx.Pool(processes or cpu_count()) as pool:
136
- return pool.map(
137
- cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
138
- )
139
-
140
-
141
111
  class SyncExecutor:
142
112
  def map(self, func, iterable):
143
113
  return [func(item) for item in iterable]
@@ -0,0 +1,257 @@
1
+ Metadata-Version: 2.1
2
+ Name: ingestify
3
+ Version: 0.8.0
4
+ Summary: Data Ingestion Framework
5
+ Author: Koen Vossen
6
+ Author-email: info@koenvossen.nl
7
+ License: AGPL
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: requests<3,>=2.0.0
10
+ Requires-Dist: SQLAlchemy<3,>=2
11
+ Requires-Dist: click>=8
12
+ Requires-Dist: python-dotenv
13
+ Requires-Dist: pyaml-env
14
+ Requires-Dist: boto3
15
+ Requires-Dist: pydantic>=2.0.0
16
+ Provides-Extra: test
17
+ Requires-Dist: pytest<7,>=6.2.5; extra == "test"
18
+ Requires-Dist: pytz; extra == "test"
19
+
20
+ # Ingestify
21
+
22
+ _Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
23
+
24
+ ---
25
+
26
+ ## Why Ingestify?
27
+
28
+ Football‐data APIs are often **slow**, **rate‑limited** or just **down**. One parsing bug and you’re forced to pull tens of gigabytes again.
29
+ Ingestify fixes that by building **your own data lake** of untouched provider files and fetching only what’s new:
30
+
31
+ * **Own your lake** – The first time you ask for a match, Ingestify downloads the original files (metadata, line‑ups, events, tracking, video) and stores them untouched on local disk, S3, GCS… every later query hits *your* lake, not the provider.
32
+ * **Never re‑fetch the world** – A file‑level checksum / timestamp check moves only changed bundles across the wire.
33
+ * **Atomic, complete packages** – A *Dataset* is all‑or‑nothing:
34
+
35
+ | Dataset type | Always contains |
36
+ |--------------|-----------------|
37
+ | **Match Dataset** | metadata + line‑ups + events |
38
+ | **Tracking Dataset** | metadata + raw tracking frames |
39
+
40
+ You never analyse events v2 with lineups v1, or yesterday’s first half with today’s second half.
41
+ * **Query while ingesting** – Datasets stream out of the engine the moment their files land, so notebooks or downstream services can start before the full season is in.
42
+
43
+ ---
44
+
45
+ ## The Ingestify Workflow
46
+ <img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
47
+
48
+ ---
49
+
50
+ ## What you gain
51
+
52
+ ### For football‑analytics practitioners
53
+
54
+ | Pain | Ingestify fix |
55
+ |------|---------------|
56
+ | API slowness / downtime | One request → lake; retries and parallelism happen behind the scenes. |
57
+ | Full re‑ingest after a bug | File‑level deltas mean you fetch only the corrected bundles. |
58
+ | Partial / drifting data | Dataset is atomic, versioned, and validated before it becomes visible. |
59
+ | Waiting hours for a season to sync | Stream each Dataset as soon as it lands; analyse while you ingest. |
60
+ | Boilerplate joins | `engine.load_dataset_with_kloppy(dataset)` → analysis‑ready object. |
61
+
62
+ ### For software engineers
63
+
64
+ | Need | How Ingestify helps |
65
+ |------|---------------------|
66
+ | **Domain‑Driven Design** | `Dataset`, `Revision`, `Selector` plus rich domain events read like the problem space. |
67
+ | **Event‑driven integrations** | Subscribe to `RevisionAdded` and push to Kafka, AWS Lambda, Airflow… |
68
+ | **Pluggable everything** | Swap `Source`, `FetchPolicy`, `DatasetStore` subclasses to add providers, change delta logic, or move storage back‑ends. |
69
+ | **Safety & speed** | Multiprocessing downloader with temp‑dir commits – no half‑written matches; near‑linear I/O speed‑ups. |
70
+ | **Any file type** | JSON, CSV, MP4, proprietary binaries – stored verbatim so you parse / transcode later under version control. |
71
+
72
+ ---
73
+
74
+ ## Quick start
75
+
76
+ ```bash
77
+ pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
78
+ ```
79
+
80
+ ### Developing a new Source
81
+
82
+ When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
83
+
84
+ ```python
85
+ from ingestify import Source, debug_source
86
+
87
+ class MyCustomSource(Source):
88
+ provider = "my_provider"
89
+
90
+ def __init__(self, name: str, api_key: str):
91
+ super().__init__(name)
92
+ self.api_key = api_key
93
+
94
+ def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
95
+ # Your source implementation
96
+ ...
97
+
98
+ # Quick debug - runs full ingestion with temp storage
99
+ if __name__ == "__main__":
100
+ source = MyCustomSource(name="test", api_key="...")
101
+
102
+ debug_source(
103
+ source,
104
+ dataset_type="match",
105
+ data_spec_versions={"events": "v1"},
106
+ )
107
+ ```
108
+
109
+ The `debug_source()` helper:
110
+ - ✅ Creates an ephemeral dev engine with temp storage
111
+ - ✅ Configures logging automatically
112
+ - ✅ Runs the full ingestion cycle
113
+ - ✅ Shows storage location and results
114
+
115
+ Perfect for testing your source before adding it to production config!
116
+
117
+ ### Minimal `config.yaml`
118
+
119
+ ```yaml
120
+ main:
121
+ metadata_url: sqlite:///database/catalog.db # where revision metadata lives
122
+ file_url: file://database/files/ # where raw files live
123
+ default_bucket: main
124
+
125
+ sources:
126
+ statsbomb:
127
+ type: ingestify.statsbomb_github # open‑data provider
128
+
129
+ ingestion_plans:
130
+ - source: statsbomb
131
+ dataset_type: match
132
+ # selectors can narrow the scope
133
+ # selectors:
134
+ # - competition_id: 11
135
+ # season_id: [90]
136
+ ```
137
+
138
+ ### First ingest
139
+
140
+ When you have configured event subscribers, all domain events are dispatched to those subscribers. Publishing the events to
141
+ Kafka, RabbitMQ or any other system becomes trivial.
142
+
143
+ ```bash
144
+ mkdir -p database
145
+ pip install kloppy
146
+
147
+ ingestify run # fills your data lake
148
+ ```
149
+
150
+ ---
151
+
152
+ ## Using the data
153
+
154
+ By default, Ingestify will search in your DatasetStore when you request data. You can pass several filters to only fetch what you need.
155
+
156
+ ```python
157
+ from ingestify.main import get_engine
158
+
159
+ engine = get_engine("config.yaml")
160
+
161
+ for dataset in engine.iter_datasets(
162
+ dataset_state="complete",
163
+ provider="statsbomb",
164
+ dataset_type="match",
165
+ competition_id=11,
166
+ season_id=90):
167
+ df = (
168
+ engine
169
+ .load_dataset_with_kloppy(dataset)
170
+ .to_df(engine="polars")
171
+ )
172
+ df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
173
+ ```
174
+
175
+ #### Auto Ingestion
176
+
177
+ If you don't want to use an event-driven architecture but just want to work with the latest data, Ingestify has you covered. With the `auto_ingest` option, Ingestify syncs the data in the background when you ask for it.
178
+
179
+
180
+ ```python
181
+ from ingestify.main import get_engine
182
+
183
+ engine = get_engine("config.yaml")
184
+
185
+ for dataset in engine.iter_datasets(
186
+ # When set to True it will first do a full sync and then start yielding datasets
187
+ auto_ingest=True,
188
+
189
+ # With streaming enabled, all Datasets are yielded when they are up-to-date (unchanged, or refetched)
190
+ # auto_ingest={"streaming": True}
191
+
192
+ dataset_state="complete",
193
+ provider="statsbomb",
194
+ dataset_type="match",
195
+ competition_id=11,
196
+ season_id=90):
197
+ df = (
198
+ engine
199
+ .load_dataset_with_kloppy(dataset)
200
+ .to_df(engine="polars")
201
+ )
202
+ df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
203
+ ```
204
+
205
+ #### Open data
206
+
207
+ Ingestify has built-in support for StatsBomb Open Data (more to come).
208
+
209
+ ```shell
210
+ mkdir database_open_data
211
+ pip install kloppy
212
+ ```
213
+
214
+ ```python
215
+ import logging, sys
216
+
217
+ from ingestify.main import get_engine
218
+
219
+ logging.basicConfig(
220
+ level=logging.INFO,
221
+ format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
222
+ stream=sys.stderr,
223
+ )
224
+
225
+ engine = get_engine(
226
+ metadata_url="sqlite:///database_open_data/catalog.db",
227
+ file_url="file://database_open_data/files/"
228
+ )
229
+
230
+ dataset_iter = engine.iter_datasets(
231
+ # This will tell ingestify to look for an Open Data provider
232
+ auto_ingest={"use_open_data": True, "streaming": True},
233
+
234
+ provider="statsbomb",
235
+ dataset_type="match",
236
+ competition_id=43, # "FIFA World Cup"
237
+ #season_id=281
238
+ )
239
+
240
+ for dataset in dataset_iter:
241
+ kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
242
+ logging.info(f"Loaded {kloppy_dataset}")
243
+ ```
244
+
245
+
246
+ ---
247
+
248
+ ## Roadmap
249
+
250
+ * Workflow orchestration helpers (Airflow, Dagster, Prefect)
251
+ * Built‑in Kafka / Kinesis event emitters
252
+ * Streaming data ingestion
253
+ * Data quality hooks (Great Expectations)
254
+
255
+ ---
256
+
257
+ **Stop refetching the world. Own your data lake, keep it version‑safe, and analyse football faster with Ingestify.**
@@ -1,17 +1,17 @@
1
- ingestify/__init__.py,sha256=6tuDYCbk78GEM1qI7Uy35u62q6nyriksCeJ7JcD7W98,301
2
- ingestify/cmdline.py,sha256=JcveX6e4i6mJtIllhTuruwbqxyoKIITIWE8kB6byvJU,7721
1
+ ingestify/__init__.py,sha256=FeK7pau-iTc6ooJiPelblIhkrPLojVHKpTHXIrkdpq8,336
2
+ ingestify/cmdline.py,sha256=Rs1_lSKSIJrcygH5fvtOGicOl_e0sZYW7deqp4_jGbY,6233
3
3
  ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
4
- ingestify/main.py,sha256=yYKA-4WAk04RdBCGmatsCKiPFQzpyufoG4VzHiWkVtU,8979
4
+ ingestify/main.py,sha256=WjhcsT21F7dOibrg_S7wRiui6Ytj5ScsWqMCGuv9fs8,14938
5
5
  ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
6
6
  ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
7
- ingestify/utils.py,sha256=EMdG3ZP3bX9DHxHvBLdkYLC3vcEVym7dmpIXQTikI3I,7281
7
+ ingestify/utils.py,sha256=tsoo-GgeSrwK161WCqW793BAm5bjvnGwI8yGgLTJ1lk,6486
8
8
  ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- ingestify/application/dataset_store.py,sha256=l5YrU5Xmk3N1t6_z2UYFjeYtCvo-gh_MGeEDjzZY8Wk,12506
10
- ingestify/application/ingestion_engine.py,sha256=4SAmPZDm3e2QA5jZvMrb6xz1eDDshKoSZDWH3TCe4Bo,2372
11
- ingestify/application/loader.py,sha256=OvlBBmCiQS3KTs5G7kBbxcP80WTfFxJZ-CXGsJJGH8M,7958
9
+ ingestify/application/dataset_store.py,sha256=GP6wGjVirefEn6hlqWIkOBqdELad9L_mmTpdHdzj18M,20353
10
+ ingestify/application/ingestion_engine.py,sha256=we16yiDS9QGOlAUiP1vidDycihjWK3B2jo64uqKmrXE,11246
11
+ ingestify/application/loader.py,sha256=K99ZJuHMEJFO6CIlxoyHKGSQtXw63JgOYu3moUD6sR0,13400
12
12
  ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
13
13
  ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
14
- ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
14
+ ingestify/domain/models/__init__.py,sha256=WuKS34uiR1EwyczKujBHYGupqseJP-U2P5IQS4kpsA8,838
15
15
  ingestify/domain/models/base.py,sha256=4gKbREajxJHlS-VwKoosNtHVupZ4eDLKMqnJ4ib0aS8,184
16
16
  ingestify/domain/models/data_spec_version_collection.py,sha256=x5BvBnVI9QAfqhjCrUK19HKAiujdU1m8PkbQZwDheFU,1338
17
17
  ingestify/domain/models/fetch_policy.py,sha256=I-DnIHI_0bYlD0vpKJ58Z6he85pXvjdqXkVQA8axJ8Y,1461
@@ -19,31 +19,31 @@ ingestify/domain/models/sink.py,sha256=OBVfFMpB7puJmHg4q2KYx4qgoAnlmX8xKWYnPi8a9
19
19
  ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvfaYg4,973
20
20
  ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0TRHlA,388
21
21
  ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
22
- ingestify/domain/models/dataset/collection.py,sha256=E2utQ6oyaFFrfQFMiwP9J_I7Wm21z0sRvE4Zc3QEs20,1310
22
+ ingestify/domain/models/dataset/collection.py,sha256=YKGQv6hqm88MYlNp2c47CoWysyNZWCeZwTkwCVpQcaE,1055
23
23
  ingestify/domain/models/dataset/collection_metadata.py,sha256=aWY6O3_JLj_jKfVfUTjmi3-E4heBmmmtqX81vhdzr0I,498
24
24
  ingestify/domain/models/dataset/dataset.py,sha256=OiP03nY0-m06y2GTrs_m-RiZE8HwypIHRwSqoM_DNnQ,4049
25
- ingestify/domain/models/dataset/dataset_repository.py,sha256=kUjiqW58kOUOli1gZCLR5xw4dBX0bqI1UJsf16hgNsQ,812
26
- ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4FVk_T-ZNUDezkvt7VzY,220
27
- ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
28
- ingestify/domain/models/dataset/file.py,sha256=g2inMsvM8ElVleuEjTWL5_eATWSFyfAHW09kj_5Df-0,4224
25
+ ingestify/domain/models/dataset/dataset_repository.py,sha256=bf3F_1cKw0CvUberD3FMROE8iowAmYefnD4L6aPB39k,989
26
+ ingestify/domain/models/dataset/dataset_state.py,sha256=IaYG02WzgooGaM_AuwRhZgljs-9NhCF_LpBZXkl5ELY,324
27
+ ingestify/domain/models/dataset/events.py,sha256=M8jrHWCm9iXapAy3xjvZZtiiOxXDnfefBixiMwkas24,786
28
+ ingestify/domain/models/dataset/file.py,sha256=cXDjSw19HRMCGFpVN4u1oejxE1V8SMQptfNVDVixj6o,4464
29
29
  ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
30
30
  ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
31
31
  ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
32
32
  ingestify/domain/models/dataset/revision.py,sha256=e8-NsRS8AILrNjWqCxqANF55oY091CN3fBmIiVS9wz0,2049
33
- ingestify/domain/models/dataset/selector.py,sha256=PYoy-nHitrrvFmYqSlLIFX4Xd2-qiKMcFz1hOnPvRtU,1289
33
+ ingestify/domain/models/dataset/selector.py,sha256=qGRA22gDAHhjDAhMWzOjZPz3Rrs1V-DZ32z75NARoTQ,1448
34
34
  ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
35
35
  ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
36
36
  ingestify/domain/models/event/dispatcher.py,sha256=5WnyUJ7Qzr612btAtl1dMG9JBXDPcsBLyLmW6H7Q1zk,154
37
37
  ingestify/domain/models/event/domain_event.py,sha256=OR6va417j2lisRr0gjQZ9rshAtlys5sVu7KU-W0r0xA,316
38
- ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmceWLstOxiP3-2qU,576
38
+ ingestify/domain/models/event/event_bus.py,sha256=feVXsbBcRNkbWYvXbmz-Yi9-3R690ymc9KkpejkfLxg,911
39
39
  ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
40
40
  ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
41
41
  ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
- ingestify/domain/models/ingestion/ingestion_job.py,sha256=diW4D5ooo2Rj5LHwLuptIuyjD_c0CuwqiBVFiubZk80,14592
42
+ ingestify/domain/models/ingestion/ingestion_job.py,sha256=KaKpAu0XKvWV1YoWaTlOjbapcs-CCAvOHlSxUHZxZwI,15450
43
43
  ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=7dmkWEvE7lMSv1ILWcSvys1bUGuGe_s-YbOFC6eYMBI,4794
44
44
  ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
45
45
  ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
46
- ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
46
+ ingestify/domain/models/resources/dataset_resource.py,sha256=Le_C4nPzPPTDq75_amKSNsR94QvVWdZ_ZkjYIKa6whM,3084
47
47
  ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
48
48
  ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
49
49
  ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
@@ -59,29 +59,21 @@ ingestify/infra/serialization/__init__.py,sha256=UqXWJmKTp7Mi58ZyDASGguPFlqdVWVU
59
59
  ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
60
  ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
61
61
  ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
- ingestify/infra/source/statsbomb_github.py,sha256=IzzrlIRqkChgJp87yW3ugG1my4g_5uMx_xEnoQLWNss,3543
63
- ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nOUGxE,5626
62
+ ingestify/infra/source/statsbomb_github.py,sha256=KHpl3Ojw2ZEkMdyLh1VOLIqrz6blHWldTLpsSgXyf-M,3773
63
+ ingestify/infra/source/statsbomb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
+ ingestify/infra/source/statsbomb/base.py,sha256=f486brtGdK_zPipHAtmVpnp7gcYdPSV28iTUqsBxldA,1155
65
+ ingestify/infra/source/statsbomb/match.py,sha256=8Zpdys6-bB_ral2AmjGKhF4BnXW3F0Y0C5aWnhxcWAY,5525
64
66
  ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
65
67
  ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
68
  ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
67
- ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=vO5usxMbMks7EKPASVCj6bjVld7c9LmlQkWNqLU-Kvs,19916
68
- ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=kALM32jbdeZ4Wn9gON-w2WSb5tH1lIWaBFgn5i29qTk,10635
69
+ ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=cMmhXqUNp_HUg_IgsUlJ439VXX_H67pnivaToUlqlA4,22552
70
+ ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=ffHop9DQeVE9JrCMLJ2EvF7MD7j8thfjVwv2xcsbJtY,10954
69
71
  ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
70
72
  ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
71
73
  ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
72
74
  ingestify/infra/store/file/s3_file_repository.py,sha256=tz_EZ_gun7W2qJMlI3j_R03iKBZlJSDcG7AUJ1JkdpE,1501
73
- ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
- ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
75
- ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
76
- ingestify/static/templates/statsbomb_github/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
77
- ingestify/static/templates/wyscout/.env,sha256=o2kfuDC_seZNIqDscPf2Ww5TGiJmLh_DMOUNykGvs8Q,141
78
- ingestify/static/templates/wyscout/.gitignore,sha256=db0A2IjIeZf5fLLwXKD-bLmC4pETofxm848bljymnNs,13
79
- ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
80
- ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
81
- ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
82
- ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
83
- ingestify-0.6.4.dist-info/METADATA,sha256=g34iFogx4pOE2FYe2wbNZg9TwH_ufGSBOSrodty--NU,18854
84
- ingestify-0.6.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
85
- ingestify-0.6.4.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
86
- ingestify-0.6.4.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
87
- ingestify-0.6.4.dist-info/RECORD,,
75
+ ingestify-0.8.0.dist-info/METADATA,sha256=rpC2ALX0e4Ii-XzhJWmRKfW7YoBgl6gEpP2cUGFlQp4,8089
76
+ ingestify-0.8.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
77
+ ingestify-0.8.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
78
+ ingestify-0.8.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
79
+ ingestify-0.8.0.dist-info/RECORD,,
@@ -1,175 +0,0 @@
1
- import json
2
- from typing import Optional, Dict, List
3
-
4
- import requests
5
-
6
- from ingestify import Source, retrieve_http
7
- from ingestify.domain import DraftFile
8
- from ingestify.exceptions import ConfigurationError
9
-
10
- BASE_URL = "https://apirest.wyscout.com/v3"
11
-
12
-
13
- def wyscout_pager_fn(url, response):
14
- if response["meta"]["page_current"] < response["meta"]["page_count"]:
15
- return f"{url}&page={response['meta']['page_current'] + 1}"
16
- else:
17
- return None
18
-
19
-
20
- class Wyscout(Source):
21
- def discover_selectors(self, dataset_type: str) -> List[Dict]:
22
- raise NotImplementedError("Not implemented for Wyscout")
23
-
24
- provider = "wyscout"
25
-
26
- def __init__(self, name: str, username: str, password: str):
27
- super().__init__(name)
28
-
29
- self.username = username.strip()
30
- self.password = password.strip()
31
-
32
- if not self.username:
33
- raise ConfigurationError(
34
- f"Username of Wyscout source named '{self.name}' cannot be empty"
35
- )
36
-
37
- if not self.password:
38
- raise ConfigurationError(
39
- f"Username of Wyscout source named '{self.name}' cannot be empty"
40
- )
41
-
42
- def _get(self, path: str):
43
- response = requests.get(
44
- BASE_URL + path,
45
- auth=(self.username, self.password),
46
- )
47
- if response.status_code == 400:
48
- # What if the response isn't a json?
49
- error = response.json()["error"]
50
- raise ConfigurationError(
51
- f"Check username/password of Wyscout source named '{self.name}'. API response "
52
- f"was '{error['message']}' ({error['code']})."
53
- )
54
-
55
- response.raise_for_status()
56
- return response.json()
57
-
58
- def _get_paged(self, path: str, data_path: str):
59
- data = []
60
- current_page = 1
61
- page_count = None
62
- while page_count is None or current_page <= page_count:
63
- page_data = self._get(path + f"?page={current_page}&limit=100")
64
- page_count = page_data["meta"]["page_count"]
65
-
66
- data.extend(page_data[data_path])
67
- current_page += 1
68
-
69
- return data
70
-
71
- def discover_datasets(self, dataset_type: str, season_id: int):
72
- matches = self._get(f"/seasons/{season_id}/matches")
73
- datasets = []
74
- for match in matches["matches"]:
75
- dataset = dict(match_id=match["matchId"], version="v3", _metadata=match)
76
- datasets.append(dataset)
77
-
78
- return datasets
79
-
80
- def fetch_dataset_files(
81
- self, dataset_type, identifier, current_version
82
- ) -> Dict[str, Optional[DraftFile]]:
83
- current_files = current_version.modified_files_map if current_version else {}
84
- files = {}
85
-
86
- for filename, url in [
87
- (
88
- "events.json",
89
- f"{BASE_URL}/matches/{identifier.match_id}/events?fetch=teams,players",
90
- ),
91
- ]:
92
- files[filename] = retrieve_http(
93
- url, current_files.get(filename), auth=(self.username, self.password)
94
- )
95
- return files
96
-
97
-
98
- #
99
- # class WyscoutEvent(Wyscout):
100
- # dataset_type = "event"
101
- #
102
- # def discover_datasets(self, season_id: int):
103
- # matches = self._get(f"/seasons/{season_id}/matches")
104
- # datasets = []
105
- # for match in matches["matches"]:
106
- # dataset = dict(match_id=match["matchId"], version="v3", _metadata=match)
107
- # datasets.append(dataset)
108
- #
109
- # return datasets
110
- #
111
- # def fetch_dataset_files(
112
- # self, identifier, current_version
113
- # ) -> Dict[str, Optional[DraftFile]]:
114
- # current_files = current_version.modified_files_map if current_version else {}
115
- # files = {}
116
- #
117
- # for filename, url in [
118
- # (
119
- # "events.json",
120
- # f"{BASE_URL}/matches/{identifier.match_id}/events?fetch=teams,players",
121
- # ),
122
- # ]:
123
- # files[filename] = retrieve_http(
124
- # url, current_files.get(filename), auth=(self.username, self.password)
125
- # )
126
- # return files
127
- #
128
- #
129
- # class WyscoutPlayer(Wyscout):
130
- # dataset_type = "player"
131
- #
132
- # def discover_datasets(self, season_id: int):
133
- # return [
134
- # dict(
135
- # version="v3",
136
- # )
137
- # ]
138
- #
139
- # def fetch_dataset_files(
140
- # self, identifier, current_version
141
- # ) -> Dict[str, Optional[DraftFile]]:
142
- # current_files = current_version.modified_files_map if current_version else {}
143
- #
144
- # return {
145
- # "players.json": retrieve_http(
146
- # f"{BASE_URL}/seasons/{identifier.season_id}/players?limit=100",
147
- # current_files.get("players.json"),
148
- # pager=("players", wyscout_pager_fn),
149
- # auth=(self.username, self.password),
150
- # )
151
- # }
152
-
153
-
154
- if __name__ == "__main__":
155
- import dotenv, os
156
-
157
- dotenv.load_dotenv()
158
-
159
- kilmarnock_id = 8516
160
- competition_id = 750
161
- season_id = 188105
162
- match_id = 5459107
163
- player_id = 840543
164
-
165
- data = requests.get(
166
- f"{BASE_URL}/competitions/{competition_id}/players",
167
- # f"{BASE_URL}/players/{player_id}/career",
168
- # f"{BASE_URL}/matches/{match_id}/advancedstats/players",
169
- # f"{BASE_URL}/competitions/{competition_id}/matches", # teams/{kilmarnock_id}/advancedstats?compId={competition_id}",
170
- # f"{BASE_URL}/teams/{kilmarnock_id}/squad", #teams/{kilmarnock_id}/advancedstats?compId={competition_id}",
171
- auth=(os.environ["WYSCOUT_USERNAME"], os.environ["WYSCOUT_PASSWORD"]),
172
- ).json()
173
- from pprint import pprint
174
-
175
- pprint(data)
@@ -1,19 +0,0 @@
1
- ingestify_version: {{ ingestify_version }}
2
-
3
- main:
4
- dataset_url: sqlite:///database/catalog.db
5
- file_url: file://database/files/
6
- default_bucket: main
7
-
8
- sources:
9
- statsbomb:
10
- type: ingestify.statsbomb_github
11
-
12
- extract_jobs:
13
- - source: statsbomb
14
- selectors:
15
- - competition_id: 11
16
- season_id: [42, 90]
17
-
18
- # passing an empty selector means: fetch everything
19
- # -
@@ -1 +0,0 @@
1
- # This will contain the database
@@ -1,14 +0,0 @@
1
- from ingestify.main import get_datastore
2
-
3
-
4
- def main():
5
- store = get_datastore("config.yaml")
6
- dataset_collection = store.get_dataset_collection()
7
-
8
- for dataset in dataset_collection:
9
- kloppy_dataset = store.load_with_kloppy(dataset)
10
- print(f"Loaded dataset with {len(kloppy_dataset.records)} events")
11
-
12
-
13
- if __name__ == "__main__":
14
- main()
@@ -1,5 +0,0 @@
1
- # Template .env file from Ingestify
2
- # You should not add this file to a version control system like git
3
-
4
- WYSCOUT_USERNAME=
5
- WYSCOUT_PASSWORD=
@@ -1,2 +0,0 @@
1
- .env
2
- database
File without changes
@@ -1,18 +0,0 @@
1
- ingestify_version: {{ ingestify_version }}
2
-
3
- main:
4
- dataset_url: sqlite:///database/catalog.db
5
- file_url: file://database/files/
6
- default_bucket: main
7
-
8
- sources:
9
- wyscout:
10
- type: ingestify.wyscout
11
- configuration:
12
- username: !ENV ${WYSCOUT_USERNAME}
13
- password: !ENV ${WYSCOUT_PASSWORD}
14
-
15
- extract_jobs:
16
- - source: wyscout
17
- selectors:
18
- - season_id: 188105
@@ -1 +0,0 @@
1
- # This will contain the database