ingestify 0.6.4__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +2 -1
- ingestify/application/dataset_store.py +228 -11
- ingestify/application/ingestion_engine.py +232 -7
- ingestify/application/loader.py +163 -28
- ingestify/cmdline.py +0 -48
- ingestify/domain/models/__init__.py +2 -0
- ingestify/domain/models/dataset/collection.py +0 -9
- ingestify/domain/models/dataset/dataset_repository.py +4 -0
- ingestify/domain/models/dataset/dataset_state.py +5 -0
- ingestify/domain/models/dataset/events.py +13 -0
- ingestify/domain/models/dataset/file.py +7 -1
- ingestify/domain/models/dataset/selector.py +8 -1
- ingestify/domain/models/event/event_bus.py +16 -1
- ingestify/domain/models/ingestion/ingestion_job.py +23 -4
- ingestify/domain/models/resources/dataset_resource.py +0 -1
- ingestify/infra/source/statsbomb/base.py +36 -0
- ingestify/infra/source/statsbomb/match.py +137 -0
- ingestify/infra/source/statsbomb_github.py +46 -44
- ingestify/infra/store/dataset/sqlalchemy/repository.py +77 -10
- ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
- ingestify/main.py +190 -10
- ingestify/utils.py +2 -32
- ingestify-0.8.0.dist-info/METADATA +257 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/RECORD +28 -36
- ingestify/infra/source/wyscout.py +0 -175
- ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
- ingestify/static/templates/statsbomb_github/database/README.md +0 -1
- ingestify/static/templates/statsbomb_github/query.py +0 -14
- ingestify/static/templates/wyscout/.env +0 -5
- ingestify/static/templates/wyscout/.gitignore +0 -2
- ingestify/static/templates/wyscout/README.md +0 -0
- ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
- ingestify/static/templates/wyscout/database/README.md +0 -1
- ingestify/static/templates/wyscout/query.py +0 -14
- ingestify-0.6.4.dist-info/METADATA +0 -266
- /ingestify/{static/templates/statsbomb_github/README.md → infra/source/statsbomb/__init__.py} +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/WHEEL +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/top_level.txt +0 -0
ingestify/utils.py
CHANGED
@@ -5,13 +5,11 @@ import re
 import traceback
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
-from multiprocessing import get_context, cpu_count, get_all_start_methods

 from datetime import datetime, timezone
 from string import Template
 from typing import Dict, Tuple, Optional, Any, List

-import cloudpickle
 from pydantic import Field
 from typing_extensions import Self

@@ -75,8 +73,8 @@ class AttributeBag:
         return Template(string).substitute(**self.attributes)

     def matches(self, attributes: Dict) -> bool:
-        for k, v in
-        if attributes.
+        for k, v in attributes.items():
+            if k in self.attributes and self.attributes[k] != v:
                 return False
         return True

@@ -110,34 +108,6 @@ class AttributeBag:
         )


-def cloud_unpack_and_call(args):
-    f_pickled, org_args = args
-
-    f = cloudpickle.loads(f_pickled)
-    return f(org_args)
-
-
-def map_in_pool(func, iterable, processes=0):
-    # TODO: move to cmdline
-    if os.environ.get("INGESTIFY_RUN_EAGER") == "true":
-        return list(map(func, iterable))
-
-    if not processes:
-        processes = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
-
-    if "fork" in get_all_start_methods():
-        ctx = get_context("fork")
-    else:
-        ctx = get_context("spawn")
-
-    wrapped_fn = cloudpickle.dumps(func)
-
-    with ctx.Pool(processes or cpu_count()) as pool:
-        return pool.map(
-            cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
-        )
-
-
 class SyncExecutor:
     def map(self, func, iterable):
         return [func(item) for item in iterable]
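The corrected `matches()` above is easy to misread, so here is a minimal standalone sketch (an editor's illustration, not part of the package diff) of the same logic: a key only disqualifies a match when it is present on both sides with conflicting values, while keys absent from the bag are ignored.

```python
from typing import Dict

class Bag:
    """Stand-in for ingestify's AttributeBag, reduced to the matches() logic."""

    def __init__(self, **attributes):
        self.attributes = attributes

    def matches(self, attributes: Dict) -> bool:
        # Only a key present on both sides with different values disqualifies.
        for k, v in attributes.items():
            if k in self.attributes and self.attributes[k] != v:
                return False
        return True

bag = Bag(competition_id=11, season_id=90)
assert bag.matches({"competition_id": 11})      # same value -> True
assert not bag.matches({"competition_id": 12})  # conflicting value -> False
assert bag.matches({"match_id": 123})           # key not on the bag -> True
```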
ingestify-0.8.0.dist-info/METADATA
ADDED
@@ -0,0 +1,257 @@
+Metadata-Version: 2.1
+Name: ingestify
+Version: 0.8.0
+Summary: Data Ingestion Framework
+Author: Koen Vossen
+Author-email: info@koenvossen.nl
+License: AGPL
+Description-Content-Type: text/markdown
+Requires-Dist: requests<3,>=2.0.0
+Requires-Dist: SQLAlchemy<3,>=2
+Requires-Dist: click>=8
+Requires-Dist: python-dotenv
+Requires-Dist: pyaml-env
+Requires-Dist: boto3
+Requires-Dist: pydantic>=2.0.0
+Provides-Extra: test
+Requires-Dist: pytest<7,>=6.2.5; extra == "test"
+Requires-Dist: pytz; extra == "test"
+
+# Ingestify
+
+_Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
+
+---
+
+## Why Ingestify?
+
+Football‑data APIs are often **slow**, **rate‑limited** or just **down**. One parsing bug and you're forced to pull tens of gigabytes again.
+Ingestify fixes that by building **your own data lake** of untouched provider files and fetching only what's new:
+
+* **Own your lake** – The first time you ask for a match, Ingestify downloads the original files (metadata, line‑ups, events, tracking, video) and stores them untouched on local disk, S3, GCS… every later query hits *your* lake, not the provider.
+* **Never re‑fetch the world** – A file‑level checksum / timestamp check moves only changed bundles across the wire.
+* **Atomic, complete packages** – A *Dataset* is all‑or‑nothing:
+
+| Dataset type | Always contains |
+|--------------|-----------------|
+| **Match Dataset** | metadata + line‑ups + events |
+| **Tracking Dataset** | metadata + raw tracking frames |
+
+You never analyse events v2 with lineups v1, or yesterday's first half with today's second half.
+* **Query while ingesting** – Datasets stream out of the engine the moment their files land, so notebooks or downstream services can start before the full season is in.
+
+---
+
+## The Ingestify Workflow
+<img src="https://raw.githubusercontent.com/PySport/ingestify/refs/heads/main/docs/overview.svg" />
+
+---
+
+## What you gain
+
+### For football‑analytics practitioners
+
+| Pain | Ingestify fix |
+|------|---------------|
+| API slowness / downtime | One request → lake; retries and parallelism happen behind the scenes. |
+| Full re‑ingest after a bug | File‑level deltas mean you fetch only the corrected bundles. |
+| Partial / drifting data | A Dataset is atomic, versioned, and validated before it becomes visible. |
+| Waiting hours for a season to sync | Stream each Dataset as soon as it lands; analyse while you ingest. |
+| Boilerplate joins | `engine.load_dataset_with_kloppy(dataset)` → analysis‑ready object. |
+
+### For software engineers
+
+| Need | How Ingestify helps |
+|------|---------------------|
+| **Domain‑Driven Design** | `Dataset`, `Revision`, `Selector` plus rich domain events read like the problem space. |
+| **Event‑driven integrations** | Subscribe to `RevisionAdded` and push to Kafka, AWS Lambda, Airflow… |
+| **Pluggable everything** | Swap `Source`, `FetchPolicy`, `DatasetStore` subclasses to add providers, change delta logic, or move storage back‑ends. |
+| **Safety & speed** | Multiprocessing downloader with temp‑dir commits – no half‑written matches; near‑linear I/O speed‑ups. |
+| **Any file type** | JSON, CSV, MP4, proprietary binaries – stored verbatim so you parse / transcode later under version control. |
+
+---
+
+## Quick start
+
+```bash
+pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
+```
+
+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to a production config!
+
+### Minimal `config.yaml`
+
+```yaml
+main:
+  metadata_url: sqlite:///database/catalog.db # where revision metadata lives
+  file_url: file://database/files/ # where raw files live
+  default_bucket: main
+
+sources:
+  statsbomb:
+    type: ingestify.statsbomb_github # open‑data provider
+
+ingestion_plans:
+  - source: statsbomb
+    dataset_type: match
+    # selectors can narrow the scope
+    # selectors:
+    #   - competition_id: 11
+    #     season_id: [90]
+```
+
+### First ingest
+
+When you have configured event subscribers, all domain events are dispatched to them. Publishing the events to
+Kafka, RabbitMQ or any other system becomes trivial.
+
+```bash
+mkdir -p database
+pip install kloppy
+
+ingestify run # fills your data lake
+```
+
+---
+
+## Using the data
+
+By default, Ingestify searches your DatasetStore when you request data. You can pass several filters to fetch only what you need.
+
+```python
+from ingestify.main import get_engine
+
+engine = get_engine("config.yaml")
+
+for dataset in engine.iter_datasets(
+    dataset_state="complete",
+    provider="statsbomb",
+    dataset_type="match",
+    competition_id=11,
+    season_id=90):
+    df = (
+        engine
+        .load_dataset_with_kloppy(dataset)
+        .to_df(engine="polars")
+    )
+    df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
+```
+
+#### Auto Ingestion
+
+When you don't want to use an event-driven architecture but just want to work with the latest data, Ingestify has you covered. With the `auto_ingest` option, Ingestify syncs the data in the background when you ask for the data.
+
+
+```python
+from ingestify.main import get_engine
+
+engine = get_engine("config.yaml")
+
+for dataset in engine.iter_datasets(
+    # When set to True it will first do a full sync and then start yielding datasets
+    auto_ingest=True,
+
+    # With streaming enabled all Datasets are yielded when they are up-to-date (not changed, or refetched)
+    # auto_ingest={"streaming": True}
+
+    dataset_state="complete",
+    provider="statsbomb",
+    dataset_type="match",
+    competition_id=11,
+    season_id=90):
+    df = (
+        engine
+        .load_dataset_with_kloppy(dataset)
+        .to_df(engine="polars")
+    )
+    df.write_parquet(f"out/{dataset.identifier['match_id']}.parquet")
+```
+
+#### Open data
+
+Ingestify has built-in support for StatsBomb Open Data (more to come).
+
+```shell
+mkdir database_open_data
+pip install kloppy
+```
+
+```python
+import logging, sys
+
+from ingestify.main import get_engine
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
+engine = get_engine(
+    metadata_url="sqlite:///database_open_data/catalog.db",
+    file_url="file://database_open_data/files/"
+)
+
+dataset_iter = engine.iter_datasets(
+    # This will tell Ingestify to look for an Open Data provider
+    auto_ingest={"use_open_data": True, "streaming": True},
+
+    provider="statsbomb",
+    dataset_type="match",
+    competition_id=43,  # "FIFA World Cup"
+    # season_id=281
+)
+
+for dataset in dataset_iter:
+    kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
+```
+
+
+---
+
+## Roadmap
+
+* Workflow orchestration helpers (Airflow, Dagster, Prefect)
+* Built‑in Kafka / Kinesis event emitters
+* Streaming data ingestion
+* Data quality hooks (Great Expectations)
+
+---
+
+**Stop refetching the world. Own your data lake, keep it version‑safe, and analyse football faster with Ingestify.**
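The README above says domain events are dispatched to configured subscribers (e.g. `RevisionAdded` pushed to Kafka or RabbitMQ), but this diff never shows the subscriber interface itself. The sketch below is therefore only an editor's illustration of the pattern under assumed names: `EventBus`, its methods, and the event fields are stand-ins, not ingestify's real API (only the event name `RevisionAdded` and the existence of `event_bus.py` / `subscriber.py` are confirmed by the diff).

```python
from collections import defaultdict
from dataclasses import dataclass

@dataclass
class RevisionAdded:  # real event name from the README; these fields are illustrative
    dataset_id: str
    revision_id: int

class EventBus:  # stand-in for ingestify's event bus, not its real interface
    def __init__(self):
        self._subscribers = defaultdict(list)  # event type -> list of handlers

    def register(self, event_type, handler):
        self._subscribers[event_type].append(handler)

    def dispatch(self, event):
        # Every handler registered for this event type receives the event.
        for handler in self._subscribers[type(event)]:
            handler(event)

def push_to_queue(event):
    # A real subscriber would call a Kafka/RabbitMQ producer here.
    print(f"publish: revision {event.revision_id} added to {event.dataset_id}")

bus = EventBus()
bus.register(RevisionAdded, push_to_queue)
bus.dispatch(RevisionAdded(dataset_id="match/1234", revision_id=2))
```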
{ingestify-0.6.4.dist-info → ingestify-0.8.0.dist-info}/RECORD
CHANGED
@@ -1,17 +1,17 @@
-ingestify/__init__.py,sha256=
-ingestify/cmdline.py,sha256=
+ingestify/__init__.py,sha256=FeK7pau-iTc6ooJiPelblIhkrPLojVHKpTHXIrkdpq8,336
+ingestify/cmdline.py,sha256=Rs1_lSKSIJrcygH5fvtOGicOl_e0sZYW7deqp4_jGbY,6233
 ingestify/exceptions.py,sha256=izRzaLQmMy-4P8ZqGqVZyf4k6LFYOYqwYLuRaUH8BJw,187
-ingestify/main.py,sha256=
+ingestify/main.py,sha256=WjhcsT21F7dOibrg_S7wRiui6Ytj5ScsWqMCGuv9fs8,14938
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=
+ingestify/utils.py,sha256=tsoo-GgeSrwK161WCqW793BAm5bjvnGwI8yGgLTJ1lk,6486
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/application/dataset_store.py,sha256=
-ingestify/application/ingestion_engine.py,sha256=
-ingestify/application/loader.py,sha256=
+ingestify/application/dataset_store.py,sha256=GP6wGjVirefEn6hlqWIkOBqdELad9L_mmTpdHdzj18M,20353
+ingestify/application/ingestion_engine.py,sha256=we16yiDS9QGOlAUiP1vidDycihjWK3B2jo64uqKmrXE,11246
+ingestify/application/loader.py,sha256=K99ZJuHMEJFO6CIlxoyHKGSQtXw63JgOYu3moUD6sR0,13400
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
-ingestify/domain/models/__init__.py,sha256=
+ingestify/domain/models/__init__.py,sha256=WuKS34uiR1EwyczKujBHYGupqseJP-U2P5IQS4kpsA8,838
 ingestify/domain/models/base.py,sha256=4gKbREajxJHlS-VwKoosNtHVupZ4eDLKMqnJ4ib0aS8,184
 ingestify/domain/models/data_spec_version_collection.py,sha256=x5BvBnVI9QAfqhjCrUK19HKAiujdU1m8PkbQZwDheFU,1338
 ingestify/domain/models/fetch_policy.py,sha256=I-DnIHI_0bYlD0vpKJ58Z6he85pXvjdqXkVQA8axJ8Y,1461
@@ -19,31 +19,31 @@ ingestify/domain/models/sink.py,sha256=OBVfFMpB7puJmHg4q2KYx4qgoAnlmX8xKWYnPi8a9
 ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvfaYg4,973
 ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0TRHlA,388
 ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
-ingestify/domain/models/dataset/collection.py,sha256=
+ingestify/domain/models/dataset/collection.py,sha256=YKGQv6hqm88MYlNp2c47CoWysyNZWCeZwTkwCVpQcaE,1055
 ingestify/domain/models/dataset/collection_metadata.py,sha256=aWY6O3_JLj_jKfVfUTjmi3-E4heBmmmtqX81vhdzr0I,498
 ingestify/domain/models/dataset/dataset.py,sha256=OiP03nY0-m06y2GTrs_m-RiZE8HwypIHRwSqoM_DNnQ,4049
-ingestify/domain/models/dataset/dataset_repository.py,sha256=
-ingestify/domain/models/dataset/dataset_state.py,sha256=
-ingestify/domain/models/dataset/events.py,sha256=
-ingestify/domain/models/dataset/file.py,sha256=
+ingestify/domain/models/dataset/dataset_repository.py,sha256=bf3F_1cKw0CvUberD3FMROE8iowAmYefnD4L6aPB39k,989
+ingestify/domain/models/dataset/dataset_state.py,sha256=IaYG02WzgooGaM_AuwRhZgljs-9NhCF_LpBZXkl5ELY,324
+ingestify/domain/models/dataset/events.py,sha256=M8jrHWCm9iXapAy3xjvZZtiiOxXDnfefBixiMwkas24,786
+ingestify/domain/models/dataset/file.py,sha256=cXDjSw19HRMCGFpVN4u1oejxE1V8SMQptfNVDVixj6o,4464
 ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
 ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
 ingestify/domain/models/dataset/revision.py,sha256=e8-NsRS8AILrNjWqCxqANF55oY091CN3fBmIiVS9wz0,2049
-ingestify/domain/models/dataset/selector.py,sha256=
+ingestify/domain/models/dataset/selector.py,sha256=qGRA22gDAHhjDAhMWzOjZPz3Rrs1V-DZ32z75NARoTQ,1448
 ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
 ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
 ingestify/domain/models/event/dispatcher.py,sha256=5WnyUJ7Qzr612btAtl1dMG9JBXDPcsBLyLmW6H7Q1zk,154
 ingestify/domain/models/event/domain_event.py,sha256=OR6va417j2lisRr0gjQZ9rshAtlys5sVu7KU-W0r0xA,316
-ingestify/domain/models/event/event_bus.py,sha256=
+ingestify/domain/models/event/event_bus.py,sha256=feVXsbBcRNkbWYvXbmz-Yi9-3R690ymc9KkpejkfLxg,911
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
 ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/domain/models/ingestion/ingestion_job.py,sha256=
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=KaKpAu0XKvWV1YoWaTlOjbapcs-CCAvOHlSxUHZxZwI,15450
 ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=7dmkWEvE7lMSv1ILWcSvys1bUGuGe_s-YbOFC6eYMBI,4794
 ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
-ingestify/domain/models/resources/dataset_resource.py,sha256=
+ingestify/domain/models/resources/dataset_resource.py,sha256=Le_C4nPzPPTDq75_amKSNsR94QvVWdZ_ZkjYIKa6whM,3084
 ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
 ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
 ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
@@ -59,29 +59,21 @@ ingestify/infra/serialization/__init__.py,sha256=UqXWJmKTp7Mi58ZyDASGguPFlqdVWVU
 ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
 ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/infra/source/statsbomb_github.py,sha256=
-ingestify/infra/source/
+ingestify/infra/source/statsbomb_github.py,sha256=KHpl3Ojw2ZEkMdyLh1VOLIqrz6blHWldTLpsSgXyf-M,3773
+ingestify/infra/source/statsbomb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestify/infra/source/statsbomb/base.py,sha256=f486brtGdK_zPipHAtmVpnp7gcYdPSV28iTUqsBxldA,1155
+ingestify/infra/source/statsbomb/match.py,sha256=8Zpdys6-bB_ral2AmjGKhF4BnXW3F0Y0C5aWnhxcWAY,5525
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
 ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=
-ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=cMmhXqUNp_HUg_IgsUlJ439VXX_H67pnivaToUlqlA4,22552
+ingestify/infra/store/dataset/sqlalchemy/tables.py,sha256=ffHop9DQeVE9JrCMLJ2EvF7MD7j8thfjVwv2xcsbJtY,10954
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
 ingestify/infra/store/file/dummy_file_repository.py,sha256=azUq9c43Mz9-GWk9j0E97BaqyUKu-ZMrcuaIednLq5E,723
 ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
 ingestify/infra/store/file/s3_file_repository.py,sha256=tz_EZ_gun7W2qJMlI3j_R03iKBZlJSDcG7AUJ1JkdpE,1501
-ingestify/
-ingestify
-ingestify/
-ingestify/
-ingestify/
-ingestify/static/templates/wyscout/.gitignore,sha256=db0A2IjIeZf5fLLwXKD-bLmC4pETofxm848bljymnNs,13
-ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
-ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
-ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.6.4.dist-info/METADATA,sha256=g34iFogx4pOE2FYe2wbNZg9TwH_ufGSBOSrodty--NU,18854
-ingestify-0.6.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
-ingestify-0.6.4.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
-ingestify-0.6.4.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
-ingestify-0.6.4.dist-info/RECORD,,
+ingestify-0.8.0.dist-info/METADATA,sha256=rpC2ALX0e4Ii-XzhJWmRKfW7YoBgl6gEpP2cUGFlQp4,8089
+ingestify-0.8.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.8.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.8.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.8.0.dist-info/RECORD,,
ingestify/infra/source/wyscout.py
DELETED
@@ -1,175 +0,0 @@
-import json
-from typing import Optional, Dict, List
-
-import requests
-
-from ingestify import Source, retrieve_http
-from ingestify.domain import DraftFile
-from ingestify.exceptions import ConfigurationError
-
-BASE_URL = "https://apirest.wyscout.com/v3"
-
-
-def wyscout_pager_fn(url, response):
-    if response["meta"]["page_current"] < response["meta"]["page_count"]:
-        return f"{url}&page={response['meta']['page_current'] + 1}"
-    else:
-        return None
-
-
-class Wyscout(Source):
-    def discover_selectors(self, dataset_type: str) -> List[Dict]:
-        raise NotImplementedError("Not implemented for Wyscout")
-
-    provider = "wyscout"
-
-    def __init__(self, name: str, username: str, password: str):
-        super().__init__(name)
-
-        self.username = username.strip()
-        self.password = password.strip()
-
-        if not self.username:
-            raise ConfigurationError(
-                f"Username of Wyscout source named '{self.name}' cannot be empty"
-            )
-
-        if not self.password:
-            raise ConfigurationError(
-                f"Username of Wyscout source named '{self.name}' cannot be empty"
-            )
-
-    def _get(self, path: str):
-        response = requests.get(
-            BASE_URL + path,
-            auth=(self.username, self.password),
-        )
-        if response.status_code == 400:
-            # What if the response isn't a json?
-            error = response.json()["error"]
-            raise ConfigurationError(
-                f"Check username/password of Wyscout source named '{self.name}'. API response "
-                f"was '{error['message']}' ({error['code']})."
-            )
-
-        response.raise_for_status()
-        return response.json()
-
-    def _get_paged(self, path: str, data_path: str):
-        data = []
-        current_page = 1
-        page_count = None
-        while page_count is None or current_page <= page_count:
-            page_data = self._get(path + f"?page={current_page}&limit=100")
-            page_count = page_data["meta"]["page_count"]
-
-            data.extend(page_data[data_path])
-            current_page += 1
-
-        return data
-
-    def discover_datasets(self, dataset_type: str, season_id: int):
-        matches = self._get(f"/seasons/{season_id}/matches")
-        datasets = []
-        for match in matches["matches"]:
-            dataset = dict(match_id=match["matchId"], version="v3", _metadata=match)
-            datasets.append(dataset)
-
-        return datasets
-
-    def fetch_dataset_files(
-        self, dataset_type, identifier, current_version
-    ) -> Dict[str, Optional[DraftFile]]:
-        current_files = current_version.modified_files_map if current_version else {}
-        files = {}
-
-        for filename, url in [
-            (
-                "events.json",
-                f"{BASE_URL}/matches/{identifier.match_id}/events?fetch=teams,players",
-            ),
-        ]:
-            files[filename] = retrieve_http(
-                url, current_files.get(filename), auth=(self.username, self.password)
-            )
-        return files
-
-
-#
-# class WyscoutEvent(Wyscout):
-#     dataset_type = "event"
-#
-#     def discover_datasets(self, season_id: int):
-#         matches = self._get(f"/seasons/{season_id}/matches")
-#         datasets = []
-#         for match in matches["matches"]:
-#             dataset = dict(match_id=match["matchId"], version="v3", _metadata=match)
-#             datasets.append(dataset)
-#
-#         return datasets
-#
-#     def fetch_dataset_files(
-#         self, identifier, current_version
-#     ) -> Dict[str, Optional[DraftFile]]:
-#         current_files = current_version.modified_files_map if current_version else {}
-#         files = {}
-#
-#         for filename, url in [
-#             (
-#                 "events.json",
-#                 f"{BASE_URL}/matches/{identifier.match_id}/events?fetch=teams,players",
-#             ),
-#         ]:
-#             files[filename] = retrieve_http(
-#                 url, current_files.get(filename), auth=(self.username, self.password)
-#             )
-#         return files
-#
-#
-# class WyscoutPlayer(Wyscout):
-#     dataset_type = "player"
-#
-#     def discover_datasets(self, season_id: int):
-#         return [
-#             dict(
-#                 version="v3",
-#             )
-#         ]
-#
-#     def fetch_dataset_files(
-#         self, identifier, current_version
-#     ) -> Dict[str, Optional[DraftFile]]:
-#         current_files = current_version.modified_files_map if current_version else {}
-#
-#         return {
-#             "players.json": retrieve_http(
-#                 f"{BASE_URL}/seasons/{identifier.season_id}/players?limit=100",
-#                 current_files.get("players.json"),
-#                 pager=("players", wyscout_pager_fn),
-#                 auth=(self.username, self.password),
-#             )
-#         }
-
-
-if __name__ == "__main__":
-    import dotenv, os
-
-    dotenv.load_dotenv()
-
-    kilmarnock_id = 8516
-    competition_id = 750
-    season_id = 188105
-    match_id = 5459107
-    player_id = 840543
-
-    data = requests.get(
-        f"{BASE_URL}/competitions/{competition_id}/players",
-        # f"{BASE_URL}/players/{player_id}/career",
-        # f"{BASE_URL}/matches/{match_id}/advancedstats/players",
-        # f"{BASE_URL}/competitions/{competition_id}/matches",  # teams/{kilmarnock_id}/advancedstats?compId={competition_id}",
-        # f"{BASE_URL}/teams/{kilmarnock_id}/squad", #teams/{kilmarnock_id}/advancedstats?compId={competition_id}",
-        auth=(os.environ["WYSCOUT_USERNAME"], os.environ["WYSCOUT_PASSWORD"]),
-    ).json()
-    from pprint import pprint
-
-    pprint(data)
ingestify/static/templates/statsbomb_github/config.yaml.jinja2
DELETED
@@ -1,19 +0,0 @@
-ingestify_version: {{ ingestify_version }}
-
-main:
-  dataset_url: sqlite:///database/catalog.db
-  file_url: file://database/files/
-  default_bucket: main
-
-sources:
-  statsbomb:
-    type: ingestify.statsbomb_github
-
-extract_jobs:
-  - source: statsbomb
-    selectors:
-      - competition_id: 11
-        season_id: [42, 90]
-
-    # passing an empty selector means: fetch everything
-    # -
ingestify/static/templates/statsbomb_github/database/README.md
DELETED
@@ -1 +0,0 @@
-# This will contain the database
ingestify/static/templates/statsbomb_github/query.py
DELETED
@@ -1,14 +0,0 @@
-from ingestify.main import get_datastore
-
-
-def main():
-    store = get_datastore("config.yaml")
-    dataset_collection = store.get_dataset_collection()
-
-    for dataset in dataset_collection:
-        kloppy_dataset = store.load_with_kloppy(dataset)
-        print(f"Loaded dataset with {len(kloppy_dataset.records)} events")
-
-
-if __name__ == "__main__":
-    main()
File without changes
ingestify/static/templates/wyscout/config.yaml.jinja2
DELETED
@@ -1,18 +0,0 @@
-ingestify_version: {{ ingestify_version }}
-
-main:
-  dataset_url: sqlite:///database/catalog.db
-  file_url: file://database/files/
-  default_bucket: main
-
-sources:
-  wyscout:
-    type: ingestify.wyscout
-    configuration:
-      username: !ENV ${WYSCOUT_USERNAME}
-      password: !ENV ${WYSCOUT_PASSWORD}
-
-extract_jobs:
-  - source: wyscout
-    selectors:
-      - season_id: 188105
ingestify/static/templates/wyscout/database/README.md
DELETED
@@ -1 +0,0 @@
-# This will contain the database