ingestify-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. ingestify/__init__.py +11 -0
  2. ingestify/application/__init__.py +0 -0
  3. ingestify/application/dataset_store.py +339 -0
  4. ingestify/application/ingestion_engine.py +62 -0
  5. ingestify/application/loader.py +329 -0
  6. ingestify/application/secrets_manager.py +53 -0
  7. ingestify/cmdline.py +283 -0
  8. ingestify/domain/__init__.py +2 -0
  9. ingestify/domain/models/__init__.py +45 -0
  10. ingestify/domain/models/data_spec_version_collection.py +33 -0
  11. ingestify/domain/models/dataset/__init__.py +27 -0
  12. ingestify/domain/models/dataset/collection.py +44 -0
  13. ingestify/domain/models/dataset/collection_metadata.py +13 -0
  14. ingestify/domain/models/dataset/dataset.py +104 -0
  15. ingestify/domain/models/dataset/dataset_repository.py +46 -0
  16. ingestify/domain/models/dataset/events.py +31 -0
  17. ingestify/domain/models/dataset/file.py +146 -0
  18. ingestify/domain/models/dataset/file_collection.py +35 -0
  19. ingestify/domain/models/dataset/file_repository.py +59 -0
  20. ingestify/domain/models/dataset/identifier.py +24 -0
  21. ingestify/domain/models/dataset/revision.py +29 -0
  22. ingestify/domain/models/dataset/selector.py +37 -0
  23. ingestify/domain/models/event/__init__.py +4 -0
  24. ingestify/domain/models/event/_old_event.py +21 -0
  25. ingestify/domain/models/event/dispatcher.py +8 -0
  26. ingestify/domain/models/event/domain_event.py +10 -0
  27. ingestify/domain/models/event/event_bus.py +24 -0
  28. ingestify/domain/models/event/publisher.py +23 -0
  29. ingestify/domain/models/event/subscriber.py +39 -0
  30. ingestify/domain/models/extract_job.py +23 -0
  31. ingestify/domain/models/fetch_policy.py +40 -0
  32. ingestify/domain/models/resources/__init__.py +1 -0
  33. ingestify/domain/models/resources/dataset_resource.py +99 -0
  34. ingestify/domain/models/sink.py +16 -0
  35. ingestify/domain/models/source.py +34 -0
  36. ingestify/domain/models/task/__init__.py +4 -0
  37. ingestify/domain/models/task/set.py +21 -0
  38. ingestify/domain/models/task/task.py +7 -0
  39. ingestify/domain/services/__init__.py +0 -0
  40. ingestify/domain/services/transformers/__init__.py +0 -0
  41. ingestify/domain/services/transformers/kloppy_to_pandas.py +25 -0
  42. ingestify/exceptions.py +10 -0
  43. ingestify/infra/__init__.py +4 -0
  44. ingestify/infra/fetch/__init__.py +0 -0
  45. ingestify/infra/fetch/http.py +100 -0
  46. ingestify/infra/serialization/__init__.py +50 -0
  47. ingestify/infra/sink/__init__.py +0 -0
  48. ingestify/infra/sink/postgresql.py +50 -0
  49. ingestify/infra/source/__init__.py +0 -0
  50. ingestify/infra/source/statsbomb_github.py +92 -0
  51. ingestify/infra/source/wyscout.py +175 -0
  52. ingestify/infra/store/__init__.py +2 -0
  53. ingestify/infra/store/dataset/__init__.py +2 -0
  54. ingestify/infra/store/dataset/local_dataset_repository.py +73 -0
  55. ingestify/infra/store/dataset/sqlalchemy/__init__.py +1 -0
  56. ingestify/infra/store/dataset/sqlalchemy/mapping.py +153 -0
  57. ingestify/infra/store/dataset/sqlalchemy/repository.py +239 -0
  58. ingestify/infra/store/file/__init__.py +2 -0
  59. ingestify/infra/store/file/local_file_repository.py +32 -0
  60. ingestify/infra/store/file/s3_file_repository.py +50 -0
  61. ingestify/main.py +205 -0
  62. ingestify/server.py +78 -0
  63. ingestify/source_base.py +23 -0
  64. ingestify/static/templates/statsbomb_github/README.md +0 -0
  65. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +19 -0
  66. ingestify/static/templates/statsbomb_github/database/README.md +1 -0
  67. ingestify/static/templates/statsbomb_github/query.py +14 -0
  68. ingestify/static/templates/wyscout/.env +5 -0
  69. ingestify/static/templates/wyscout/.gitignore +2 -0
  70. ingestify/static/templates/wyscout/README.md +0 -0
  71. ingestify/static/templates/wyscout/config.yaml.jinja2 +18 -0
  72. ingestify/static/templates/wyscout/database/README.md +1 -0
  73. ingestify/static/templates/wyscout/query.py +14 -0
  74. ingestify/utils.py +276 -0
  75. ingestify-0.1.0.dist-info/METADATA +265 -0
  76. ingestify-0.1.0.dist-info/RECORD +79 -0
  77. ingestify-0.1.0.dist-info/WHEEL +5 -0
  78. ingestify-0.1.0.dist-info/entry_points.txt +2 -0
  79. ingestify-0.1.0.dist-info/top_level.txt +1 -0

ingestify/infra/source/statsbomb_github.py
@@ -0,0 +1,92 @@
+ import json
+ from datetime import datetime
+
+ import requests
+
+ from ingestify import Source, retrieve_http
+ from ingestify.domain import DraftFile
+ from ingestify.domain.models.dataset.dataset import DatasetState
+
+ BASE_URL = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
+
+
+ class StatsbombGithub(Source):
+     provider = "statsbomb"
+
+     def discover_selectors(self, dataset_type: str, data_spec_versions: None = None):
+         assert dataset_type == "match"
+
+         competitions = requests.get(f"{BASE_URL}/competitions.json").json()
+         return [
+             dict(
+                 competition_id=competition["competition_id"],
+                 season_id=competition["season_id"],
+             )
+             for competition in competitions
+         ]
+
+     def discover_datasets(
+         self,
+         dataset_type,
+         competition_id: str = None,
+         season_id: str = None,
+         data_spec_versions=None,
+     ):
+         assert dataset_type == "match"
+
+         datasets = []
+
+         matches = requests.get(
+             f"{BASE_URL}/matches/{competition_id}/{season_id}.json"
+         ).json()
+
+         for match in matches:
+             last_updated = match["last_updated"]
+             if "Z" not in last_updated:
+                 # Assume UTC
+                 last_updated += "Z"
+
+             last_modified = datetime.fromisoformat(last_updated.replace("Z", "+00:00"))
+
+             dataset = dict(
+                 competition_id=competition_id,
+                 season_id=season_id,
+                 match_id=match["match_id"],
+                 _last_modified=last_modified,
+                 _match=match,
+                 _metadata=match,
+                 _state=DatasetState.COMPLETE,
+             )
+             datasets.append(dataset)
+         return datasets
+
+     def fetch_dataset_files(
+         self, dataset_type, identifier, current_revision, data_spec_versions
+     ):
+         assert dataset_type == "match"
+
+         current_files = current_revision.modified_files_map if current_revision else {}
+         files = {}
+         for filename, url in [
+             ("lineups.json", f"{BASE_URL}/lineups/{identifier.match_id}.json"),
+             ("events.json", f"{BASE_URL}/events/{identifier.match_id}.json"),
+         ]:
+             data_feed_key = filename.split(".")[0]
+             file_id = data_feed_key + "__v1"
+             files[file_id] = retrieve_http(
+                 url,
+                 current_files.get(filename),
+                 file_data_feed_key=data_feed_key,
+                 file_data_spec_version="v1",
+                 file_data_serialization_format="json",
+             )
+
+         files["match__v1"] = DraftFile.from_input(
+             json.dumps(identifier._match, indent=4),
+             data_feed_key="match",
+             data_spec_version="v1",
+             data_serialization_format="json",
+             modified_at=None,
+         )
+
+         return files
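
For orientation, a minimal usage sketch (not part of the package) showing how this source's discovery methods chain together. It assumes a Source subclass can be constructed with just a name, as suggested by the Wyscout source further down, and that network access to the StatsBomb open-data repository is available.

from ingestify.infra.source.statsbomb_github import StatsbombGithub

# Hypothetical instantiation; Source.__init__(name) is inferred from the Wyscout source below.
source = StatsbombGithub(name="statsbomb")

# Selectors are plain dicts of competition_id/season_id pairs taken from competitions.json.
selectors = source.discover_selectors(dataset_type="match")

# Each discovered dataset dict carries the identifier plus "_"-prefixed fields
# such as _last_modified, _metadata and _state.
datasets = source.discover_datasets(dataset_type="match", **selectors[0])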

ingestify/infra/source/wyscout.py
@@ -0,0 +1,175 @@
+ import json
+ from typing import Optional, Dict, List
+
+ import requests
+
+ from ingestify import Source, retrieve_http
+ from ingestify.domain import DraftFile
+ from ingestify.exceptions import ConfigurationError
+
+ BASE_URL = "https://apirest.wyscout.com/v3"
+
+
+ def wyscout_pager_fn(url, response):
+     if response["meta"]["page_current"] < response["meta"]["page_count"]:
+         return f"{url}&page={response['meta']['page_current'] + 1}"
+     else:
+         return None
+
+
+ class Wyscout(Source):
+     def discover_selectors(self, dataset_type: str) -> List[Dict]:
+         raise NotImplementedError("Not implemented for Wyscout")
+
+     provider = "wyscout"
+
+     def __init__(self, name: str, username: str, password: str):
+         super().__init__(name)
+
+         self.username = username.strip()
+         self.password = password.strip()
+
+         if not self.username:
+             raise ConfigurationError(
+                 f"Username of Wyscout source named '{self.name}' cannot be empty"
+             )
+
+         if not self.password:
+             raise ConfigurationError(
+                 f"Password of Wyscout source named '{self.name}' cannot be empty"
+             )
+
+     def _get(self, path: str):
+         response = requests.get(
+             BASE_URL + path,
+             auth=(self.username, self.password),
+         )
+         if response.status_code == 400:
+             # What if the response isn't a json?
+             error = response.json()["error"]
+             raise ConfigurationError(
+                 f"Check username/password of Wyscout source named '{self.name}'. API response "
+                 f"was '{error['message']}' ({error['code']})."
+             )
+
+         response.raise_for_status()
+         return response.json()
+
+     def _get_paged(self, path: str, data_path: str):
+         data = []
+         current_page = 1
+         page_count = None
+         while page_count is None or current_page <= page_count:
+             page_data = self._get(path + f"?page={current_page}&limit=100")
+             page_count = page_data["meta"]["page_count"]
+
+             data.extend(page_data[data_path])
+             current_page += 1
+
+         return data
+
+     def discover_datasets(self, dataset_type: str, season_id: int):
+         matches = self._get(f"/seasons/{season_id}/matches")
+         datasets = []
+         for match in matches["matches"]:
+             dataset = dict(match_id=match["matchId"], version="v3", _metadata=match)
+             datasets.append(dataset)
+
+         return datasets
+
+     def fetch_dataset_files(
+         self, dataset_type, identifier, current_version
+     ) -> Dict[str, Optional[DraftFile]]:
+         current_files = current_version.modified_files_map if current_version else {}
+         files = {}
+
+         for filename, url in [
+             (
+                 "events.json",
+                 f"{BASE_URL}/matches/{identifier.match_id}/events?fetch=teams,players",
+             ),
+         ]:
+             files[filename] = retrieve_http(
+                 url, current_files.get(filename), auth=(self.username, self.password)
+             )
+         return files
+
+
+ #
+ # class WyscoutEvent(Wyscout):
+ #     dataset_type = "event"
+ #
+ #     def discover_datasets(self, season_id: int):
+ #         matches = self._get(f"/seasons/{season_id}/matches")
+ #         datasets = []
+ #         for match in matches["matches"]:
+ #             dataset = dict(match_id=match["matchId"], version="v3", _metadata=match)
+ #             datasets.append(dataset)
+ #
+ #         return datasets
+ #
+ #     def fetch_dataset_files(
+ #         self, identifier, current_version
+ #     ) -> Dict[str, Optional[DraftFile]]:
+ #         current_files = current_version.modified_files_map if current_version else {}
+ #         files = {}
+ #
+ #         for filename, url in [
+ #             (
+ #                 "events.json",
+ #                 f"{BASE_URL}/matches/{identifier.match_id}/events?fetch=teams,players",
+ #             ),
+ #         ]:
+ #             files[filename] = retrieve_http(
+ #                 url, current_files.get(filename), auth=(self.username, self.password)
+ #             )
+ #         return files
+ #
+ #
+ # class WyscoutPlayer(Wyscout):
+ #     dataset_type = "player"
+ #
+ #     def discover_datasets(self, season_id: int):
+ #         return [
+ #             dict(
+ #                 version="v3",
+ #             )
+ #         ]
+ #
+ #     def fetch_dataset_files(
+ #         self, identifier, current_version
+ #     ) -> Dict[str, Optional[DraftFile]]:
+ #         current_files = current_version.modified_files_map if current_version else {}
+ #
+ #         return {
+ #             "players.json": retrieve_http(
+ #                 f"{BASE_URL}/seasons/{identifier.season_id}/players?limit=100",
+ #                 current_files.get("players.json"),
+ #                 pager=("players", wyscout_pager_fn),
+ #                 auth=(self.username, self.password),
+ #             )
+ #         }
+
+
+ if __name__ == "__main__":
+     import dotenv, os
+
+     dotenv.load_dotenv()
+
+     kilmarnock_id = 8516
+     competition_id = 750
+     season_id = 188105
+     match_id = 5459107
+     player_id = 840543
+
+     data = requests.get(
+         f"{BASE_URL}/competitions/{competition_id}/players",
+         # f"{BASE_URL}/players/{player_id}/career",
+         # f"{BASE_URL}/matches/{match_id}/advancedstats/players",
+         # f"{BASE_URL}/competitions/{competition_id}/matches",  # teams/{kilmarnock_id}/advancedstats?compId={competition_id}",
+         # f"{BASE_URL}/teams/{kilmarnock_id}/squad",  # teams/{kilmarnock_id}/advancedstats?compId={competition_id}",
+         auth=(os.environ["WYSCOUT_USERNAME"], os.environ["WYSCOUT_PASSWORD"]),
+     ).json()
+     from pprint import pprint
+
+     pprint(data)
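
As a small illustration (not part of the file above), the wyscout_pager_fn helper only returns a next-page URL while pages remain; with a fabricated response payload:

from ingestify.infra.source.wyscout import wyscout_pager_fn

# Fabricated response; the "meta" field names match those read by wyscout_pager_fn.
response = {"meta": {"page_current": 1, "page_count": 3}}
url = "https://apirest.wyscout.com/v3/seasons/188105/players?limit=100"

print(wyscout_pager_fn(url, response))  # ...players?limit=100&page=2
print(wyscout_pager_fn(url, {"meta": {"page_current": 3, "page_count": 3}}))  # None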

ingestify/infra/store/__init__.py
@@ -0,0 +1,2 @@
+ from .dataset import *
+ from .file import *

ingestify/infra/store/dataset/__init__.py
@@ -0,0 +1,2 @@
+ from .local_dataset_repository import LocalDatasetRepository
+ from .sqlalchemy import SqlAlchemyDatasetRepository

ingestify/infra/store/dataset/local_dataset_repository.py
@@ -0,0 +1,73 @@
+ import glob
+ import os
+ import pickle
+ import uuid
+ from pathlib import Path
+ from typing import Optional
+
+ from ingestify.domain.models import (
+     Dataset,
+     DatasetCollection,
+     DatasetRepository,
+     Selector,
+ )
+
+
+ def parse_value(v):
+     try:
+         return int(v)
+     except ValueError:
+         return v
+
+
+ class LocalDatasetRepository(DatasetRepository):
+     def destroy(self, dataset: Dataset):
+         path = (
+             self.base_dir / dataset.identifier.key.replace("/", "__") / "dataset.pickle"
+         )
+         path.unlink()
+
+     @classmethod
+     def supports(cls, url: str) -> bool:
+         return url.startswith("file://")
+
+     def __init__(self, url: str):
+         self.base_dir = Path(url[7:])
+         raise DeprecationWarning(
+             "This Repository should not be used. Better use SqlAlchemyDatasetRepository with a local sqlite database."
+         )
+
+     def get_dataset_collection(
+         self,
+         dataset_type: Optional[str] = None,
+         provider: Optional[str] = None,
+         dataset_id: Optional[str] = None,
+         selector: Optional[Selector] = None,
+         **kwargs
+     ) -> DatasetCollection:
+
+         datasets = []
+         for dir_name in glob.glob(str(self.base_dir / "*")):
+             attributes = {
+                 item[0]: parse_value(item[1])
+                 for item in [
+                     part.split("=") for part in os.path.basename(dir_name).split("__")
+                 ]
+             }
+             if not selector or selector.matches(attributes):
+                 with open(dir_name + "/dataset.pickle", "rb") as fp:
+                     dataset = pickle.load(fp)
+                 datasets.append(dataset)
+         return DatasetCollection(datasets)
+
+     def save(self, bucket: str, dataset: Dataset):
+         path = (
+             self.base_dir / dataset.identifier.key.replace("/", "__") / "dataset.pickle"
+         )
+         path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(path, "wb") as fp:
+             pickle.dump(dataset, fp)
+
+     def next_identity(self):
+         return str(uuid.uuid4())
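
A short sketch (not part of the repository code) of the directory-name convention get_dataset_collection relies on: identifier attributes are stored as name=value pairs joined with "__", and parse_value coerces integer-looking values back to int. The example directory name is made up.

import os

def parse_value(v):
    # Same coercion as in the file above: int where possible, str otherwise.
    try:
        return int(v)
    except ValueError:
        return v

dir_name = "/data/datasets/competition_id=11__season_id=42"  # hypothetical path
attributes = {
    item[0]: parse_value(item[1])
    for item in [part.split("=") for part in os.path.basename(dir_name).split("__")]
}
print(attributes)  # {'competition_id': 11, 'season_id': 42}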

ingestify/infra/store/dataset/sqlalchemy/__init__.py
@@ -0,0 +1 @@
+ from .repository import SqlAlchemyDatasetRepository

ingestify/infra/store/dataset/sqlalchemy/mapping.py
@@ -0,0 +1,153 @@
+ import datetime
+ from pathlib import Path
+
+ from sqlalchemy import (
+     JSON,
+     BigInteger,
+     Column,
+     DateTime,
+     ForeignKey,
+     ForeignKeyConstraint,
+     Integer,
+     MetaData,
+     String,
+     Table,
+     TypeDecorator,
+ )
+ from sqlalchemy.orm import registry, relationship
+
+ from ingestify.domain.models import Dataset, File, Revision
+ from ingestify.domain.models.dataset.dataset import DatasetState
+
+
+ class TZDateTime(TypeDecorator):
+     impl = DateTime
+     LOCAL_TIMEZONE = datetime.datetime.utcnow().astimezone().tzinfo
+     cache_ok = True
+
+     def process_bind_param(self, value: datetime, dialect):
+         if value.tzinfo is None:
+             value = value.astimezone(self.LOCAL_TIMEZONE)
+
+         return value.astimezone(datetime.timezone.utc)
+
+     def process_result_value(self, value, dialect):
+         if not value:
+             return value
+
+         if value.tzinfo is None:
+             return value.replace(tzinfo=datetime.timezone.utc)
+
+         return value.astimezone(datetime.timezone.utc)
+
+
+ class PathString(TypeDecorator):
+     impl = String(255)
+
+     def process_bind_param(self, value: Path, dialect):
+         return str(value)
+
+     def process_result_value(self, value, dialect):
+         if not value:
+             return value
+
+         return Path(value)
+
+
+ class DatasetStateString(TypeDecorator):
+     impl = String(255)
+
+     def process_bind_param(self, value: DatasetState, dialect):
+         return value.value
+
+     def process_result_value(self, value, dialect):
+         if not value:
+             return value
+
+         return DatasetState[value]
+
+
+ mapper_registry = registry()
+
+ metadata = MetaData()
+
+ dataset_table = Table(
+     "dataset",
+     metadata,
+     Column("bucket", String(255), default=None),
+     Column("dataset_id", String(255), primary_key=True),
+     Column("provider", String(255)),
+     Column("dataset_type", String(255)),
+     Column("state", DatasetStateString),
+     Column("name", String(255)),
+     Column("identifier", JSON),
+     Column("metadata", JSON),
+     Column("created_at", TZDateTime(6)),
+     Column("updated_at", TZDateTime(6)),
+ )
+
+ revision_table = Table(
+     "revision",
+     metadata,
+     Column(
+         "dataset_id", String(255), ForeignKey("dataset.dataset_id"), primary_key=True
+     ),
+     Column("revision_id", Integer, primary_key=True),
+     Column("description", String(255)),
+     Column("created_at", TZDateTime(6)),
+ )
+ file_table = Table(
+     "file",
+     metadata,
+     Column("dataset_id", String(255), primary_key=True),
+     Column("revision_id", Integer, primary_key=True),
+     Column("file_id", String(255), primary_key=True),
+     Column("created_at", TZDateTime(6)),
+     Column("modified_at", TZDateTime(6)),
+     Column("tag", String(255)),
+     Column("content_type", String(255)),
+     Column("size", BigInteger),
+     Column("data_feed_key", String(255)),
+     Column("data_spec_version", String(255)),
+     Column("data_serialization_format", String(255)),
+     Column("storage_compression_method", String(255)),
+     Column("storage_size", BigInteger),
+     Column("storage_path", PathString),
+     ForeignKeyConstraint(
+         ("dataset_id", "revision_id"),
+         [revision_table.c.dataset_id, revision_table.c.revision_id],
+         ondelete="CASCADE",
+     ),
+ )
+
+
+ mapper_registry.map_imperatively(
+     Dataset,
+     dataset_table,
+     properties={
+         "revisions": relationship(
+             Revision,
+             backref="dataset",
+             order_by=revision_table.c.revision_id,
+             lazy="joined",
+             cascade="all, delete-orphan",
+         ),
+     },
+ )
+
+ mapper_registry.map_imperatively(
+     Revision,
+     revision_table,
+     properties={
+         "modified_files": relationship(
+             File,
+             order_by=file_table.c.file_id,
+             primaryjoin="and_(Revision.revision_id==File.revision_id, Revision.dataset_id==File.dataset_id)",
+             lazy="joined",
+             cascade="all, delete-orphan",
+         )
+     },
+ )
+
+
+ mapper_registry.map_imperatively(File, file_table)
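
A minimal sketch (not part of the package) of how this imperative mapping might be exercised against an in-memory SQLite database; it only assumes that metadata and the mapped domain classes are importable as shown in the file above.

from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# Importing the mapping module registers the imperative mappers as a side effect.
from ingestify.infra.store.dataset.sqlalchemy.mapping import metadata
from ingestify.domain.models import Dataset

engine = create_engine("sqlite:///:memory:")
metadata.create_all(engine)  # emits CREATE TABLE for dataset, revision and file

with Session(engine) as session:
    # Constructing Dataset instances needs the domain model's own arguments,
    # so this only checks that the mapped class can be queried.
    print(session.query(Dataset).count())  # 0 rows in a fresh database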