ingestify 0.6.3__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. ingestify/__init__.py +1 -1
  2. ingestify/application/dataset_store.py +228 -11
  3. ingestify/application/ingestion_engine.py +229 -7
  4. ingestify/application/loader.py +153 -28
  5. ingestify/cmdline.py +0 -48
  6. ingestify/domain/models/__init__.py +2 -0
  7. ingestify/domain/models/dataset/collection.py +0 -9
  8. ingestify/domain/models/dataset/dataset_repository.py +4 -0
  9. ingestify/domain/models/dataset/dataset_state.py +5 -0
  10. ingestify/domain/models/dataset/events.py +13 -0
  11. ingestify/domain/models/dataset/file.py +1 -1
  12. ingestify/domain/models/dataset/selector.py +8 -1
  13. ingestify/domain/models/event/event_bus.py +16 -1
  14. ingestify/domain/models/ingestion/ingestion_job.py +23 -4
  15. ingestify/domain/models/resources/dataset_resource.py +0 -1
  16. ingestify/infra/source/statsbomb/base.py +36 -0
  17. ingestify/infra/source/statsbomb/match.py +137 -0
  18. ingestify/infra/source/statsbomb_github.py +46 -44
  19. ingestify/infra/store/dataset/sqlalchemy/repository.py +100 -31
  20. ingestify/infra/store/dataset/sqlalchemy/tables.py +10 -0
  21. ingestify/main.py +35 -10
  22. ingestify/utils.py +2 -32
  23. ingestify-0.7.0.dist-info/METADATA +211 -0
  24. {ingestify-0.6.3.dist-info → ingestify-0.7.0.dist-info}/RECORD +28 -36
  25. ingestify/infra/source/wyscout.py +0 -175
  26. ingestify/static/templates/statsbomb_github/config.yaml.jinja2 +0 -19
  27. ingestify/static/templates/statsbomb_github/database/README.md +0 -1
  28. ingestify/static/templates/statsbomb_github/query.py +0 -14
  29. ingestify/static/templates/wyscout/.env +0 -5
  30. ingestify/static/templates/wyscout/.gitignore +0 -2
  31. ingestify/static/templates/wyscout/README.md +0 -0
  32. ingestify/static/templates/wyscout/config.yaml.jinja2 +0 -18
  33. ingestify/static/templates/wyscout/database/README.md +0 -1
  34. ingestify/static/templates/wyscout/query.py +0 -14
  35. ingestify-0.6.3.dist-info/METADATA +0 -266
  36. /ingestify/{static/templates/statsbomb_github/README.md → infra/source/statsbomb/__init__.py} +0 -0
  37. {ingestify-0.6.3.dist-info → ingestify-0.7.0.dist-info}/WHEEL +0 -0
  38. {ingestify-0.6.3.dist-info → ingestify-0.7.0.dist-info}/entry_points.txt +0 -0
  39. {ingestify-0.6.3.dist-info → ingestify-0.7.0.dist-info}/top_level.txt +0 -0
ingestify/infra/source/statsbomb/match.py ADDED
@@ -0,0 +1,137 @@
+ from datetime import datetime
+
+ from ingestify import DatasetResource
+ from ingestify.domain.models.dataset.dataset import DatasetState
+
+ from .base import StatsBombBaseAPI
+
+
+ class StatsBombMatchAPI(StatsBombBaseAPI):
+     def discover_selectors(self, dataset_type: str):
+         assert dataset_type == "match"
+
+         competitions = self.get(data_spec_version="v4", path="competitions")
+
+         def get_last_modified(competition):
+             if not competition["match_updated"]:
+                 return None
+
+             last_modified = datetime.fromisoformat(
+                 competition["match_updated"] + "+00:00"
+             )
+             if competition["match_updated_360"]:
+                 last_modified = max(
+                     last_modified,
+                     datetime.fromisoformat(competition["match_updated_360"] + "+00:00"),
+                 )
+             return last_modified
+
+         return [
+             dict(
+                 competition_id=competition["competition_id"],
+                 season_id=competition["season_id"],
+                 # Passing the LastModified for an entire competition allows Ingestify to entirely skip
+                 # this Selector based on a datetime based check. Dataset comparison won't happen. When the
+                 # DataSpecVersion is changed, but LastModified isn't changed on the Source, new files ARE NOT ingested!
+                 _last_modified=get_last_modified(competition),
+             )
+             for competition in competitions
+         ]
+
+     def find_datasets(
+         self,
+         dataset_type: str,
+         competition_id: str,
+         season_id: str,
+         match_id: str = None,
+         data_spec_versions=None,
+         dataset_collection_metadata=None,
+     ):
+         assert dataset_type == "match"
+
+         match_data_spec_version = data_spec_versions.get_version("match")
+
+         matches = self.get(
+             path=f"competitions/{competition_id}/seasons/{season_id}/matches",
+             data_spec_version=match_data_spec_version,
+         )
+
+         for match in matches:
+             if match_id:
+                 if match["match_id"] != match_id:
+                     continue
+
+             last_modified = datetime.fromisoformat(match["last_updated"] + "+00:00")
+
+             if match["collection_status"] == "Complete":
+                 if match["match_status"] == "available":
+                     state = DatasetState.COMPLETE
+                 else:
+                     # This could be "processing"
+                     state = DatasetState.PARTIAL
+             else:
+                 state = DatasetState.SCHEDULED
+
+             name = (
+                 f"{match['match_date']} / "
+                 f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+             )
+
+             dataset_resource = DatasetResource(
+                 dataset_resource_id=dict(
+                     competition_id=competition_id,
+                     season_id=season_id,
+                     match_id=match["match_id"],
+                 ),
+                 dataset_type=dataset_type,
+                 provider=self.provider,
+                 name=name,
+                 metadata=match,
+                 state=state,
+             )
+
+             dataset_resource.add_file(
+                 last_modified=last_modified,
+                 data_feed_key="match",
+                 data_spec_version=match_data_spec_version,
+                 json_content=match,
+             )
+
+             if state.is_complete:
+                 name += f" / {match['home_score']}-{match['away_score']}"
+
+             for data_feed_key in ["lineups", "events"]:
+                 for data_spec_version in data_spec_versions[data_feed_key]:
+                     dataset_resource.add_file(
+                         # Note: the LastModified value can be incorrect when only match Metadata (match file)
+                         # is changed. Use it anyway for indication. Ingestify will also use the
+                         # Dataset.last_modified_at value to determine if a file should be refetched
+                         last_modified=last_modified,
+                         data_feed_key=data_feed_key,
+                         data_spec_version=data_spec_version,
+                         url=self.get_url(
+                             data_feed_key, data_spec_version, match["match_id"]
+                         ),
+                         http_options=dict(auth=(self.username, self.password)),
+                         data_serialization_format="json",
+                     )
+
+             if (
+                 match["last_updated_360"]
+                 and match["match_status_360"] == "available"
+             ):
+                 for data_spec_version in data_spec_versions.get("360-frames", []):
+                     dataset_resource.add_file(
+                         last_modified=datetime.fromisoformat(
+                             match["last_updated_360"] + "+00:00"
+                         ),
+                         data_feed_key="360-frames",
+                         data_spec_version=data_spec_version,
+                         url=self.get_url(
+                             "360-frames", data_spec_version, match["match_id"]
+                         ),
+                         http_options=dict(auth=(self.username, self.password)),
+                         data_serialization_format="json",
+                     )
+
+             yield dataset_resource
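
The new source maps StatsBomb's collection_status / match_status fields onto Ingestify's DatasetState before any files are attached. For reference, a standalone sketch of that mapping — the DatasetState enum below is a simplified stand-in (the real one lives in ingestify.domain.models.dataset.dataset_state), and the sample field values are illustrative:

# Illustrative sketch; simplified stand-in for ingestify's DatasetState.
from enum import Enum


class DatasetState(Enum):
    SCHEDULED = "SCHEDULED"
    PARTIAL = "PARTIAL"
    COMPLETE = "COMPLETE"


def match_state(match: dict) -> DatasetState:
    # Mirrors the branch in StatsBombMatchAPI.find_datasets above.
    if match["collection_status"] == "Complete":
        if match["match_status"] == "available":
            return DatasetState.COMPLETE
        # e.g. still "processing"
        return DatasetState.PARTIAL
    return DatasetState.SCHEDULED


assert match_state({"collection_status": "Complete", "match_status": "available"}) is DatasetState.COMPLETE
assert match_state({"collection_status": "Scheduled", "match_status": "processing"}) is DatasetState.SCHEDULED
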
ingestify/infra/source/statsbomb_github.py CHANGED
@@ -21,6 +21,7 @@ class StatsbombGithub(Source):
              dict(
                  competition_id=competition["competition_id"],
                  season_id=competition["season_id"],
+                 _name=f"{competition['competition_name']} - {competition['season_name']}",
              )
              for competition in competitions
          ]
@@ -53,53 +54,54 @@ class StatsbombGithub(Source):
              name = (
                  f"{match['match_date']} / "
                  f"{match['home_team']['home_team_name']} - {match['away_team']['away_team_name']}"
+                 f" / {match['home_score']}-{match['away_score']}"
              )

-             dataset_resource = DatasetResource(
-                 dataset_resource_id=dict(
-                     competition_id=competition_id,
-                     season_id=season_id,
-                     match_id=match["match_id"],
-                 ),
-                 dataset_type=dataset_type,
-                 provider=self.provider,
-                 name=name,
-                 metadata=match,
-                 state=state,
+             dataset_resource = (
+                 DatasetResource(
+                     dataset_resource_id=dict(
+                         competition_id=competition_id,
+                         season_id=season_id,
+                         match_id=match["match_id"],
+                     ),
+                     dataset_type=dataset_type,
+                     provider=self.provider,
+                     name=name,
+                     metadata=match,
+                     state=state,
+                 )
+                 .add_file(
+                     last_modified=last_modified,
+                     data_feed_key="match",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     json_content=match,
+                 )
+                 .add_file(
+                     last_modified=last_modified,
+                     data_feed_key="lineups",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     url=f"{BASE_URL}/lineups/{match['match_id']}.json",
+                     data_serialization_format="json",
+                 )
+                 .add_file(
+                     last_modified=last_modified,
+                     data_feed_key="events",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     url=f"{BASE_URL}/events/{match['match_id']}.json",
+                     data_serialization_format="json",
+                 )
              )

-             dataset_resource.add_file(
-                 last_modified=last_modified,
-                 data_feed_key="match",
-                 data_spec_version=DATA_SPEC_VERSION,
-                 json_content=match,
-             )
-
-             if state.is_complete:
-                 name += f" / {match['home_score']}-{match['away_score']}"
-
-             for data_feed_key in ["lineups", "events"]:
-                 dataset_resource.add_file(
-                     last_modified=last_modified,
-                     data_feed_key=data_feed_key,
-                     data_spec_version=DATA_SPEC_VERSION,
-                     url=f"{BASE_URL}/{data_feed_key}/{match['match_id']}.json",
-                     data_serialization_format="json",
-                 )
-
-             if (
-                 match["last_updated_360"]
-                 and match["match_status_360"] == "available"
-             ):
-                 dataset_resource.add_file(
-                     last_modified=datetime.fromisoformat(
-                         match["last_updated_360"] + "+00:00"
-                     ),
-                     data_feed_key="360-frames",
-                     data_spec_version=DATA_SPEC_VERSION,
-                     url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
-                     data_serialization_format="json",
-                     http_options={"ignore_not_found": True},
-                 )
+             if match["last_updated_360"] and match["match_status_360"] == "available":
+                 dataset_resource.add_file(
+                     last_modified=datetime.fromisoformat(
+                         match["last_updated_360"] + "+00:00"
+                     ),
+                     data_feed_key="360-frames",
+                     data_spec_version=DATA_SPEC_VERSION,
+                     url=f"{BASE_URL}/three-sixty/{match['match_id']}.json",
+                     data_serialization_format="json",
+                     http_options={"ignore_not_found": True},
+                 )

              yield dataset_resource
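
The rewrite above switches statsbomb_github to a chained builder style, which implies DatasetResource.add_file returns the resource itself (the chain result is assigned to dataset_resource and add_file is called on it again later). A minimal sketch of that fluent shape with a hypothetical stand-in class, not the package's actual implementation:

# Hypothetical stand-in illustrating the chained add_file style used above.
from dataclasses import dataclass, field


@dataclass
class Resource:
    files: list = field(default_factory=list)

    def add_file(self, **kwargs) -> "Resource":
        self.files.append(kwargs)
        return self  # returning self is what makes the chaining possible


r = Resource().add_file(data_feed_key="match").add_file(data_feed_key="lineups").add_file(data_feed_key="events")
assert [f["data_feed_key"] for f in r.files] == ["match", "lineups", "events"]
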
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED
@@ -29,6 +29,7 @@ from ingestify.domain.models import (
      Dataset,
      DatasetCollection,
      DatasetRepository,
+     DatasetState,
      Selector,
  )
  from ingestify.domain.models.dataset.collection_metadata import (
@@ -46,6 +47,7 @@ from .tables import (
      revision_table,
      ingestion_job_summary_table,
      task_summary_table,
+     store_version_table,
  )

  logger = logging.getLogger(__name__)
@@ -159,6 +161,10 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          entities: list[dict],
          immutable_rows: bool = False,
      ):
+         if not entities:
+             # Nothing to do
+             return
+
          dialect = self.dialect.name
          if dialect == "mysql":
              from sqlalchemy.dialects.mysql import insert
@@ -230,6 +236,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          provider: Optional[str] = None,
          dataset_id: Optional[Union[str, List[str]]] = None,
          selector: Optional[Union[Selector, List[Selector]]] = None,
+         dataset_state: Optional[List[DatasetState]] = None,
      ):
          if dataset_id is not None:
              if isinstance(dataset_id, list):
@@ -268,33 +275,35 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          if not selectors:
              raise ValueError("Selectors must contain at least one item")

-         attribute_cte = self._build_cte(
-             [selector.filtered_attributes for selector in selectors], "attributes"
-         )
-
-         keys = list(selectors[0].filtered_attributes.keys())
          first_selector = selectors[0].filtered_attributes
+         keys = list(first_selector.keys())

-         join_conditions = []
-         for k in keys:
-             if dialect == "postgresql":
-                 column = dataset_table.c.identifier[k]
+         if keys:
+             attribute_cte = self._build_cte(
+                 [selector.filtered_attributes for selector in selectors],
+                 "attributes",
+             )

-                 # Take the value from the first selector to determine the type.
-                 # TODO: check all selectors to determine the type
-                 v = first_selector[k]
-                 if isinstance(v, int):
-                     column = column.as_integer()
+             join_conditions = []
+             for k in keys:
+                 if dialect == "postgresql":
+                     column = dataset_table.c.identifier[k]
+
+                     # Take the value from the first selector to determine the type.
+                     # TODO: check all selectors to determine the type
+                     v = first_selector[k]
+                     if isinstance(v, int):
+                         column = column.as_integer()
+                     else:
+                         column = column.as_string()
                  else:
-                     column = column.as_string()
-             else:
-                 column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
+                     column = func.json_extract(dataset_table.c.identifier, f"$.{k}")

-             join_conditions.append(attribute_cte.c[k] == column)
+                 join_conditions.append(attribute_cte.c[k] == column)

-         query = query.select_from(
-             dataset_table.join(attribute_cte, and_(*join_conditions))
-         )
+             query = query.select_from(
+                 dataset_table.join(attribute_cte, and_(*join_conditions))
+             )

          if where:
              query = query.filter(text(where))
@@ -304,6 +313,8 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
              query = query.filter(dataset_table.c.dataset_type == dataset_type)
          if provider:
              query = query.filter(dataset_table.c.provider == provider)
+         if dataset_state:
+             query = query.filter(dataset_table.c.state.in_(dataset_state))

          return query

@@ -393,6 +404,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          dataset_id: Optional[Union[str, List[str]]] = None,
          selector: Optional[Union[Selector, List[Selector]]] = None,
          metadata_only: bool = False,
+         page: Optional[int] = None,
+         page_size: Optional[int] = None,
+         dataset_state: Optional[List[DatasetState]] = None,
      ) -> DatasetCollection:
          def apply_query_filter(query):
              return self._filter_query(
@@ -402,15 +416,23 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                  provider=provider,
                  dataset_id=dataset_id,
                  selector=selector,
+                 dataset_state=dataset_state,
              )

          with self.session:
              # Use a contextmanager to make sure it's closed afterwards

              if not metadata_only:
+                 # Apply sorting by created_at in ascending order
                  dataset_query = apply_query_filter(
                      self.session.query(dataset_table.c.dataset_id)
-                 )
+                 ).order_by(dataset_table.c.created_at.asc())
+
+                 # Apply pagination if both page and page_size are provided
+                 if page is not None and page_size is not None:
+                     offset = (page - 1) * page_size
+                     dataset_query = dataset_query.offset(offset).limit(page_size)
+
                  self._debug_query(dataset_query)
                  dataset_ids = [row.dataset_id for row in dataset_query]
                  datasets = self._load_datasets(dataset_ids)
@@ -501,19 +523,25 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
          with self.connect() as connection:
              try:
                  # Delete modified files related to the dataset
-                 file_table.delete().where(
-                     file_table.c.dataset_id == dataset.dataset_id
-                 ).execute()
+                 connection.execute(
+                     file_table.delete().where(
+                         file_table.c.dataset_id == dataset.dataset_id
+                     )
+                 )

                  # Delete revisions related to the dataset
-                 revision_table.delete().where(
-                     revision_table.c.dataset_id == dataset.dataset_id
-                 ).execute()
+                 connection.execute(
+                     revision_table.delete().where(
+                         revision_table.c.dataset_id == dataset.dataset_id
+                     )
+                 )

                  # Delete the dataset itself
-                 dataset_table.delete().where(
-                     dataset_table.c.dataset_id == dataset.dataset_id
-                 ).execute()
+                 connection.execute(
+                     dataset_table.delete().where(
+                         dataset_table.c.dataset_id == dataset.dataset_id
+                     )
+                 )

                  connection.commit()
              except Exception:
@@ -604,3 +632,44 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                  )
              )
          return ingestion_job_summaries
+
+     def get_store_version(self) -> Optional[str]:
+         """Get the current Ingestify version stored for this store."""
+         with self.session:
+             row = self.session.query(store_version_table.c.ingestify_version).first()
+             return row.ingestify_version if row else None
+
+     def set_store_version(self, version: str):
+         """Set the Ingestify version for this store."""
+         from ingestify.utils import utcnow
+
+         now = utcnow()
+         entity = {
+             "id": 1,
+             "ingestify_version": version,
+             "created_at": now,
+             "updated_at": now,
+         }
+
+         with self.connect() as connection:
+             try:
+                 self._upsert(connection, store_version_table, [entity])
+                 connection.commit()
+             except Exception:
+                 connection.rollback()
+                 raise
+
+     def ensure_compatible_version(self, current_version: str):
+         """Ensure the store is compatible with the current Ingestify version."""
+         stored_version = self.get_store_version()
+
+         if stored_version is None:
+             # First time setup - store the current version
+             self.set_store_version(current_version)
+             logger.info(f"Initialized store with Ingestify version {current_version}")
+         elif stored_version != current_version:
+             # Version mismatch - for now just log, future: trigger migration
+             logger.warning(
+                 f"Store version mismatch: stored={stored_version}, current={current_version}. "
+                 f"Future versions may require migration."
+             )
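
The new page / page_size arguments are translated into an OFFSET/LIMIT pair on the dataset query (ordered by created_at), and dataset_state becomes a plain IN filter. The pagination arithmetic, sketched on a list instead of a SQLAlchemy query for illustration:

# Illustrative only: the same (page - 1) * page_size arithmetic as above,
# applied to a plain list instead of a SQLAlchemy query.
def paginate(rows: list, page: int, page_size: int) -> list:
    offset = (page - 1) * page_size          # 1-based page number -> row offset
    return rows[offset:offset + page_size]   # at most page_size rows


assert paginate(list(range(10)), page=1, page_size=4) == [0, 1, 2, 3]
assert paginate(list(range(10)), page=3, page_size=4) == [8, 9]
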
ingestify/infra/store/dataset/sqlalchemy/tables.py CHANGED
@@ -84,6 +84,7 @@ class PathString(TypeDecorator):


  class DatasetStateString(TypeDecorator):
+     cache_ok = True
      impl = String(255)

      def process_bind_param(self, value: DatasetState, dialect):
@@ -318,6 +319,15 @@ task_summary_table = Table(
      # Column("state", RevisionStateString, default=RevisionState.PENDING_VALIDATION),
      # Column("source", JSONType()),
  )
+
+ store_version_table = Table(
+     "store_version",
+     metadata,
+     Column("id", Integer, primary_key=True, default=1),
+     Column("ingestify_version", String(255), nullable=False),
+     Column("created_at", TZDateTime(6), nullable=False),
+     Column("updated_at", TZDateTime(6), nullable=False),
+ )
  #
  #
  # mapper_registry = registry()
ingestify/main.py CHANGED
@@ -138,12 +138,16 @@ def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:

  def get_source_cls(key: str) -> Type[Source]:
      if key.startswith("ingestify."):
-         _, type_ = key.split(".")
+         _, type_ = key.split(".", maxsplit=1)
          if type_ == "wyscout":
              from ingestify.infra.source.wyscout import Wyscout

              return Wyscout

+         elif type_ == "statsbomb.match":
+             from ingestify.infra.source.statsbomb.match import StatsBombMatchAPI
+
+             return StatsBombMatchAPI
          elif type_ == "statsbomb_github":
              from ingestify.infra.source.statsbomb_github import StatsbombGithub

@@ -183,15 +187,36 @@ def get_event_subscriber_cls(key: str) -> Type[Subscriber]:


  def get_engine(
-     config_file, bucket: Optional[str] = None, disable_events: bool = False
+     config_file: Optional[str] = None,
+     bucket: Optional[str] = None,
+     disable_events: bool = False,
+     metadata_url: Optional[str] = None,
+     file_url: Optional[str] = None,
  ) -> IngestionEngine:
-     config = parse_config(config_file, default_value="")
-
-     logger.info("Initializing sources")
      sources = {}
-     sys.path.append(os.path.dirname(config_file))
-     for name, source_args in config["sources"].items():
-         sources[name] = build_source(name=name, source_args=source_args)
+
+     if not config_file:
+         if not metadata_url or not file_url:
+             raise ValueError(
+                 f"You must specify metadata_url and file_url in case you don't use a config_file"
+             )
+
+         config = {
+             "main": {
+                 "metadata_url": metadata_url,
+                 "file_url": file_url,
+                 "default_bucket": bucket or "main",
+             }
+         }
+     elif not config_file:
+         raise ValueError("You must specify a config file")
+     else:
+         config = parse_config(config_file, default_value="")
+
+     logger.info("Initializing sources")
+     sys.path.append(os.path.dirname(config_file))
+     for name, source_args in config.get("sources", {}).items():
+         sources[name] = build_source(name=name, source_args=source_args)

      logger.info("Initializing IngestionEngine")
      store = get_dataset_store_by_urls(
@@ -244,13 +269,13 @@
          # but makes it easier later one where we loop over selectors.
          selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]

-         ingestion_plan = IngestionPlan(
+         ingestion_plan_ = IngestionPlan(
              source=sources[ingestion_plan["source"]],
              dataset_type=ingestion_plan["dataset_type"],
              selectors=selectors,
              fetch_policy=fetch_policy,
              data_spec_versions=data_spec_versions,
          )
-         ingestion_engine.add_ingestion_plan(ingestion_plan)
+         ingestion_engine.add_ingestion_plan(ingestion_plan_)

      return ingestion_engine
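
The maxsplit=1 change in get_source_cls is what makes the dotted ingestify.statsbomb.match key above resolvable; the old two-value unpacking would have raised. A quick illustration:

key = "ingestify.statsbomb.match"

# Old behaviour: `_, type_ = key.split(".")` raises
# ValueError: too many values to unpack (expected 2)
_, type_ = key.split(".", maxsplit=1)  # new behaviour
assert type_ == "statsbomb.match"
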
ingestify/utils.py CHANGED
@@ -5,13 +5,11 @@ import re
  import traceback
  from concurrent.futures import ThreadPoolExecutor
  from contextlib import contextmanager
- from multiprocessing import get_context, cpu_count, get_all_start_methods

  from datetime import datetime, timezone
  from string import Template
  from typing import Dict, Tuple, Optional, Any, List

- import cloudpickle
  from pydantic import Field
  from typing_extensions import Self

@@ -75,8 +73,8 @@ class AttributeBag:
          return Template(string).substitute(**self.attributes)

      def matches(self, attributes: Dict) -> bool:
-         for k, v in self.attributes.items():
-             if attributes.get(k) != v:
+         for k, v in attributes.items():
+             if k in self.attributes and self.attributes[k] != v:
                  return False
          return True

@@ -110,34 +108,6 @@
          )


- def cloud_unpack_and_call(args):
-     f_pickled, org_args = args
-
-     f = cloudpickle.loads(f_pickled)
-     return f(org_args)
-
-
- def map_in_pool(func, iterable, processes=0):
-     # TODO: move to cmdline
-     if os.environ.get("INGESTIFY_RUN_EAGER") == "true":
-         return list(map(func, iterable))
-
-     if not processes:
-         processes = int(os.environ.get("INGESTIFY_CONCURRENCY", "0"))
-
-     if "fork" in get_all_start_methods():
-         ctx = get_context("fork")
-     else:
-         ctx = get_context("spawn")
-
-     wrapped_fn = cloudpickle.dumps(func)
-
-     with ctx.Pool(processes or cpu_count()) as pool:
-         return pool.map(
-             cloud_unpack_and_call, ((wrapped_fn, item) for item in iterable)
-         )
-
-
  class SyncExecutor:
      def map(self, func, iterable):
          return [func(item) for item in iterable]
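
The AttributeBag.matches change flips which side drives the comparison: previously every attribute in the bag had to be present in the passed attributes with an equal value; now only keys present on both sides are compared. Standalone re-implementations of both variants for illustration (not the package's class):

# Standalone re-implementations for illustration; not imported from ingestify.
def matches_old(bag: dict, attributes: dict) -> bool:
    return all(attributes.get(k) == v for k, v in bag.items())


def matches_new(bag: dict, attributes: dict) -> bool:
    return all(bag[k] == v for k, v in attributes.items() if k in bag)


bag = {"competition_id": 11, "season_id": 90}
assert matches_old(bag, {"competition_id": 11}) is False  # season_id missing from the query -> no match
assert matches_new(bag, {"competition_id": 11}) is True   # only overlapping keys are compared
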