ingestify 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ingestify/__init__.py +1 -1
- ingestify/application/dataset_store.py +47 -36
- ingestify/application/ingestion_engine.py +3 -3
- ingestify/application/loader.py +71 -241
- ingestify/domain/models/__init__.py +1 -6
- ingestify/domain/models/base.py +22 -0
- ingestify/domain/models/data_spec_version_collection.py +6 -0
- ingestify/domain/models/dataset/__init__.py +3 -5
- ingestify/domain/models/dataset/dataset.py +15 -32
- ingestify/domain/models/dataset/dataset_repository.py +1 -15
- ingestify/domain/models/dataset/dataset_state.py +11 -0
- ingestify/domain/models/dataset/events.py +6 -16
- ingestify/domain/models/dataset/file.py +21 -34
- ingestify/domain/models/dataset/file_collection.py +3 -1
- ingestify/domain/models/dataset/file_repository.py +29 -28
- ingestify/domain/models/dataset/revision.py +26 -3
- ingestify/domain/models/event/domain_event.py +8 -4
- ingestify/domain/models/ingestion/__init__.py +0 -0
- ingestify/domain/models/ingestion/ingestion_job.py +325 -0
- ingestify/domain/models/ingestion/ingestion_job_summary.py +123 -0
- ingestify/domain/models/{extract_job.py → ingestion/ingestion_plan.py} +4 -4
- ingestify/domain/models/resources/dataset_resource.py +29 -37
- ingestify/domain/models/sink.py +1 -8
- ingestify/domain/models/task/task.py +3 -1
- ingestify/domain/models/task/task_summary.py +118 -0
- ingestify/domain/models/timing.py +16 -0
- ingestify/domain/services/identifier_key_transformer.py +111 -0
- ingestify/infra/fetch/http.py +5 -0
- ingestify/infra/source/statsbomb_github.py +67 -54
- ingestify/infra/store/dataset/__init__.py +0 -2
- ingestify/infra/store/dataset/sqlalchemy/mapping.py +187 -4
- ingestify/infra/store/dataset/sqlalchemy/repository.py +24 -24
- ingestify/infra/store/file/local_file_repository.py +3 -5
- ingestify/infra/store/file/s3_file_repository.py +4 -9
- ingestify/main.py +64 -25
- ingestify/utils.py +15 -78
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA +2 -1
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD +41 -34
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/WHEEL +1 -1
- ingestify/infra/store/dataset/local_dataset_repository.py +0 -73
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt +0 -0
- {ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt +0 -0
ingestify/infra/store/dataset/sqlalchemy/repository.py CHANGED

@@ -29,22 +29,6 @@ def parse_value(v):
         return v
 
 
-def json_serializer(o):
-    return json.dumps(o)
-
-
-def json_deserializer(o):
-    o = json.loads(o)
-    # THIS BREAKS WHEN USING OTHER JSON COLUMNS!!
-    o = Identifier(**o)
-    return o
-
-
-# @compiles(DateTime, "mysql")
-# def compile_datetime_mysql(type_, compiler, **kw):
-#     return "DATETIME(6)"
-
-
 def isfloat(x):
     try:
         a = float(x)
@@ -64,7 +48,7 @@ def isint(x):
         return a == b
 
 
-class SqlAlchemyDatasetRepository(DatasetRepository):
+class SqlAlchemySessionProvider:
     @staticmethod
     def fix_url(url: str) -> str:
         if url.startswith("postgres://"):
@@ -87,8 +71,6 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
             self.url,
             # Use the default isolation level, don't need SERIALIZABLE
             # isolation_level="SERIALIZABLE",
-            json_serializer=json_serializer,
-            json_deserializer=json_deserializer,
         )
         self.session = Session(bind=self.engine)
 
@@ -107,9 +89,29 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         self.url = state["url"]
         self._init_engine()
 
+    def _close_engine(self):
+        if hasattr(self, "session"):
+            self.session.close()
+            self.engine.dispose()
+
     def __del__(self):
-        self.
-
+        self._close_engine()
+
+    def reset(self):
+        self._close_engine()
+        self._init_engine()
+
+    def get(self):
+        return self.session
+
+
+class SqlAlchemyDatasetRepository(DatasetRepository):
+    def __init__(self, session_provider: SqlAlchemySessionProvider):
+        self.session_provider = session_provider
+
+    @property
+    def session(self):
+        return self.session_provider.get()
 
     def _filter_query(
         self,
@@ -207,9 +209,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
         )
 
         if not metadata_only:
-            dataset_query = apply_query_filter(
-                self.session.query(Dataset).options(joinedload(Dataset.revisions))
-            )
+            dataset_query = apply_query_filter(self.session.query(Dataset))
             datasets = list(dataset_query)
         else:
            datasets = []
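Taken together, these hunks move engine and session lifecycle out of SqlAlchemyDatasetRepository into the new SqlAlchemySessionProvider; the repository now resolves its session through the provider. A minimal sketch of the new wiring, assuming only what the hunks above and the main.py call sites below show (the sqlite URL is illustrative):

```python
# Sketch of the 0.3.0 session wiring; the sqlite URL is illustrative.
from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
from ingestify.infra.store.dataset.sqlalchemy.repository import (
    SqlAlchemySessionProvider,
)

# One provider owns the engine + session; repositories borrow the session
# lazily through the provider's get().
session_provider = SqlAlchemySessionProvider("sqlite:///metadata.db")
dataset_repository = SqlAlchemyDatasetRepository(session_provider)

# reset() closes the session, disposes the engine and re-initializes it;
# the diff does not show its call sites.
session_provider.reset()
```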
ingestify/infra/store/file/local_file_repository.py CHANGED

@@ -19,14 +19,12 @@ class LocalFileRepository(FileRepository):
         filename: str,
         stream: BinaryIO,
     ) -> Path:
-        path = self.get_path(bucket, dataset, revision_id, filename)
+        path = self.get_write_path(bucket, dataset, revision_id, filename)
         path.parent.mkdir(parents=True, exist_ok=True)
 
         with open(path, "wb") as fp:
             shutil.copyfileobj(stream, fp)
         return path
 
-    def load_content(
-        self,
-    ) -> BinaryIO:
-        return open(self.get_path(bucket, dataset, revision_id, filename), "rb")
+    def load_content(self, storage_path: str) -> BinaryIO:
+        return open(self.get_read_path(storage_path), "rb")
ingestify/infra/store/file/s3_file_repository.py CHANGED

@@ -8,10 +8,7 @@ from ingestify.domain.models import FileRepository
 
 
 class S3FileRepository(FileRepository):
-    def __init__(self, url: str):
-        super().__init__(url)
-
-        self._s3 = None
+    _s3 = None
 
     @property
     def s3(self):
@@ -30,16 +27,14 @@ class S3FileRepository(FileRepository):
         filename: str,
         stream: BinaryIO,
     ) -> Path:
-        key = self.get_path(bucket, dataset, revision_id, filename)
+        key = self.get_write_path(bucket, dataset, revision_id, filename)
         s3_bucket = Path(key.parts[0])
 
         self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).put(Body=stream)
         return key
 
-    def load_content(
-        self,
-    ) -> BinaryIO:
-        key = self.get_path(bucket, dataset, revision_id, filename)
+    def load_content(self, storage_path: str) -> BinaryIO:
+        key = self.get_read_path(storage_path)
         s3_bucket = Path(key.parts[0])
         return self.s3.Object(str(s3_bucket), str(key.relative_to(s3_bucket))).get()[
             "Body"
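Both file repositories change the same contract: writes still derive a location from (bucket, dataset, revision_id, filename) via get_write_path, while load_content now takes the storage path recorded at write time. A hedged sketch of the read side; the constructor kwargs mirror the main.py call sites below, and the concrete URL and path values are made up:

```python
# Sketch: load_content is now addressed by a stored path, not by dataset
# coordinates. The URL and storage path here are made up; in practice the
# storage path is whatever was recorded when the file was written.
from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
from ingestify.infra import LocalFileRepository

repo = LocalFileRepository(
    url="file:///tmp/ingestify-files",
    identifier_transformer=IdentifierTransformer(),
)

stream = repo.load_content(storage_path="/tmp/ingestify-files/main/events.json")
content = stream.read()
```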
ingestify/main.py CHANGED

@@ -11,19 +11,21 @@ from ingestify import Source
 from ingestify.application.dataset_store import DatasetStore
 from ingestify.application.ingestion_engine import IngestionEngine
 from ingestify.application.secrets_manager import SecretsManager
-from ingestify.domain import Selector
-from ingestify.domain.models import (
-    dataset_repository_factory,
-    file_repository_factory,
-)
+from ingestify.domain import Selector, FileRepository
 from ingestify.domain.models.data_spec_version_collection import (
     DataSpecVersionCollection,
 )
 from ingestify.domain.models.event import EventBus, Publisher, Subscriber
 
-from ingestify.domain.models.extract_job import ExtractJob
+from ingestify.domain.models.ingestion.ingestion_plan import IngestionPlan
 from ingestify.domain.models.fetch_policy import FetchPolicy
+from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
 from ingestify.exceptions import ConfigurationError
+from ingestify.infra import S3FileRepository, LocalFileRepository
+from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
+from ingestify.infra.store.dataset.sqlalchemy.repository import (
+    SqlAlchemySessionProvider,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -59,8 +61,23 @@ def import_cls(name):
     return getattr(mod, components[-1])
 
 
+def build_file_repository(file_url: str, identifier_transformer) -> FileRepository:
+    if file_url.startswith("s3://"):
+        repository = S3FileRepository(
+            url=file_url, identifier_transformer=identifier_transformer
+        )
+    elif file_url.startswith("file://"):
+        repository = LocalFileRepository(
+            url=file_url, identifier_transformer=identifier_transformer
+        )
+    else:
+        raise Exception(f"Cannot find repository to handle file {file_url}")
+
+    return repository
+
+
 def get_dataset_store_by_urls(
-    dataset_url: str, file_url: str, bucket: str
+    metadata_url: str, file_url: str, bucket: str, dataset_types
 ) -> DatasetStore:
     """
     Initialize a DatasetStore by a DatasetRepository and a FileRepository
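build_file_repository replaces the registry-driven file_repository_factory (whose machinery is removed from ingestify/utils.py further down) with an explicit URL-scheme check. A usage sketch, importing the function from ingestify.main where the hunk defines it; the URLs are illustrative:

```python
# Scheme dispatch instead of a registry lookup; URLs are illustrative.
from ingestify.domain.services.identifier_key_transformer import IdentifierTransformer
from ingestify.main import build_file_repository

transformer = IdentifierTransformer()

s3_repo = build_file_repository("s3://my-bucket/files", identifier_transformer=transformer)
local_repo = build_file_repository("file:///var/lib/ingestify", identifier_transformer=transformer)

# Any other scheme raises:
#   Exception: Cannot find repository to handle file ftp://example.com/files
```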
@@ -68,15 +85,30 @@ def get_dataset_store_by_urls(
     if not bucket:
         raise Exception("Bucket is not specified")
 
-    file_repository = file_repository_factory.build_if_supports(url=file_url)
+    identifier_transformer = IdentifierTransformer()
+    for dataset_type in dataset_types:
+        for id_key, id_config in dataset_type["identifier_keys"].items():
+            identifier_transformer.register_transformation(
+                provider=dataset_type["provider"],
+                dataset_type=dataset_type["dataset_type"],
+                id_key=id_key,
+                transformation=id_config["transformation"],
+            )
+
+    file_repository = build_file_repository(
+        file_url, identifier_transformer=identifier_transformer
+    )
+
+    if secrets_manager.supports(metadata_url):
+        metadata_url = secrets_manager.load_as_db_url(metadata_url)
+
+    if metadata_url.startswith("postgres://"):
+        metadata_url = metadata_url.replace("postgress://", "postgress+")
 
-    if secrets_manager.supports(dataset_url):
-        dataset_url = secrets_manager.load_as_db_url(dataset_url)
+    sqlalchemy_session_provider = SqlAlchemySessionProvider(metadata_url)
 
-    if dataset_url.startswith("postgres://"):
-        dataset_url = dataset_url.replace("postgress://", "postgress+")
+    dataset_repository = SqlAlchemyDatasetRepository(sqlalchemy_session_provider)
 
-    dataset_repository = dataset_repository_factory.build_if_supports(url=dataset_url)
     return DatasetStore(
         dataset_repository=dataset_repository,
         file_repository=file_repository,
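The new dataset_types argument feeds the IdentifierTransformer registrations. The diff only shows how the entries are consumed, so the shape below is inferred from that loop; every concrete value is made up:

```python
# Shape inferred from the registration loop above; all values are invented.
dataset_types = [
    {
        "provider": "statsbomb",
        "dataset_type": "match",
        "identifier_keys": {
            # The transformation spec format is defined by IdentifierTransformer
            # and is not shown in this diff.
            "match_id": {"transformation": "..."},
        },
    }
]
```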
@@ -88,14 +120,15 @@ def get_datastore(config_file, bucket: Optional[str] = None) -> DatasetStore:
     config = parse_config(config_file, default_value="")
 
     return get_dataset_store_by_urls(
-        dataset_url=config["main"]["dataset_url"],
+        metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
+        dataset_types=config.get("dataset_types", []),
     )
 
 
 def get_remote_datastore(url: str, bucket: str, **kwargs) -> DatasetStore:
-    return get_dataset_store_by_urls(
+    return get_dataset_store_by_urls(metadata_url=url, file_url=url, bucket=bucket)
 
 
 def get_source_cls(key: str) -> Type[Source]:
@@ -155,9 +188,10 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
 
     logger.info("Initializing IngestionEngine")
     store = get_dataset_store_by_urls(
-        dataset_url=config["main"]["dataset_url"],
+        metadata_url=config["main"]["metadata_url"],
         file_url=config["main"]["file_url"],
         bucket=bucket or config["main"].get("default_bucket"),
+        dataset_types=config.get("dataset_types", []),
     )
 
     # Setup an EventBus and wire some more components
@@ -173,19 +207,24 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
         store=store,
     )
 
-    logger.info("
+    logger.info("Adding IngestionPlans...")
 
     fetch_policy = FetchPolicy()
 
-
+    # Previous naming
+    ingestion_plans = config.get("extract_jobs", [])
+    # New naming
+    ingestion_plans.extend(config.get("ingestion_plans", []))
+
+    for ingestion_plan in ingestion_plans:
         data_spec_versions = DataSpecVersionCollection.from_dict(
-
+            ingestion_plan.get("data_spec_versions", {"default": {"v1"}})
         )
 
-        if "selectors" in
+        if "selectors" in ingestion_plan:
             selectors = [
                 Selector.build(selector, data_spec_versions=data_spec_versions)
-                for selector_args in
+                for selector_args in ingestion_plan["selectors"]
                 for selector in _product_selectors(selector_args)
             ]
         else:
@@ -193,13 +232,13 @@ def get_engine(config_file, bucket: Optional[str] = None) -> IngestionEngine:
             # but makes it easier later one where we loop over selectors.
             selectors = [Selector.build({}, data_spec_versions=data_spec_versions)]
 
-
-            source=sources[
-            dataset_type=
+        ingestion_plan = IngestionPlan(
+            source=sources[ingestion_plan["source"]],
+            dataset_type=ingestion_plan["dataset_type"],
             selectors=selectors,
             fetch_policy=fetch_policy,
             data_spec_versions=data_spec_versions,
         )
-        ingestion_engine.
+        ingestion_engine.add_ingestion_plan(ingestion_plan)
 
     return ingestion_engine
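get_engine now accepts both the old extract_jobs key and the new ingestion_plans key and merges them, so existing configs keep working while new ones adopt the IngestionPlan naming. A sketch of the parsed config structure these hunks imply, expressed as the Python structure the YAML would parse into; all values are illustrative:

```python
# Config structure implied by the get_engine hunks; values are made up.
config = {
    "main": {
        "metadata_url": "sqlite:///metadata.db",
        "file_url": "file:///tmp/ingestify-files",
        "default_bucket": "main",
    },
    "dataset_types": [],  # optional; feeds the IdentifierTransformer registration
    # "extract_jobs": [...],  # previous naming, still honoured and read first
    "ingestion_plans": [  # new naming, appended after any extract_jobs
        {
            "source": "statsbomb_github",
            "dataset_type": "match",
            "data_spec_versions": {"default": {"v1"}},
            "selectors": [{"competition_id": 11, "season_id": 90}],
        }
    ],
}
```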
ingestify/utils.py CHANGED

@@ -1,4 +1,5 @@
 import abc
+import asyncio
 import inspect
 import logging
 import os
@@ -8,7 +9,19 @@ from multiprocessing import get_context, cpu_count, get_all_start_methods
 
 from datetime import datetime, timezone
 from string import Template
-from typing import
+from typing import (
+    Dict,
+    Generic,
+    Type,
+    TypeVar,
+    Tuple,
+    Optional,
+    Any,
+    Callable,
+    Awaitable,
+    List,
+    Iterable,
+)
 
 import cloudpickle
 from typing_extensions import Self
@@ -39,83 +52,6 @@ def sanitize_exception_message(exception_message):
     return sanitized_message
 
 
-class ComponentRegistry:
-    def __init__(self):
-        self.__registered_components = {}
-
-        class _Registered(abc.ABCMeta):
-            def __new__(mcs, cls_name, bases, class_dict):
-                class_dict["name"] = cls_name
-                component_cls = super(_Registered, mcs).__new__(
-                    mcs, cls_name, bases, class_dict
-                )
-                if not inspect.isabstract(component_cls):
-                    self.register_component(cls_name, component_cls)
-                else:
-                    if bases[0] != abc.ABC:
-                        raise Exception(
-                            f"Class '{cls_name}' seems to be an concrete class, but missing some abstract methods"
-                        )
-                return component_cls
-
-        self.__metaclass = _Registered
-
-    @property
-    def metaclass(self):
-        return self.__metaclass
-
-    def register_component(self, cls_name, component_cls):
-        self.__registered_components[cls_name] = component_cls
-
-    def get_component(self, cls_name: str):
-        return self.__registered_components[cls_name]
-
-    def get_supporting_component(self, **kwargs) -> str:
-        for cls_name, class_ in self.__registered_components.items():
-            if not hasattr(class_, "supports"):
-                raise Exception(
-                    f"Class '{cls_name}' does not implemented a 'supports' classmethod. "
-                    f"This is required when using 'get_supporting_component'."
-                )
-
-            if class_.supports(**kwargs):
-                return cls_name
-
-        kwargs_str = sanitize_exception_message(str(kwargs))
-        raise Exception(f"No supporting class found for {kwargs_str}")
-
-
-T = TypeVar("T")
-R = TypeVar("R")
-
-
-class ComponentFactory(Generic[T]):
-    def __init__(self, registry: ComponentRegistry):
-        self.registry = registry
-
-    @classmethod
-    def build_factory(
-        cls, component_cls: Type[R], registry: ComponentRegistry
-    ) -> "ComponentFactory[R]":
-        return cls[component_cls](registry)
-
-    def build(self, cls_name, **kwargs) -> T:
-        component_cls = self.registry.get_component(cls_name)
-        try:
-            return component_cls.from_dict(**kwargs)
-        except AttributeError:
-            pass
-        try:
-            return component_cls(**kwargs)
-        except TypeError as e:
-            raise e
-            # raise TypeError(f"Could not initialize {cls_name}")
-
-    def build_if_supports(self, **kwargs) -> T:
-        cls_name = self.registry.get_supporting_component(**kwargs)
-        return self.build(cls_name, **kwargs)
-
-
 def key_from_dict(d: dict) -> str:
     return "/".join([f"{k}={v}" for k, v in sorted(d.items()) if not k.startswith("_")])
 
@@ -270,6 +206,7 @@ class TaskExecutor:
         logger.info(
             f"Finished {len(res)} tasks in {took:.1f} seconds. {(len(res)/took):.1f} tasks/sec"
         )
+        return res
 
     def join(self):
         self.pool.close()
{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.1.3
+Version: 0.3.0
 Summary: Standardizing soccer tracking- and event data
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -16,6 +16,7 @@ Requires-Dist: python-dotenv
 Requires-Dist: pyaml-env
 Requires-Dist: boto3
 Requires-Dist: pytz
+Requires-Dist: pydantic>=2.0.0
 Provides-Extra: test
 Requires-Dist: pytest<7,>=6.2.5; extra == "test"
{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/RECORD CHANGED

@@ -1,67 +1,74 @@
-ingestify/__init__.py,sha256=
+ingestify/__init__.py,sha256=DnPPEtJT32gAPuUKXgIsqUE4fIvc6QA96vrcKr6nz6A,301
 ingestify/cmdline.py,sha256=bIuyPgGEw4wIglNzpG9zp7TsJozsP8NSVsCe4eAyWUg,7189
 ingestify/exceptions.py,sha256=wMMuajl4AkQRfW60TLN7btJmQaH8-lUczXyW_2g9kOU,143
-ingestify/main.py,sha256=
+ingestify/main.py,sha256=0sTNoLcS7euOavIAviQIMTolRnXsvOvNbmFdXgXgxhE,8516
 ingestify/server.py,sha256=OVrf_XtpAQIn88MzqQzShXgsA9_jbnqYvD8YPBjn3cs,2413
 ingestify/source_base.py,sha256=GXAFCoT11Zov9M2v-fqQr9gFCXbtVfEIEH32V7r2oE8,382
-ingestify/utils.py,sha256=
+ingestify/utils.py,sha256=HETGhAoUlutLG0cQR63nac2JbFei9gnktDHeBQoYWfU,5692
 ingestify/application/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/application/dataset_store.py,sha256=
-ingestify/application/ingestion_engine.py,sha256=
-ingestify/application/loader.py,sha256=
+ingestify/application/dataset_store.py,sha256=6xMHa_ShyPOyegIKl2xwmRl3BlV5i21z95cpKW3oARw,11712
+ingestify/application/ingestion_engine.py,sha256=PtMjKMpvfqB802G5zfKLzyamdH7qFOXl3x6_97y8w60,2288
+ingestify/application/loader.py,sha256=v8ZcpMDEml9k_uFPFqT4WaCjXED_OIpAr7g0Pz5Hp6Y,7153
 ingestify/application/secrets_manager.py,sha256=5qCbPfUvRGP1Xbq6xPkMfpgYl8uPzF_0NbiKeRY5zxU,1757
 ingestify/domain/__init__.py,sha256=M7_fVTJjQUx53P4UQUPhowRKPKsIIjx4JYZL1yjHKsM,46
-ingestify/domain/models/__init__.py,sha256=
-ingestify/domain/models/
-ingestify/domain/models/
+ingestify/domain/models/__init__.py,sha256=cjQmdSDFA-saXjdF1mLPNWILFHIFgdj20J_fC5FmFsI,770
+ingestify/domain/models/base.py,sha256=6jzzIqSkH1mPsXZ2OTXMj09S_IlvMOrOBHBJyWAKEjE,555
+ingestify/domain/models/data_spec_version_collection.py,sha256=CAXlO4W2AOOWAPdPAuymqBHnJpiYtkr2z7fYFJ3HSCk,1372
 ingestify/domain/models/fetch_policy.py,sha256=d7K1TzliNJXxqaqzqEOQWLhvgIvmmqhUQEliXvSUcTs,1405
-ingestify/domain/models/sink.py,sha256=
+ingestify/domain/models/sink.py,sha256=OBVfFMpB7puJmHg4q2KYx4qgoAnlmX8xKWYnPi8a9pc,178
 ingestify/domain/models/source.py,sha256=sB3aqr2LfjIbtw7ODJpHnPj3RUeo7gYmTU7MXvfaYg4,973
-ingestify/domain/models/
+ingestify/domain/models/timing.py,sha256=TvvH6Szo61CD8wCP7Awyc45CXga5lKqvoW2U-0TRHlA,388
+ingestify/domain/models/dataset/__init__.py,sha256=i1kswluvWjw0xn4OUByRt7yeRvNHu1mauevv-Vmayx4,630
 ingestify/domain/models/dataset/collection.py,sha256=E2utQ6oyaFFrfQFMiwP9J_I7Wm21z0sRvE4Zc3QEs20,1310
 ingestify/domain/models/dataset/collection_metadata.py,sha256=gI5cb9M0QRsheIr2jA71wOyWfI5lGx5ES2Qw7rbDIoA,371
-ingestify/domain/models/dataset/dataset.py,sha256=
-ingestify/domain/models/dataset/dataset_repository.py,sha256=
-ingestify/domain/models/dataset/
-ingestify/domain/models/dataset/
-ingestify/domain/models/dataset/
-ingestify/domain/models/dataset/
+ingestify/domain/models/dataset/dataset.py,sha256=ReL50BXNaJVU29OB5_9CQEI7BekWsgi1t3AR7e5jENc,2743
+ingestify/domain/models/dataset/dataset_repository.py,sha256=kUjiqW58kOUOli1gZCLR5xw4dBX0bqI1UJsf16hgNsQ,812
+ingestify/domain/models/dataset/dataset_state.py,sha256=O95mea5N34HDXw7XsYzxHna4FVk_T-ZNUDezkvt7VzY,220
+ingestify/domain/models/dataset/events.py,sha256=58VacQejQt-WPh9BywP4st5McauM3gXBQo0kaDnSekY,481
+ingestify/domain/models/dataset/file.py,sha256=nuoZI9GI5OysYwWCCyNsHMlm1Z9A1GbEKd38jvBzJ4E,4119
+ingestify/domain/models/dataset/file_collection.py,sha256=yaQmqFlmbajLCkU5QnjgqCvKzvVEZJrXVvinx5UGHcM,1193
+ingestify/domain/models/dataset/file_repository.py,sha256=9EQprch9isAH2pbK7e7tfOKl6ulip4Ij1kBCTbO_rTc,1721
 ingestify/domain/models/dataset/identifier.py,sha256=EJYsxt0OS_43Y989DZQq8U9NjwmtvnHGYGMe6-hOBlI,575
-ingestify/domain/models/dataset/revision.py,sha256=
+ingestify/domain/models/dataset/revision.py,sha256=O_1HG2S2EmYdWqI2K282S_D-d6IhRh_f4Q3wV8MEhkk,1311
 ingestify/domain/models/dataset/selector.py,sha256=kEGpU8pIyjZ0zwE9n2uo_NY5xrNanWiTTgapyMAUEsw,1039
 ingestify/domain/models/event/__init__.py,sha256=OdPTpE9bj5QqdGmrYqRTLPX1f-LR9GWJYlGMPPEsuL8,138
 ingestify/domain/models/event/_old_event.py,sha256=RktgCAj9SMdtqkAc_bOwoghEb2Z6m4r5_xWXin9wqx4,472
 ingestify/domain/models/event/dispatcher.py,sha256=5WnyUJ7Qzr612btAtl1dMG9JBXDPcsBLyLmW6H7Q1zk,154
-ingestify/domain/models/event/domain_event.py,sha256=
+ingestify/domain/models/event/domain_event.py,sha256=OR6va417j2lisRr0gjQZ9rshAtlys5sVu7KU-W0r0xA,316
 ingestify/domain/models/event/event_bus.py,sha256=iseourbCwdUg-ODM5bM_u6cageJmceWLstOxiP3-2qU,576
 ingestify/domain/models/event/publisher.py,sha256=TOAawYYiPQCLR2Gm17LumMEzeapMDYcAYeklLFmwqAY,620
 ingestify/domain/models/event/subscriber.py,sha256=tP1ZFSvpJWKUITnATYekRxJzepz85UY7egBTMiP-dwg,1039
+ingestify/domain/models/ingestion/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestify/domain/models/ingestion/ingestion_job.py,sha256=U6B62c7NGeHBAjmKhgOa4uHeul34xyR66WtWaPSRNTU,12276
+ingestify/domain/models/ingestion/ingestion_job_summary.py,sha256=1l9O3QJkYLs74HhrwAijwNEriPMwHN9OFG64Iz4z3uI,4262
+ingestify/domain/models/ingestion/ingestion_plan.py,sha256=KAvITBMQt3zmMFokESQJyp3rMuz1Hxr6msfZK1_faZM,648
 ingestify/domain/models/resources/__init__.py,sha256=ZuY9DPRfwk-aLB3Lj6DYP_NqMkcQfcYjZp4VejTtcbU,46
-ingestify/domain/models/resources/dataset_resource.py,sha256=
+ingestify/domain/models/resources/dataset_resource.py,sha256=NRnN029ct3P_Eg2d9Unb1t7A12Ksv_emBGhoe9DpPwM,3118
 ingestify/domain/models/task/__init__.py,sha256=BdlyIPvE07Xax_IzLgO9DUw0wsz9OZutxnxdDNyRlys,79
 ingestify/domain/models/task/set.py,sha256=04txDYgS5rotXofD9TqChKdW0VZIYshrkfPIpXtlhW4,430
-ingestify/domain/models/task/task.py,sha256=
+ingestify/domain/models/task/task.py,sha256=OwLZQi9GGe0O8m1dKvJdN2Rham5oilI49KyKc5uV20A,161
+ingestify/domain/models/task/task_summary.py,sha256=ovzqKPstngRVzVA_JboQMluq5uQjKVJDsWNNcfcadhU,3774
 ingestify/domain/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ingestify/domain/services/identifier_key_transformer.py,sha256=y4GS9u9Ej1MO2jUhAxWbifp0mrE_MqTHvVVcoQzSKb4,4034
 ingestify/domain/services/transformers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/domain/services/transformers/kloppy_to_pandas.py,sha256=NcN6nTBGVn9gz-_hWZJTMcduS1Gg7EM4X95Cqxi1QIM,809
 ingestify/infra/__init__.py,sha256=V0hpLzPVTcOHRVh0gguF6FT30YIgEOUd5v87xUHkfZ4,88
 ingestify/infra/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/infra/fetch/http.py,sha256=
+ingestify/infra/fetch/http.py,sha256=ldaXy6alBbI9z63H97lXfYZNT0ZCBkTac1W6-acNjjY,4127
 ingestify/infra/serialization/__init__.py,sha256=LwfmRoO4qykZkJZXxVPSKpwoVIkg9qzXa7Egut9JjL4,1772
 ingestify/infra/sink/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/sink/postgresql.py,sha256=SxuM3LntfYcpCriUpqJhMvgAf0s9cohXf6WkxSEDYDY,1816
 ingestify/infra/source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-ingestify/infra/source/statsbomb_github.py,sha256=
+ingestify/infra/source/statsbomb_github.py,sha256=IzzrlIRqkChgJp87yW3ugG1my4g_5uMx_xEnoQLWNss,3543
 ingestify/infra/source/wyscout.py,sha256=DxCzdkzYpVRHTfV9GpF8pe3FzwIk-WHYUlea6nOUGxE,5626
 ingestify/infra/store/__init__.py,sha256=3dA6NWfB6FS5SFdQiSlJ0ZghBfnUAUuGIP5Vr4rkCqk,43
-ingestify/infra/store/dataset/__init__.py,sha256=
-ingestify/infra/store/dataset/local_dataset_repository.py,sha256=UMgSe1M9u_629V4WyuTJ-QegZJiDczzMo7vkNbNleqA,2064
+ingestify/infra/store/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/infra/store/dataset/sqlalchemy/__init__.py,sha256=Z5JHWGO_hwT6rO-ecMOOAmOKjFFJi449KZvJTQgt6vQ,52
-ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=
-ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256
+ingestify/infra/store/dataset/sqlalchemy/mapping.py,sha256=UlEIfNusSOEWOxPi_ORrdLSylbi6-TO1qwEmcrBLwog,9447
+ingestify/infra/store/dataset/sqlalchemy/repository.py,sha256=-eSR_F9tS9Hd3JNEpoJoDAb5RY38rFaKLMI3eBedjx8,7068
 ingestify/infra/store/file/__init__.py,sha256=DuEekZa2pmDuRCFiulbgoGotN0wGv3OrRXSvokY0PhY,104
-ingestify/infra/store/file/local_file_repository.py,sha256=
-ingestify/infra/store/file/s3_file_repository.py,sha256=
+ingestify/infra/store/file/local_file_repository.py,sha256=1hhLqds5LlppJq2QBB0oN0Q98j6aXreCtYQYz3Q1P8g,819
+ingestify/infra/store/file/s3_file_repository.py,sha256=_sekV1rfEbwIaSGhKRnFQlj92E9qNgONiwXt6ZLCyGg,1188
 ingestify/static/templates/statsbomb_github/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 ingestify/static/templates/statsbomb_github/config.yaml.jinja2,sha256=_gAuAipfBL3ddLacyS1IBP5JluvPS2vmrb8GGaFtcUM,386
 ingestify/static/templates/statsbomb_github/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
@@ -72,8 +79,8 @@ ingestify/static/templates/wyscout/README.md,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 ingestify/static/templates/wyscout/config.yaml.jinja2,sha256=0zQXuvJVwd0oL2OJsPMZ8sOvRbdfRbieSGLQ44ezmYc,379
 ingestify/static/templates/wyscout/query.py,sha256=wjAOMoKvhX-BzCRqEm1SJp6YAcF8Fsq7ddrOaOpAeOk,364
 ingestify/static/templates/wyscout/database/README.md,sha256=7IuzjKo7Pqkx5wkmOETRZDljVOslqfA3ALuHMONq5dg,32
-ingestify-0.1.3.dist-info/METADATA,sha256=
-ingestify-0.1.3.dist-info/WHEEL,sha256=
-ingestify-0.1.3.dist-info/entry_points.txt,sha256=
-ingestify-0.1.3.dist-info/top_level.txt,sha256=
-ingestify-0.1.3.dist-info/RECORD,,
+ingestify-0.3.0.dist-info/METADATA,sha256=-QlChdV6OYWkqSyXUmkQTG4deBliRsSmmZMTWKeURnI,18853
+ingestify-0.3.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ingestify-0.3.0.dist-info/entry_points.txt,sha256=czYYXeX2ul4zdeB6bKlz3HaUF7zyVVcj9E_sRNDisI0,53
+ingestify-0.3.0.dist-info/top_level.txt,sha256=Lwnjgns4KequS7KiicXhh6mLUvcdfjzLyPI4qf_s4A0,10
+ingestify-0.3.0.dist-info/RECORD,,
ingestify/infra/store/dataset/local_dataset_repository.py DELETED

@@ -1,73 +0,0 @@
-import glob
-import os
-import pickle
-import uuid
-from pathlib import Path
-from typing import Optional
-
-from ingestify.domain.models import (
-    Dataset,
-    DatasetCollection,
-    DatasetRepository,
-    Selector,
-)
-
-
-def parse_value(v):
-    try:
-        return int(v)
-    except ValueError:
-        return v
-
-
-class LocalDatasetRepository(DatasetRepository):
-    def destroy(self, dataset: Dataset):
-        path = (
-            self.base_dir / dataset.identifier.key.replace("/", "__") / "dataset.pickle"
-        )
-        path.unlink()
-
-    @classmethod
-    def supports(cls, url: str) -> bool:
-        return url.startswith("file://")
-
-    def __init__(self, url: str):
-        self.base_dir = Path(url[7:])
-        raise DeprecationWarning(
-            "This Repository should not be used. Better use SqlAlchemyDatasetRepository with a local sqlite database."
-        )
-
-    def get_dataset_collection(
-        self,
-        dataset_type: Optional[str] = None,
-        provider: Optional[str] = None,
-        dataset_id: Optional[str] = None,
-        selector: Optional[Selector] = None,
-        **kwargs
-    ) -> DatasetCollection:
-
-        datasets = []
-        for dir_name in glob.glob(str(self.base_dir / "*")):
-            attributes = {
-                item[0]: parse_value(item[1])
-                for item in [
-                    part.split("=") for part in os.path.basename(dir_name).split("__")
-                ]
-            }
-            if not selector or selector.matches(attributes):
-                with open(dir_name + "/dataset.pickle", "rb") as fp:
-                    dataset = pickle.load(fp)
-                    datasets.append(dataset)
-        return DatasetCollection(datasets)
-
-    def save(self, bucket: str, dataset: Dataset):
-        path = (
-            self.base_dir / dataset.identifier.key.replace("/", "__") / "dataset.pickle"
-        )
-        path.parent.mkdir(parents=True, exist_ok=True)
-
-        with open(path, "wb") as fp:
-            pickle.dump(dataset, fp)
-
-    def next_identity(self):
-        return str(uuid.uuid4())
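The deleted repository's own DeprecationWarning already named the replacement; with the 0.3.0 wiring that substitution reads roughly as follows (the sqlite path is illustrative):

```python
# The substitution the deprecation message pointed at, using the new
# session provider from this release; the sqlite path is illustrative.
from ingestify.infra.store.dataset.sqlalchemy import SqlAlchemyDatasetRepository
from ingestify.infra.store.dataset.sqlalchemy.repository import (
    SqlAlchemySessionProvider,
)

repository = SqlAlchemyDatasetRepository(
    SqlAlchemySessionProvider("sqlite:///datasets.db")
)
```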
{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/entry_points.txt
File without changes

{ingestify-0.1.3.dist-info → ingestify-0.3.0.dist-info}/top_level.txt
File without changes