ingestify 0.7.0__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. {ingestify-0.7.0/ingestify.egg-info → ingestify-0.9.0}/PKG-INFO +67 -4
  2. ingestify-0.7.0/PKG-INFO → ingestify-0.9.0/README.md +48 -12
  3. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/__init__.py +2 -1
  4. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/ingestion_engine.py +3 -0
  5. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/loader.py +12 -2
  6. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/dataset_state.py +1 -0
  7. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/file.py +6 -0
  8. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/ingestion/ingestion_job.py +5 -1
  9. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/resources/dataset_resource.py +13 -1
  10. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/fetch/http.py +3 -3
  11. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +90 -50
  12. ingestify-0.9.0/ingestify/infra/store/dataset/sqlalchemy/tables.py +398 -0
  13. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/main.py +189 -5
  14. ingestify-0.9.0/ingestify/tests/__init__.py +0 -0
  15. ingestify-0.9.0/ingestify/tests/conftest.py +17 -0
  16. ingestify-0.9.0/ingestify/tests/test_auto_ingest.py +418 -0
  17. ingestify-0.9.0/ingestify/tests/test_engine.py +501 -0
  18. ingestify-0.9.0/ingestify/tests/test_events.py +201 -0
  19. ingestify-0.9.0/ingestify/tests/test_file_cache.py +98 -0
  20. ingestify-0.9.0/ingestify/tests/test_pagination.py +162 -0
  21. ingestify-0.9.0/ingestify/tests/test_store_version.py +73 -0
  22. ingestify-0.9.0/ingestify/tests/test_table_prefix.py +78 -0
  23. ingestify-0.7.0/README.md → ingestify-0.9.0/ingestify.egg-info/PKG-INFO +75 -2
  24. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify.egg-info/SOURCES.txt +10 -1
  25. ingestify-0.7.0/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -381
  26. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/__init__.py +0 -0
  27. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/dataset_store.py +0 -0
  28. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/secrets_manager.py +0 -0
  29. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/cmdline.py +0 -0
  30. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/__init__.py +0 -0
  31. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/__init__.py +0 -0
  32. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/base.py +0 -0
  33. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  34. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/__init__.py +0 -0
  35. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/collection.py +0 -0
  36. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  37. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/dataset.py +0 -0
  38. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  39. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/events.py +0 -0
  40. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
  41. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
  42. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  43. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/revision.py +0 -0
  44. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/selector.py +0 -0
  45. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/__init__.py +0 -0
  46. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/_old_event.py +0 -0
  47. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  48. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/domain_event.py +0 -0
  49. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/event_bus.py +0 -0
  50. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/publisher.py +0 -0
  51. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/event/subscriber.py +0 -0
  52. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/fetch_policy.py +0 -0
  53. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
  54. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  55. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  56. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/resources/__init__.py +0 -0
  57. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/sink.py +0 -0
  58. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/source.py +0 -0
  59. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/task/__init__.py +0 -0
  60. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/task/set.py +0 -0
  61. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/task/task.py +0 -0
  62. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/task/task_summary.py +0 -0
  63. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/timing.py +0 -0
  64. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/services/__init__.py +0 -0
  65. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  66. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/services/transformers/__init__.py +0 -0
  67. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  68. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/exceptions.py +0 -0
  69. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/__init__.py +0 -0
  70. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/fetch/__init__.py +0 -0
  71. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/serialization/__init__.py +0 -0
  72. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/sink/__init__.py +0 -0
  73. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/sink/postgresql.py +0 -0
  74. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/source/__init__.py +0 -0
  75. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/source/statsbomb/__init__.py +0 -0
  76. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/source/statsbomb/base.py +0 -0
  77. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/source/statsbomb/match.py +0 -0
  78. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/source/statsbomb_github.py +0 -0
  79. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/__init__.py +0 -0
  80. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/dataset/__init__.py +0 -0
  81. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  82. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/file/__init__.py +0 -0
  83. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  84. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  85. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  86. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/server.py +0 -0
  87. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/source_base.py +0 -0
  88. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify/utils.py +0 -0
  89. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify.egg-info/dependency_links.txt +0 -0
  90. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify.egg-info/entry_points.txt +0 -0
  91. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify.egg-info/requires.txt +0 -0
  92. {ingestify-0.7.0 → ingestify-0.9.0}/ingestify.egg-info/top_level.txt +0 -0
  93. {ingestify-0.7.0 → ingestify-0.9.0}/setup.cfg +0 -0
  94. {ingestify-0.7.0 → ingestify-0.9.0}/setup.py +0 -0
{ingestify-0.7.0/ingestify.egg-info → ingestify-0.9.0}/PKG-INFO
@@ -1,12 +1,29 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: ingestify
-Version: 0.7.0
+Version: 0.9.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
 License: AGPL
 Description-Content-Type: text/markdown
+Requires-Dist: requests<3,>=2.0.0
+Requires-Dist: SQLAlchemy<3,>=2
+Requires-Dist: click>=8
+Requires-Dist: python-dotenv
+Requires-Dist: pyaml_env
+Requires-Dist: boto3
+Requires-Dist: pydantic>=2.0.0
 Provides-Extra: test
+Requires-Dist: pytest<7,>=6.2.5; extra == "test"
+Requires-Dist: pytz; extra == "test"
+Dynamic: author
+Dynamic: author-email
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: summary
 
 # Ingestify
 
@@ -68,6 +85,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```
 
+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`
 
 ```yaml
@@ -166,8 +220,16 @@ pip install kloppy
 ```
 
 ```python
+import logging, sys
+
 from ingestify.main import get_engine
 
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -179,12 +241,13 @@ dataset_iter = engine.iter_datasets(
 
     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43,  # "FIFA World Cup"
+    #season_id=281
 )
 
 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```
 
 
ingestify-0.7.0/PKG-INFO → ingestify-0.9.0/README.md
@@ -1,13 +1,3 @@
-Metadata-Version: 2.1
-Name: ingestify
-Version: 0.7.0
-Summary: Data Ingestion Framework
-Author: Koen Vossen
-Author-email: info@koenvossen.nl
-License: AGPL
-Description-Content-Type: text/markdown
-Provides-Extra: test
-
 # Ingestify
 
 _Ingest everything – JSON, CSV, tracking ZIPs, even MP4 – keep it version‑safe, sync only what changed, and analyse while you ingest._
@@ -68,6 +58,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```
 
+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`
 
 ```yaml
@@ -166,8 +193,16 @@ pip install kloppy
 ```
 
 ```python
+import logging, sys
+
 from ingestify.main import get_engine
 
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -179,12 +214,13 @@ dataset_iter = engine.iter_datasets(
 
     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43,  # "FIFA World Cup"
+    #season_id=281
 )
 
 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```
 
 
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/__init__.py
@@ -7,5 +7,6 @@ except NameError:
 if not __INGESTIFY_SETUP__:
     from .infra import retrieve_http
     from .source_base import Source, DatasetResource
+    from .main import debug_source
 
-__version__ = "0.7.0"
+__version__ = "0.9.0"
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/ingestion_engine.py
@@ -110,6 +110,9 @@ class IngestionEngine:
         else:
             do_load()
 
+    # Alias for load() - more intuitive name for running ingestion
+    run = load
+
     def list_datasets(self, as_count: bool = False):
        """Consider moving this to DataStore"""
        datasets = sorted(
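
Because `run` is a plain class attribute bound to `load`, both names resolve to the same function object. A quick sketch, assuming `get_engine` returns the `IngestionEngine` above (the URLs are placeholders):

```python
from ingestify.main import get_engine

# Placeholder local store; any configured engine behaves the same.
engine = get_engine(
    metadata_url="sqlite:///catalog.db",
    file_url="file://files/",
)

# `run` and `load` are the same function object on the class,
# so engine.run(...) is interchangeable with engine.load(...).
assert type(engine).run is type(engine).load
```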
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/application/loader.py
@@ -307,7 +307,17 @@ class Loader:
            auto_ingest_config=auto_ingest_config,
            **selector_filters,
        )
-        if selector_filters and not selectors:
-            logger.warning(f"No data found matching {selector_filters}")
+        if (provider or source or dataset_type or selector_filters) and not selectors:
+            filters_applied = {
+                k: v
+                for k, v in {
+                    "provider": provider,
+                    "source": source,
+                    "dataset_type": dataset_type,
+                    **selector_filters,
+                }.items()
+                if v
+            }
+            logger.warning(f"No data found matching filters: {filters_applied}")
         else:
             self.run(selectors, dry_run=dry_run)
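
The warning now reports every filter that was actually supplied, with unset (falsy) values dropped by the comprehension. A standalone illustration of that filtering step, with made-up filter values:

```python
# Illustrative inputs only.
provider, source, dataset_type = "statsbomb", None, "match"
selector_filters = {"competition_id": 43}

filters_applied = {
    k: v
    for k, v in {
        "provider": provider,
        "source": source,
        "dataset_type": dataset_type,
        **selector_filters,
    }.items()
    if v  # drop filters that were not supplied
}
print(filters_applied)
# {'provider': 'statsbomb', 'dataset_type': 'match', 'competition_id': 43}
```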
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/dataset_state.py
@@ -10,6 +10,7 @@ class DatasetState(str, Enum):
     SCHEDULED = "SCHEDULED"
     PARTIAL = "PARTIAL"
     COMPLETE = "COMPLETE"
+    MISSING = "MISSING"
 
     @property
     def is_complete(self):
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/dataset/file.py
@@ -39,6 +39,12 @@ class DraftFile(BaseModel):
             stream = BytesIO(file_.read().encode("utf-8"))
         elif isinstance(file_, BytesIO):
             stream = file_
+        elif hasattr(file_, "read"):
+            data = file_.read()
+            if isinstance(data, bytes):
+                stream = BytesIO(data)
+            else:
+                stream = BytesIO(data.encode("utf-8"))
         else:
             raise Exception(f"Not possible to create DraftFile from {type(file_)}")
 
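
The new branch accepts any file-like object exposing `read()`, normalizing both text and binary reads to a `BytesIO`. A self-contained sketch of that normalization rule (the helper name `to_bytes_stream` is hypothetical, independent of `DraftFile` itself):

```python
from io import BytesIO, StringIO

def to_bytes_stream(file_) -> BytesIO:
    # Mirrors the dispatch added above: anything with read() is accepted,
    # and str payloads are encoded to UTF-8 bytes.
    data = file_.read()
    if isinstance(data, bytes):
        return BytesIO(data)
    return BytesIO(data.encode("utf-8"))

assert to_bytes_stream(StringIO("{}")).read() == b"{}"
assert to_bytes_stream(BytesIO(b"{}")).read() == b"{}"
```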
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/ingestion/ingestion_job.py
@@ -129,7 +129,6 @@ class UpdateDatasetTask(Task):
         with TaskSummary.update(
             self.task_id, dataset_identifier=dataset_identifier
         ) as task_summary:
-
             files = {
                 file_id: task_summary.record_load_file(
                     lambda: load_file(file_resource, dataset=self.dataset),
@@ -138,6 +137,8 @@
                 for file_id, file_resource in self.dataset_resource.files.items()
             }
 
+            self.dataset_resource.run_post_load_files(files)
+
             try:
                 revision = self.store.update_dataset(
                     dataset=self.dataset,
@@ -181,6 +182,9 @@ class CreateDatasetTask(Task):
                )
                for file_id, file_resource in self.dataset_resource.files.items()
             }
+
+            self.dataset_resource.run_post_load_files(files)
+
             try:
                 revision = self.store.create_dataset(
                     dataset_type=self.dataset_resource.dataset_type,
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/domain/models/resources/dataset_resource.py
@@ -1,5 +1,5 @@
 from datetime import datetime
-from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING # noqa
+from typing import Optional, Callable, Any, Protocol, TYPE_CHECKING, Dict # noqa
 from pydantic import Field
 
 from ingestify.domain.models.base import BaseModel
@@ -50,6 +50,18 @@ class DatasetResource(BaseModel):
     metadata: dict = Field(default_factory=dict)
     state: DatasetState = Field(default_factory=lambda: DatasetState.COMPLETE)
     files: dict[str, FileResource] = Field(default_factory=dict)
+    post_load_files: Optional[
+        Callable[["DatasetResource", Dict[str, DraftFile]], None]
+    ] = None
+
+    def run_post_load_files(self, files: Dict[str, DraftFile]):
+        """Hook to modify dataset attributes based on loaded file content.
+
+        Useful for setting state based on file content, e.g., keep state=SCHEDULED
+        when files contain '{}', change to COMPLETE when they contain actual data.
+        """
+        if self.post_load_files:
+            self.post_load_files(self, files)
 
     def add_file(
         self,
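
Following the docstring above, a hedged sketch of a `post_load_files` callback that flips the dataset state once the provider returns real data. The `.stream.getvalue()` access is an assumption about `DraftFile` internals; adapt it to the real attribute:

```python
from ingestify.domain.models.dataset.dataset_state import DatasetState

def mark_state_from_content(resource, files):
    # `files` maps file_id -> DraftFile (see run_post_load_files above).
    # Reading the payload via draft.stream.getvalue() is illustrative only.
    payloads = [draft.stream.getvalue() for draft in files.values()]
    if payloads and all(p == b"{}" for p in payloads):
        resource.state = DatasetState.SCHEDULED  # provider has no data yet
    else:
        resource.state = DatasetState.COMPLETE

# Attach the hook when a Source builds its DatasetResource:
#   dataset_resource.post_load_files = mark_state_from_content
```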
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/fetch/http.py
@@ -58,9 +58,9 @@ def retrieve_http(
        )
        # else:
        #     print(f"{current_file.modified_at=} {last_modified=}")
-        # headers["if-modified-since"] = (
-        #     format_datetime(current_file.modified_at, usegmt=True),
-        # )
+        headers["if-modified-since"] = (
+            format_datetime(current_file.modified_at, usegmt=True),
+        )
        headers["if-none-match"] = current_file.tag
 
    http_kwargs = {}
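
This re-enables HTTP conditional requests: with `if-modified-since` and `if-none-match` set, the server can answer `304 Not Modified` and Ingestify can skip re-downloading unchanged files. A minimal standalone sketch of the mechanism, with made-up cached values and URL:

```python
from datetime import datetime, timezone
from email.utils import format_datetime

import requests

# Illustrative cached metadata for a previously fetched file.
cached_modified_at = datetime(2024, 1, 1, tzinfo=timezone.utc)
cached_etag = '"abc123"'

headers = {
    "if-modified-since": format_datetime(cached_modified_at, usegmt=True),
    "if-none-match": cached_etag,
}

response = requests.get("https://example.com/data.json", headers=headers)
if response.status_code == 304:
    print("Not modified - keep the cached copy")
```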
{ingestify-0.7.0 → ingestify-0.9.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py
@@ -40,15 +40,7 @@ from ingestify.domain.models.task.task_summary import TaskSummary
 from ingestify.exceptions import IngestifyError
 from ingestify.utils import get_concurrency
 
-from .tables import (
-    metadata,
-    dataset_table,
-    file_table,
-    revision_table,
-    ingestion_job_summary_table,
-    task_summary_table,
-    store_version_table,
-)
+from .tables import get_tables
 
 logger = logging.getLogger(__name__)
 
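
The module-level table objects are replaced by a `get_tables(prefix)` factory, defined in the new `tables.py` (not shown in full here). Its implementation isn't visible in this diff; a plausible sketch of such a factory, with illustrative table and column names only:

```python
from sqlalchemy import Column, DateTime, MetaData, String, Table

def get_tables(table_prefix: str = "") -> dict:
    # Fresh MetaData per call so differently-prefixed stores can coexist
    # in the same database. The table name below is an assumption.
    metadata = MetaData()
    dataset_table = Table(
        f"{table_prefix}dataset",
        metadata,
        Column("dataset_id", String(255), primary_key=True),
        Column("bucket", String(255)),
        Column("created_at", DateTime),
        # ... remaining columns and tables elided; see
        # ingestify/infra/store/dataset/sqlalchemy/tables.py
    )
    return {"metadata": metadata, "dataset_table": dataset_table}
```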
@@ -112,20 +104,33 @@ class SqlAlchemySessionProvider:
         session_factory = sessionmaker(bind=self.engine)
         self.session = scoped_session(session_factory)
 
+        # Create tables with the specified prefix
+        tables = get_tables(self.table_prefix)
+        self.metadata = tables["metadata"]
+        self.dataset_table = tables["dataset_table"]
+        self.revision_table = tables["revision_table"]
+        self.file_table = tables["file_table"]
+        self.ingestion_job_summary_table = tables["ingestion_job_summary_table"]
+        self.task_summary_table = tables["task_summary_table"]
+        self.store_version_table = tables["store_version_table"]
+
     def __getstate__(self):
-        return {"url": self.url}
+        return {"url": self.url, "table_prefix": self.table_prefix}
 
     def __setstate__(self, state):
         self.url = state["url"]
+        self.table_prefix = state.get("table_prefix", "")
         self._init_engine()
 
-    def __init__(self, url: str):
+    def __init__(self, url: str, table_prefix: str = ""):
         url = self.fix_url(url)
 
         self.url = url
+        self.table_prefix = table_prefix
         self._init_engine()
 
-        metadata.create_all(self.engine)
+        # Create all tables in the database
+        self.metadata.create_all(self.engine)
 
     def __del__(self):
         self.close()
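
With the prefix threaded through `__init__`, pickling, and table creation, two stores can share one database without colliding (this is what the new `test_table_prefix.py` exercises). A usage sketch; the prefix value and file name are illustrative:

```python
from ingestify.infra.store.dataset.sqlalchemy.repository import (
    SqlAlchemySessionProvider,
)

# Two providers sharing one SQLite file, isolated by table prefix.
default_provider = SqlAlchemySessionProvider("sqlite:///catalog.db")
staging_provider = SqlAlchemySessionProvider(
    "sqlite:///catalog.db",
    table_prefix="staging_",  # tables are created with this prefix
)
```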
@@ -154,6 +159,30 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def dialect(self) -> Dialect:
         return self.session_provider.dialect
 
+    @property
+    def dataset_table(self):
+        return self.session_provider.dataset_table
+
+    @property
+    def revision_table(self):
+        return self.session_provider.revision_table
+
+    @property
+    def file_table(self):
+        return self.session_provider.file_table
+
+    @property
+    def ingestion_job_summary_table(self):
+        return self.session_provider.ingestion_job_summary_table
+
+    @property
+    def task_summary_table(self):
+        return self.session_provider.task_summary_table
+
+    @property
+    def store_version_table(self):
+        return self.session_provider.store_version_table
+
     def _upsert(
         self,
         connection: Connection,
@@ -251,13 +280,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
            )
 
            query = query.select_from(
-                dataset_table.join(
+                self.dataset_table.join(
                    dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.dataset_table.c.dataset_id,
                )
            )
        else:
-            query = query.filter(dataset_table.c.dataset_id == dataset_id)
+            query = query.filter(self.dataset_table.c.dataset_id == dataset_id)
 
        dialect = self.dialect.name
 
@@ -287,7 +316,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
        join_conditions = []
        for k in keys:
            if dialect == "postgresql":
-                column = dataset_table.c.identifier[k]
+                column = self.dataset_table.c.identifier[k]
 
                # Take the value from the first selector to determine the type.
                # TODO: check all selectors to determine the type
@@ -297,24 +326,26 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
                else:
                    column = column.as_string()
            else:
-                column = func.json_extract(dataset_table.c.identifier, f"$.{k}")
+                column = func.json_extract(
+                    self.dataset_table.c.identifier, f"$.{k}"
+                )
 
            join_conditions.append(attribute_cte.c[k] == column)
 
        query = query.select_from(
-            dataset_table.join(attribute_cte, and_(*join_conditions))
+            self.dataset_table.join(attribute_cte, and_(*join_conditions))
        )
 
        if where:
            query = query.filter(text(where))
 
-        query = query.filter(dataset_table.c.bucket == bucket)
+        query = query.filter(self.dataset_table.c.bucket == bucket)
        if dataset_type:
-            query = query.filter(dataset_table.c.dataset_type == dataset_type)
+            query = query.filter(self.dataset_table.c.dataset_type == dataset_type)
        if provider:
-            query = query.filter(dataset_table.c.provider == provider)
+            query = query.filter(self.dataset_table.c.provider == provider)
        if dataset_state:
-            query = query.filter(dataset_table.c.state.in_(dataset_state))
+            query = query.filter(self.dataset_table.c.state.in_(dataset_state))
 
        return query
 
@@ -328,23 +359,23 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
        )
 
        dataset_rows = list(
-            self.session.query(dataset_table).select_from(
-                dataset_table.join(
+            self.session.query(self.dataset_table).select_from(
+                self.dataset_table.join(
                    dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == dataset_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.dataset_table.c.dataset_id,
                )
            )
        )
        revisions_per_dataset = {}
        rows = (
-            self.session.query(revision_table)
+            self.session.query(self.revision_table)
            .select_from(
-                revision_table.join(
+                self.revision_table.join(
                    dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == revision_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.revision_table.c.dataset_id,
                )
            )
-            .order_by(revision_table.c.dataset_id)
+            .order_by(self.revision_table.c.dataset_id)
        )
 
        for dataset_id, revisions in itertools.groupby(
@@ -354,14 +385,14 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
        files_per_revision = {}
        rows = (
-            self.session.query(file_table)
+            self.session.query(self.file_table)
            .select_from(
-                file_table.join(
+                self.file_table.join(
                    dataset_ids_cte,
-                    dataset_ids_cte.c.dataset_id == file_table.c.dataset_id,
+                    dataset_ids_cte.c.dataset_id == self.file_table.c.dataset_id,
                )
            )
-            .order_by(file_table.c.dataset_id, file_table.c.revision_id)
+            .order_by(self.file_table.c.dataset_id, self.file_table.c.revision_id)
        )
 
        for (dataset_id, revision_id), files in itertools.groupby(
@@ -425,8 +456,8 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
        if not metadata_only:
            # Apply sorting by created_at in ascending order
            dataset_query = apply_query_filter(
-                self.session.query(dataset_table.c.dataset_id)
-            ).order_by(dataset_table.c.created_at.asc())
+                self.session.query(self.dataset_table.c.dataset_id)
+            ).order_by(self.dataset_table.c.created_at.asc())
 
            # Apply pagination if both page and page_size are provided
            if page is not None and page_size is not None:
@@ -448,9 +479,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
        metadata_result_query = (
            apply_query_filter(
-                self.session.query(dataset_table.c.last_modified_at)
+                self.session.query(self.dataset_table.c.last_modified_at)
            )
-            .order_by(dataset_table.c.last_modified_at.desc())
+            .order_by(self.dataset_table.c.last_modified_at.desc())
            .limit(1)
        )
 
@@ -508,11 +539,16 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
        with self.connect() as connection:
            try:
-                self._upsert(connection, dataset_table, datasets_entities)
+                self._upsert(connection, self.dataset_table, datasets_entities)
                self._upsert(
-                    connection, revision_table, revision_entities, immutable_rows=True
+                    connection,
+                    self.revision_table,
+                    revision_entities,
+                    immutable_rows=True,
+                )
+                self._upsert(
+                    connection, self.file_table, file_entities, immutable_rows=True
                )
-                self._upsert(connection, file_table, file_entities, immutable_rows=True)
            except Exception:
                connection.rollback()
                raise
@@ -569,11 +605,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
            try:
                self._upsert(
                    connection,
-                    ingestion_job_summary_table,
+                    self.ingestion_job_summary_table,
                    ingestion_job_summary_entities,
                )
                if task_summary_entities:
-                    self._upsert(connection, task_summary_table, task_summary_entities)
+                    self._upsert(
+                        connection, self.task_summary_table, task_summary_entities
+                    )
            except Exception:
                connection.rollback()
                raise
@@ -584,13 +622,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
        ingestion_job_summary_ids = [
            row.ingestion_job_summary_id
            for row in self.session.query(
-                ingestion_job_summary_table.c.ingestion_job_summary_id
+                self.ingestion_job_summary_table.c.ingestion_job_summary_id
            )
        ]
 
        ingestion_job_summary_rows = list(
-            self.session.query(ingestion_job_summary_table).filter(
-                ingestion_job_summary_table.c.ingestion_job_summary_id.in_(
+            self.session.query(self.ingestion_job_summary_table).filter(
+                self.ingestion_job_summary_table.c.ingestion_job_summary_id.in_(
                    ingestion_job_summary_ids
                )
            )
@@ -598,13 +636,13 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
        task_summary_entities_per_job_summary = {}
        rows = (
-            self.session.query(task_summary_table)
+            self.session.query(self.task_summary_table)
            .filter(
-                task_summary_table.c.ingestion_job_summary_id.in_(
+                self.task_summary_table.c.ingestion_job_summary_id.in_(
                    ingestion_job_summary_ids
                )
            )
-            .order_by(task_summary_table.c.ingestion_job_summary_id)
+            .order_by(self.task_summary_table.c.ingestion_job_summary_id)
        )
 
        for ingestion_job_summary_id, task_summaries_rows in itertools.groupby(
@@ -636,7 +674,9 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
     def get_store_version(self) -> Optional[str]:
         """Get the current Ingestify version stored for this store."""
         with self.session:
-            row = self.session.query(store_version_table.c.ingestify_version).first()
+            row = self.session.query(
+                self.store_version_table.c.ingestify_version
+            ).first()
             return row.ingestify_version if row else None
 
     def set_store_version(self, version: str):
@@ -653,7 +693,7 @@ class SqlAlchemyDatasetRepository(DatasetRepository):
 
        with self.connect() as connection:
            try:
-                self._upsert(connection, store_version_table, [entity])
+                self._upsert(connection, self.store_version_table, [entity])
                connection.commit()
            except Exception:
                connection.rollback()
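
Together, `get_store_version` and `set_store_version` let a store record which Ingestify version last wrote to it, which the new `test_store_version.py` exercises. A hedged usage sketch; `repository` stands in for a constructed `SqlAlchemyDatasetRepository`:

```python
import ingestify

# Hypothetical version guard around an existing repository instance.
stored = repository.get_store_version()
if stored is None:
    repository.set_store_version(ingestify.__version__)  # first write: stamp the store
elif stored != ingestify.__version__:
    print(f"Store last written by ingestify {stored}, running {ingestify.__version__}")
```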