ingestify 0.7.0.tar.gz → 0.8.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. {ingestify-0.7.0 → ingestify-0.8.0}/PKG-INFO +49 -3
  2. {ingestify-0.7.0 → ingestify-0.8.0}/README.md +48 -2
  3. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/__init__.py +2 -1
  4. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/ingestion_engine.py +3 -0
  5. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/loader.py +12 -2
  6. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/file.py +6 -0
  7. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/main.py +155 -0
  8. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/PKG-INFO +49 -3
  9. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/__init__.py +0 -0
  10. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/dataset_store.py +0 -0
  11. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/secrets_manager.py +0 -0
  12. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/cmdline.py +0 -0
  13. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/__init__.py +0 -0
  14. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/__init__.py +0 -0
  15. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/base.py +0 -0
  16. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/data_spec_version_collection.py +0 -0
  17. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/__init__.py +0 -0
  18. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/collection.py +0 -0
  19. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/collection_metadata.py +0 -0
  20. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/dataset.py +0 -0
  21. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/dataset_repository.py +0 -0
  22. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/dataset_state.py +0 -0
  23. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/events.py +0 -0
  24. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/file_collection.py +0 -0
  25. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/file_repository.py +0 -0
  26. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/identifier.py +0 -0
  27. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/revision.py +0 -0
  28. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/selector.py +0 -0
  29. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/__init__.py +0 -0
  30. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/_old_event.py +0 -0
  31. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/dispatcher.py +0 -0
  32. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/domain_event.py +0 -0
  33. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/event_bus.py +0 -0
  34. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/publisher.py +0 -0
  35. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/event/subscriber.py +0 -0
  36. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/fetch_policy.py +0 -0
  37. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/ingestion/__init__.py +0 -0
  38. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/ingestion/ingestion_job.py +0 -0
  39. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/ingestion/ingestion_job_summary.py +0 -0
  40. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/ingestion/ingestion_plan.py +0 -0
  41. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/resources/__init__.py +0 -0
  42. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/resources/dataset_resource.py +0 -0
  43. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/sink.py +0 -0
  44. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/source.py +0 -0
  45. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/task/__init__.py +0 -0
  46. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/task/set.py +0 -0
  47. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/task/task.py +0 -0
  48. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/task/task_summary.py +0 -0
  49. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/timing.py +0 -0
  50. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/services/__init__.py +0 -0
  51. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/services/identifier_key_transformer.py +0 -0
  52. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/services/transformers/__init__.py +0 -0
  53. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/services/transformers/kloppy_to_pandas.py +0 -0
  54. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/exceptions.py +0 -0
  55. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/__init__.py +0 -0
  56. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/fetch/__init__.py +0 -0
  57. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/fetch/http.py +0 -0
  58. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/serialization/__init__.py +0 -0
  59. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/sink/__init__.py +0 -0
  60. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/sink/postgresql.py +0 -0
  61. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/source/__init__.py +0 -0
  62. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/source/statsbomb/__init__.py +0 -0
  63. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/source/statsbomb/base.py +0 -0
  64. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/source/statsbomb/match.py +0 -0
  65. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/source/statsbomb_github.py +0 -0
  66. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/__init__.py +0 -0
  67. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/dataset/__init__.py +0 -0
  68. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/dataset/sqlalchemy/__init__.py +0 -0
  69. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/dataset/sqlalchemy/repository.py +0 -0
  70. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/dataset/sqlalchemy/tables.py +0 -0
  71. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/file/__init__.py +0 -0
  72. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/file/dummy_file_repository.py +0 -0
  73. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/file/local_file_repository.py +0 -0
  74. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/infra/store/file/s3_file_repository.py +0 -0
  75. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/server.py +0 -0
  76. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/source_base.py +0 -0
  77. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify/utils.py +0 -0
  78. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/SOURCES.txt +0 -0
  79. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/dependency_links.txt +0 -0
  80. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/entry_points.txt +0 -0
  81. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/requires.txt +0 -0
  82. {ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/top_level.txt +0 -0
  83. {ingestify-0.7.0 → ingestify-0.8.0}/setup.cfg +0 -0
  84. {ingestify-0.7.0 → ingestify-0.8.0}/setup.py +0 -0
{ingestify-0.7.0 → ingestify-0.8.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.7.0
+Version: 0.8.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -68,6 +68,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```

+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`

 ```yaml
@@ -166,8 +203,16 @@ pip install kloppy
 ```

 ```python
+import logging, sys
+
 from ingestify.main import get_engine

+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -179,12 +224,13 @@ dataset_iter = engine.iter_datasets(

     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43, # "FIFA World Cup"
+    #season_id=281
 )

 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```

{ingestify-0.7.0 → ingestify-0.8.0}/README.md

@@ -58,6 +58,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```

+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`

 ```yaml
@@ -156,8 +193,16 @@ pip install kloppy
 ```

 ```python
+import logging, sys
+
 from ingestify.main import get_engine

+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -169,12 +214,13 @@ dataset_iter = engine.iter_datasets(

     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43, # "FIFA World Cup"
+    #season_id=281
 )

 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```

{ingestify-0.7.0 → ingestify-0.8.0}/ingestify/__init__.py

@@ -7,5 +7,6 @@ except NameError:
 if not __INGESTIFY_SETUP__:
     from .infra import retrieve_http
     from .source_base import Source, DatasetResource
+    from .main import debug_source

-__version__ = "0.7.0"
+__version__ = "0.8.0"
{ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/ingestion_engine.py

@@ -110,6 +110,9 @@ class IngestionEngine:
         else:
             do_load()

+    # Alias for load() - more intuitive name for running ingestion
+    run = load
+
     def list_datasets(self, as_count: bool = False):
         """Consider moving this to DataStore"""
         datasets = sorted(
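The new `run = load` alias keeps existing `engine.load(...)` call sites working while letting newer code (including `debug_source()` in `main.py` below) say `engine.run()`. A minimal, hedged sketch of the equivalence; the sqlite/file URLs are placeholders in the style of the README example, not values taken from this diff:

```python
from ingestify.main import get_engine

# Placeholder local storage, mirroring the README example.
engine = get_engine(
    metadata_url="sqlite:///dev_catalog.db",
    file_url="file://./dev_files/",
)

# In 0.8.0 both names invoke the same ingestion entry point (run = load).
engine.run()
# engine.load()  # unchanged, still works
```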
{ingestify-0.7.0 → ingestify-0.8.0}/ingestify/application/loader.py

@@ -307,7 +307,17 @@ class Loader:
             auto_ingest_config=auto_ingest_config,
             **selector_filters,
         )
-        if selector_filters and not selectors:
-            logger.warning(f"No data found matching {selector_filters}")
+        if (provider or source or dataset_type or selector_filters) and not selectors:
+            filters_applied = {
+                k: v
+                for k, v in {
+                    "provider": provider,
+                    "source": source,
+                    "dataset_type": dataset_type,
+                    **selector_filters,
+                }.items()
+                if v
+            }
+            logger.warning(f"No data found matching filters: {filters_applied}")
         else:
             self.run(selectors, dry_run=dry_run)
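The warning now also fires when only `provider`, `source`, or `dataset_type` is set, and only the filters that were actually given are echoed back. A standalone sketch of the same dict-comprehension idiom; the sample values are made up for illustration:

```python
# Falsy filters (None, "", {}) are dropped before logging, so the warning
# only lists what was actually set.
provider = "statsbomb"
source = None
dataset_type = "match"
selector_filters = {"competition_id": 43}

filters_applied = {
    k: v
    for k, v in {
        "provider": provider,
        "source": source,
        "dataset_type": dataset_type,
        **selector_filters,
    }.items()
    if v
}
print(filters_applied)
# {'provider': 'statsbomb', 'dataset_type': 'match', 'competition_id': 43}
```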
{ingestify-0.7.0 → ingestify-0.8.0}/ingestify/domain/models/dataset/file.py

@@ -39,6 +39,12 @@ class DraftFile(BaseModel):
             stream = BytesIO(file_.read().encode("utf-8"))
         elif isinstance(file_, BytesIO):
             stream = file_
+        elif hasattr(file_, "read"):
+            data = file_.read()
+            if isinstance(data, bytes):
+                stream = BytesIO(data)
+            else:
+                stream = BytesIO(data.encode("utf-8"))
         else:
             raise Exception(f"Not possible to create DraftFile from {type(file_)}")

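With the new `elif hasattr(file_, "read")` branch, a `DraftFile` can apparently be built from any file-like object, not only `str` or `BytesIO`. A standalone sketch of the same coercion, kept outside the real `DraftFile` class (the helper name `to_stream` is ours, not ingestify's):

```python
from io import BytesIO, StringIO


def to_stream(file_) -> BytesIO:
    # Mirrors the new branch: anything exposing .read() is coerced to BytesIO,
    # with text payloads encoded as UTF-8.
    data = file_.read()
    if isinstance(data, bytes):
        return BytesIO(data)
    return BytesIO(data.encode("utf-8"))


assert to_stream(BytesIO(b"raw bytes")).read() == b"raw bytes"
assert to_stream(StringIO("text payload")).read() == b"text payload"
```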
{ingestify-0.7.0 → ingestify-0.8.0}/ingestify/main.py

@@ -279,3 +279,158 @@ def get_engine(
     ingestion_engine.add_ingestion_plan(ingestion_plan_)

     return ingestion_engine
+
+
+def get_dev_engine(
+    source: Source,
+    dataset_type: str,
+    data_spec_versions: dict,
+    ephemeral: bool = True,
+    configure_logging: bool = True,
+    dev_dir: Optional[str] = None,
+) -> IngestionEngine:
+    """
+    Quick development helper - creates an engine with minimal setup.
+
+    Args:
+        source: The source to test
+        dataset_type: Dataset type to ingest
+        data_spec_versions: Dict like {"hops": "v1"}
+        ephemeral: If True, uses temp dir that gets cleaned. If False, uses persistent /tmp storage.
+        configure_logging: If True, configures basic logging (default: True)
+        dev_dir: Optional custom directory for data storage (overrides ephemeral)
+
+    Returns:
+        IngestionEngine configured for development
+
+    Example:
+        >>> source = MySource(name="test", ...)
+        >>> engine = get_dev_engine(source, "hops", {"hops": "v1"})
+        >>> engine.run()
+        >>>
+        >>> # Access the datasets
+        >>> datasets = engine.store.get_dataset_collection()
+        >>> print(f"Ingested {len(datasets)} datasets")
+    """
+    import tempfile
+    from pathlib import Path
+
+    if configure_logging:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+        )
+
+    if dev_dir:
+        # Use provided directory
+        dev_dir = Path(dev_dir)
+    elif ephemeral:
+        # Use temp directory that will be cleaned up
+        import uuid
+
+        dev_dir = Path(tempfile.gettempdir()) / f"ingestify-dev-{uuid.uuid4().hex[:8]}"
+    else:
+        # Use persistent directory
+        dev_dir = Path(tempfile.gettempdir()) / "ingestify-dev"
+
+    dev_dir.mkdir(parents=True, exist_ok=True)
+    metadata_url = f"sqlite:///{dev_dir / 'database.db'}"
+    file_url = f"file://{dev_dir}"
+
+    logger.info(f"Dev mode: storing data in {dev_dir}")
+
+    engine = get_engine(
+        metadata_url=metadata_url,
+        file_url=file_url,
+        bucket="main",
+        disable_events=True,
+    )
+
+    data_spec_versions_obj = DataSpecVersionCollection.from_dict(data_spec_versions)
+
+    engine.add_ingestion_plan(
+        IngestionPlan(
+            source=source,
+            dataset_type=dataset_type,
+            selectors=[Selector.build({}, data_spec_versions=data_spec_versions_obj)],
+            fetch_policy=FetchPolicy(),
+            data_spec_versions=data_spec_versions_obj,
+        )
+    )
+
+    return engine
+
+
+def debug_source(
+    source: Source,
+    *,
+    dataset_type: str,
+    data_spec_versions: dict,
+    ephemeral: bool = True,
+    configure_logging: bool = True,
+    dev_dir: Optional[str] = None,
+    **kwargs,
+) -> IngestionEngine:
+    """
+    Debug helper - creates a dev engine, runs ingestion, and shows results.
+
+    This is a convenience wrapper around get_dev_engine() that does everything:
+    creates the engine, runs ingestion, and displays results.
+
+    Args:
+        source: The source to debug
+        dataset_type: Dataset type (e.g., "match")
+        data_spec_versions: Dict like {"match": "v1"} - explicit, no defaults!
+        ephemeral: If True, uses temp dir. If False, uses persistent /tmp storage.
+        configure_logging: If True, configures basic logging (default: True)
+        dev_dir: Optional custom directory for data storage (overrides ephemeral)
+        **kwargs: Selector arguments. For sources with discover_selectors(), these
+            filter discovered selectors. Otherwise passed to find_datasets().
+
+    Returns:
+        IngestionEngine: The engine used for ingestion (for further inspection)
+
+    Example:
+        >>> # Simple source without discover_selectors
+        >>> source = StatsBombHOPSS3(name="test", s3_bucket="my-bucket", s3_prefix="HOPS")
+        >>> engine = debug_source(source, dataset_type="hops", data_spec_versions={"hops": "v1"})
+
+        >>> # Source with discover_selectors - discovers all competitions
+        >>> source = StatsBombMatchAPI(name="test", ...)
+        >>> engine = debug_source(
+        ...     source,
+        ...     dataset_type="match",
+        ...     data_spec_versions={"match": "v6"}
+        ... )
+
+        >>> # Filter discovered selectors
+        >>> engine = debug_source(
+        ...     source,
+        ...     dataset_type="match",
+        ...     data_spec_versions={"match": "v6"},
+        ...     competition_id=46  # Filters to specific competition
+        ... )
+    """
+    logger.info(f"Debug mode for source: {source.name}")
+
+    engine = get_dev_engine(
+        source=source,
+        dataset_type=dataset_type,
+        data_spec_versions=data_spec_versions,
+        ephemeral=ephemeral,
+        configure_logging=configure_logging,
+        dev_dir=dev_dir,
+    )
+
+    # Run ingestion
+    # Empty selector {} automatically triggers discover_selectors() if available
+    # kwargs filter discovered selectors or are passed to find_datasets()
+    engine.run(**kwargs)
+
+    # Show results
+    datasets = engine.store.get_dataset_collection()
+    logger.info("=" * 60)
+    logger.info(f"✓ Ingestion complete: {len(datasets)} dataset(s)")
+    logger.info("=" * 60)
+
+    return engine
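For reference, `get_dev_engine()` derives both storage URLs from `dev_dir`, so a non-ephemeral debug run can later be reopened with the regular `get_engine()`. A small runnable sketch of that layout (it only builds the URLs; it does not create an engine):

```python
import tempfile
from pathlib import Path

# The non-ephemeral default location used by get_dev_engine(ephemeral=False).
dev_dir = Path(tempfile.gettempdir()) / "ingestify-dev"

# Same derivation as in get_dev_engine(): a SQLite catalog plus a file store
# rooted in the same directory.
metadata_url = f"sqlite:///{dev_dir / 'database.db'}"
file_url = f"file://{dev_dir}"

print(metadata_url)  # e.g. sqlite:////tmp/ingestify-dev/database.db
print(file_url)      # e.g. file:///tmp/ingestify-dev
```

These are the same `metadata_url`/`file_url` keyword arguments `get_engine()` accepts in the README example above.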
{ingestify-0.7.0 → ingestify-0.8.0}/ingestify.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ingestify
-Version: 0.7.0
+Version: 0.8.0
 Summary: Data Ingestion Framework
 Author: Koen Vossen
 Author-email: info@koenvossen.nl
@@ -68,6 +68,43 @@ Ingestify fixes that by building **your own data lake** of untouched provider fi
 pip install ingestify # or: pip install git+https://github.com/PySport/ingestify.git
 ```

+### Developing a new Source
+
+When developing a new `Source`, use the `debug_source()` helper for rapid iteration:
+
+```python
+from ingestify import Source, debug_source
+
+class MyCustomSource(Source):
+    provider = "my_provider"
+
+    def __init__(self, name: str, api_key: str):
+        super().__init__(name)
+        self.api_key = api_key
+
+    def find_datasets(self, dataset_type, data_spec_versions, **kwargs):
+        # Your source implementation
+        ...
+
+# Quick debug - runs full ingestion with temp storage
+if __name__ == "__main__":
+    source = MyCustomSource(name="test", api_key="...")
+
+    debug_source(
+        source,
+        dataset_type="match",
+        data_spec_versions={"events": "v1"},
+    )
+```
+
+The `debug_source()` helper:
+- ✅ Creates an ephemeral dev engine with temp storage
+- ✅ Configures logging automatically
+- ✅ Runs the full ingestion cycle
+- ✅ Shows storage location and results
+
+Perfect for testing your source before adding it to production config!
+
 ### Minimal `config.yaml`

 ```yaml
@@ -166,8 +203,16 @@ pip install kloppy
 ```

 ```python
+import logging, sys
+
 from ingestify.main import get_engine

+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+    stream=sys.stderr,
+)
+
 engine = get_engine(
     metadata_url="sqlite:///database_open_data/catalog.db",
     file_url="file://database_open_data/files/"
@@ -179,12 +224,13 @@ dataset_iter = engine.iter_datasets(

     provider="statsbomb",
     dataset_type="match",
-    competition_id=43,
-    season_id=281
+    competition_id=43, # "FIFA World Cup"
+    #season_id=281
 )

 for dataset in dataset_iter:
     kloppy_dataset = engine.load_dataset_with_kloppy(dataset)
+    logging.info(f"Loaded {kloppy_dataset}")
 ```
