sibi-flux 2025.12.0 (sibi_flux-2025.12.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. sibi_dst/__init__.py +44 -0
  2. sibi_flux/__init__.py +49 -0
  3. sibi_flux/artifacts/__init__.py +7 -0
  4. sibi_flux/artifacts/base.py +166 -0
  5. sibi_flux/artifacts/parquet.py +360 -0
  6. sibi_flux/artifacts/parquet_engine/__init__.py +5 -0
  7. sibi_flux/artifacts/parquet_engine/executor.py +204 -0
  8. sibi_flux/artifacts/parquet_engine/manifest.py +101 -0
  9. sibi_flux/artifacts/parquet_engine/planner.py +544 -0
  10. sibi_flux/conf/settings.py +131 -0
  11. sibi_flux/core/__init__.py +5 -0
  12. sibi_flux/core/managed_resource/__init__.py +3 -0
  13. sibi_flux/core/managed_resource/_managed_resource.py +733 -0
  14. sibi_flux/core/type_maps/__init__.py +100 -0
  15. sibi_flux/dask_cluster/__init__.py +47 -0
  16. sibi_flux/dask_cluster/async_core.py +27 -0
  17. sibi_flux/dask_cluster/client_manager.py +549 -0
  18. sibi_flux/dask_cluster/core.py +322 -0
  19. sibi_flux/dask_cluster/exceptions.py +34 -0
  20. sibi_flux/dask_cluster/utils.py +49 -0
  21. sibi_flux/datacube/__init__.py +3 -0
  22. sibi_flux/datacube/_data_cube.py +332 -0
  23. sibi_flux/datacube/config_engine.py +152 -0
  24. sibi_flux/datacube/field_factory.py +48 -0
  25. sibi_flux/datacube/field_registry.py +122 -0
  26. sibi_flux/datacube/generator.py +677 -0
  27. sibi_flux/datacube/orchestrator.py +171 -0
  28. sibi_flux/dataset/__init__.py +3 -0
  29. sibi_flux/dataset/_dataset.py +162 -0
  30. sibi_flux/df_enricher/__init__.py +56 -0
  31. sibi_flux/df_enricher/async_enricher.py +201 -0
  32. sibi_flux/df_enricher/merger.py +253 -0
  33. sibi_flux/df_enricher/specs.py +45 -0
  34. sibi_flux/df_enricher/types.py +12 -0
  35. sibi_flux/df_helper/__init__.py +5 -0
  36. sibi_flux/df_helper/_df_helper.py +450 -0
  37. sibi_flux/df_helper/backends/__init__.py +34 -0
  38. sibi_flux/df_helper/backends/_params.py +173 -0
  39. sibi_flux/df_helper/backends/_strategies.py +295 -0
  40. sibi_flux/df_helper/backends/http/__init__.py +5 -0
  41. sibi_flux/df_helper/backends/http/_http_config.py +122 -0
  42. sibi_flux/df_helper/backends/parquet/__init__.py +7 -0
  43. sibi_flux/df_helper/backends/parquet/_parquet_options.py +268 -0
  44. sibi_flux/df_helper/backends/sqlalchemy/__init__.py +9 -0
  45. sibi_flux/df_helper/backends/sqlalchemy/_db_connection.py +256 -0
  46. sibi_flux/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  47. sibi_flux/df_helper/backends/sqlalchemy/_io_dask.py +386 -0
  48. sibi_flux/df_helper/backends/sqlalchemy/_load_from_db.py +134 -0
  49. sibi_flux/df_helper/backends/sqlalchemy/_model_registry.py +239 -0
  50. sibi_flux/df_helper/backends/sqlalchemy/_sql_model_builder.py +42 -0
  51. sibi_flux/df_helper/backends/utils.py +32 -0
  52. sibi_flux/df_helper/core/__init__.py +15 -0
  53. sibi_flux/df_helper/core/_defaults.py +104 -0
  54. sibi_flux/df_helper/core/_filter_handler.py +617 -0
  55. sibi_flux/df_helper/core/_params_config.py +185 -0
  56. sibi_flux/df_helper/core/_query_config.py +17 -0
  57. sibi_flux/df_validator/__init__.py +3 -0
  58. sibi_flux/df_validator/_df_validator.py +222 -0
  59. sibi_flux/logger/__init__.py +1 -0
  60. sibi_flux/logger/_logger.py +480 -0
  61. sibi_flux/mcp/__init__.py +26 -0
  62. sibi_flux/mcp/client.py +150 -0
  63. sibi_flux/mcp/router.py +126 -0
  64. sibi_flux/orchestration/__init__.py +9 -0
  65. sibi_flux/orchestration/_artifact_orchestrator.py +346 -0
  66. sibi_flux/orchestration/_pipeline_executor.py +212 -0
  67. sibi_flux/osmnx_helper/__init__.py +22 -0
  68. sibi_flux/osmnx_helper/_pbf_handler.py +384 -0
  69. sibi_flux/osmnx_helper/graph_loader.py +225 -0
  70. sibi_flux/osmnx_helper/utils.py +100 -0
  71. sibi_flux/pipelines/__init__.py +3 -0
  72. sibi_flux/pipelines/base.py +218 -0
  73. sibi_flux/py.typed +0 -0
  74. sibi_flux/readers/__init__.py +3 -0
  75. sibi_flux/readers/base.py +82 -0
  76. sibi_flux/readers/parquet.py +106 -0
  77. sibi_flux/utils/__init__.py +53 -0
  78. sibi_flux/utils/boilerplate/__init__.py +19 -0
  79. sibi_flux/utils/boilerplate/base_attacher.py +45 -0
  80. sibi_flux/utils/boilerplate/base_cube_router.py +283 -0
  81. sibi_flux/utils/boilerplate/base_data_cube.py +132 -0
  82. sibi_flux/utils/boilerplate/base_pipeline_template.py +54 -0
  83. sibi_flux/utils/boilerplate/hybrid_data_loader.py +193 -0
  84. sibi_flux/utils/clickhouse_writer/__init__.py +6 -0
  85. sibi_flux/utils/clickhouse_writer/_clickhouse_writer.py +225 -0
  86. sibi_flux/utils/common.py +7 -0
  87. sibi_flux/utils/credentials/__init__.py +3 -0
  88. sibi_flux/utils/credentials/_config_manager.py +155 -0
  89. sibi_flux/utils/dask_utils.py +14 -0
  90. sibi_flux/utils/data_utils/__init__.py +3 -0
  91. sibi_flux/utils/data_utils/_data_utils.py +389 -0
  92. sibi_flux/utils/dataframe_utils.py +52 -0
  93. sibi_flux/utils/date_utils/__init__.py +10 -0
  94. sibi_flux/utils/date_utils/_business_days.py +220 -0
  95. sibi_flux/utils/date_utils/_date_utils.py +311 -0
  96. sibi_flux/utils/date_utils/_file_age_checker.py +319 -0
  97. sibi_flux/utils/file_utils.py +48 -0
  98. sibi_flux/utils/filepath_generator/__init__.py +5 -0
  99. sibi_flux/utils/filepath_generator/_filepath_generator.py +185 -0
  100. sibi_flux/utils/parquet_saver/__init__.py +6 -0
  101. sibi_flux/utils/parquet_saver/_parquet_saver.py +436 -0
  102. sibi_flux/utils/parquet_saver/_write_gatekeeper.py +33 -0
  103. sibi_flux/utils/retry.py +46 -0
  104. sibi_flux/utils/storage/__init__.py +7 -0
  105. sibi_flux/utils/storage/_fs_registry.py +112 -0
  106. sibi_flux/utils/storage/_storage_manager.py +257 -0
  107. sibi_flux/utils/storage/factory.py +33 -0
  108. sibi_flux-2025.12.0.dist-info/METADATA +283 -0
  109. sibi_flux-2025.12.0.dist-info/RECORD +110 -0
  110. sibi_flux-2025.12.0.dist-info/WHEEL +4 -0
sibi_flux/dask_cluster/core.py
@@ -0,0 +1,322 @@
+"""
+Core functionality for resilient Dask operations.
+- Dry Run: Graph complexity inspection and logging to OpenObserve.
+- Resilience: Auto-healing via persistent client registry.
+- Invariants: Strict prohibition of local fallback for Dask DataFrames.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from typing import Any, Callable, Dict, List, Optional, TypeVar
+
+import dask
+import dask.dataframe as dd
+import pandas as pd
+try:
+    from dask.distributed import Client, Future
+    from dask.distributed import wait as dask_wait
+except ImportError:
+    Client = object
+    Future = object
+    def dask_wait(*args, **kwargs):
+        pass
+
+# Project-specific imports
+from .client_manager import get_persistent_client
+from .exceptions import RECOVERABLE_COMMS
+from .utils import _to_int_safe
+
+T = TypeVar("T")
+
+# ---------------------------------------------------------------------------
+# Late-Binding & Helpers
+# ---------------------------------------------------------------------------
+
+
+def _get_log():
+    """Late-binds the Logger to prevent circular imports during init."""
+    try:
+        from sibi_flux.logger import Logger
+
+        return Logger.default_logger(logger_name="dask_cluster.core")
+    except ImportError:
+        return logging.getLogger("dask_cluster.core")
+
+
+def _is_dask_dataframe_like(obj: Any) -> bool:
+    """Checks if object is a Dask collection relying on distributed state."""
+    return isinstance(obj, (dd.DataFrame, dd.Series)) or hasattr(obj, "_meta")
+
+
+def _get_active_client(
+    provided_client: Optional[Client], logger=None
+) -> Optional[Client]:
+    """Retrieves a healthy client, healing the persistent one if necessary."""
+    if provided_client and provided_client.status == "running":
+        return provided_client
+    try:
+        # get_persistent_client handles internal healing/watchdog logic
+        return get_persistent_client(logger=logger)
+    except Exception:
+        return None
+
+
+# ---------------------------------------------------------------------------
+# Inspection & Dry Run
+# ---------------------------------------------------------------------------
+
+
+def get_graph_metrics(obj: Any) -> Dict[str, Any]:
+    """Extract complexity metrics from a Dask object for observability."""
+    try:
+        # If it's a list (e.g., for safe_gather), check if any item is dask-backed
+        if isinstance(obj, list):
+            obj = obj[0] if obj and hasattr(obj[0], "__dask_graph__") else None
+
+        if obj is None or not hasattr(obj, "__dask_graph__"):
+            return {"is_dask": False}
+
+        graph = obj.__dask_graph__()
+        return {
+            "type": type(obj).__name__,
+            "is_dask": True,
+            "task_count": len(graph),
+            "n_partitions": getattr(obj, "npartitions", "N/A"),
+            "layers": (
+                len(getattr(graph, "layers", []))
+                if hasattr(graph, "layers")
+                else "unknown"
+            ),
+        }
+    except Exception as e:
+        return {"error_extracting_metrics": str(e)}
+
+
+# ---------------------------------------------------------------------------
+# Resilient Execution Engine
+# ---------------------------------------------------------------------------
+
+
+def _execute_with_resilience(
+    op: Callable[..., T],
+    obj: Any,
+    dask_client: Optional[Client],
+    logger=None,
+    dry_run: bool = False,
+    **kwargs,
+) -> Optional[T]:
+    """
+    Orchestrates Dask operations with dry-run logging and a single-retry
+    healing mechanism for communication failures.
+    """
+    log = logger or _get_log()
+
+    # 1. Observability: Log graph complexity to OpenObserve
+    metrics = get_graph_metrics(obj)
+    if metrics.get("is_dask"):
+        log.info(
+            "Dask Graph Inspection",
+            extra={"graph_metrics": metrics, "dry_run": dry_run},
+        )
+    if dry_run:
+        log.info("Dry Run: Execution skipped.")
+        return None
+
+    # 2. Execution with Auto-Healing
+    active_client = _get_active_client(dask_client, logger=log)
+    try:
+        return op(obj, active_client, **kwargs)
+    except RECOVERABLE_COMMS as e:
+        log.warning(f"Dask comm failure ({type(e).__name__}). Healing and retrying.")
+
+        # Trigger explicit heal via singleton refresh
+        active_client = get_persistent_client(logger=log)
+
+        if active_client:
+            log.info("Client healed. Resubmitting task.")
+            return op(obj, active_client, **kwargs)
+
+        # Guard: Never fall back to local compute for DataFrames (Memory Safety)
+        if _is_dask_dataframe_like(obj):
+            raise RuntimeError(
+                "Distributed client lost and cannot be healed. "
+                "Local fallback forbidden for DataFrames."
+            ) from e
+
+        log.warning("Falling back to local threaded compute (safe for non-DataFrame).")
+        return obj.compute(scheduler="threads")
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def _compute_impl(obj: Any, client: Optional[Client]) -> Any:
+    if client:
+        res = client.compute(obj)
+        return res.result() if isinstance(res, Future) else res
+    return obj.compute()
+
+
+def safe_compute(
+    obj: Any, dask_client: Optional[Client] = None, logger=None, dry_run: bool = False
+) -> Any:
+    """Compute with auto-healing and optional dry-run complexity logging."""
+    return _execute_with_resilience(
+        _compute_impl, obj, dask_client, logger, dry_run=dry_run
+    )
+
+
+def safe_persist(obj: Any, dask_client: Optional[Client] = None, logger=None) -> Any:
+    """Persist a collection to distributed memory with auto-healing."""
+
+    def _persist_op(o, c):
+        return c.persist(o) if c else o.persist()
+
+    return _execute_with_resilience(_persist_op, obj, dask_client, logger)
+
+
+def safe_gather(
+    objs: List[Any], dask_client: Optional[Client] = None, logger=None
+) -> List[Any]:
+    """Gather multiple futures or collections into local memory."""
+    if not objs:
+        return []
+
+    def _gather_op(items, client):
+        if client:
+            return client.gather(client.compute(items))
+        return list(dask.compute(*items, scheduler="threads"))
+
+    return _execute_with_resilience(_gather_op, objs, dask_client, logger)
+
+
+def safe_wait(
+    obj: Any,
+    dask_client: Optional[Client] = None,
+    timeout: Optional[float] = None,
+    logger=None,
+) -> Any:
+    """Wait for completion. Safe from local-fallback for DataFrames."""
+    log = logger or _get_log()
+    client = _get_active_client(dask_client, logger=log)
+    try:
+        if client:
+            dask_wait(obj, timeout=timeout)
+        elif not _is_dask_dataframe_like(obj) and hasattr(obj, "compute"):
+            obj.compute(scheduler="threads")
+        return obj
+    except Exception as e:
+        log.warning(f"safe_wait: {type(e).__name__}: {e}")
+        return obj
+
+
+def safe_dry_run(obj: Any, logger=None) -> Dict[str, Any]:
+    """Utility to log and return graph metrics without execution."""
+    metrics = get_graph_metrics(obj)
+    (logger or _get_log()).info("Manual Dask Dry Run", extra={"graph_metrics": metrics})
+    return metrics
+
+
+# ---------------------------------------------------------------------------
+# Heuristic Emptiness Checks
+# ---------------------------------------------------------------------------
+
+
+def dask_is_probably_empty(ddf: dd.DataFrame) -> bool:
+    """Metadata check (zero partitions)."""
+    return getattr(ddf, "npartitions", 0) == 0
+
+
+def dask_is_empty_truthful(
+    ddf: dd.DataFrame, dask_client: Optional[Client] = None, logger=None
+) -> bool:
+    """Expensive but accurate full-table count check."""
+    total = safe_compute(
+        ddf.map_partitions(len, meta=("n", "int64")).sum(),
+        dask_client=dask_client,
+        logger=logger,
+    )
+    return int(_to_int_safe(total)) == 0
+
+
+def dask_is_empty(
+    ddf: dd.DataFrame,
+    *,
+    sample: int = 4,
+    dask_client: Optional[Client] = None,
+    logger=None,
+) -> bool:
+    """
+    Multi-stage check:
+    1. Metadata
+    2. Parallel sampling of first K partitions
+    3. Truthful sum (fallback)
+    """
+    if dask_is_probably_empty(ddf):
+        return True
+
+    k = min(max(sample, 1), ddf.npartitions)
+    try:
+        parts = [
+            ddf.get_partition(i).map_partitions(len, meta=("n", "int64"))
+            for i in range(k)
+        ]
+        probes = safe_gather(parts, dask_client, logger=logger)
+
+        if any(_to_int_safe(n) > 0 for n in probes):
+            return False
+
+        if k == ddf.npartitions:
+            return True
+    except Exception as e:
+        _get_log().warning(f"dask_is_empty probe failed: {e}")
+        return False
+
+    return dask_is_empty_truthful(ddf, dask_client=dask_client, logger=logger)
+
+
+# ---------------------------------------------------------------------------
+# Data Extraction
+# ---------------------------------------------------------------------------
+
+
+class UniqueValuesExtractor:
+    """Resilient unique value extraction from Dask columns."""
+
+    def __init__(self, dask_client: Optional[Client] = None, logger=None):
+        self.dask_client = dask_client
+        self.logger = logger
+
+    async def extract_unique_values(
+        self, df: dd.DataFrame, *columns: str, limit: int = 100_000
+    ) -> Dict[str, List[Any]]:
+        async def _extract(col):
+            # Optimization: drop duplicates on the distributed collection first
+            unique_dd = df[col].dropna().drop_duplicates()
+
+            # Fetch at most `limit` values so the local result stays bounded in memory.
+            # npartitions=-1 lets head() keep scanning partitions until `limit` rows
+            # have been collected, instead of stopping after the first partition.
+            # compute=True makes head() trigger the fetch immediately.
+
+            # We run this in a thread because head(compute=True) is blocking
+            res = await asyncio.to_thread(
+                lambda: unique_dd.head(limit, npartitions=-1, compute=True)
+            )
+
+            if len(res) >= limit:
+                if self.logger:
+                    self.logger.warning(
+                        f"Unique value extraction for column '{col}' truncated at {limit} items. "
+                        "High cardinality detected.",
+                        extra={"column": col, "limit": limit},
+                    )
+
+            return col, res.tolist()
+
+        results = await asyncio.gather(*(_extract(c) for c in columns))
+        return dict(results)
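The helpers above are meant to replace direct .compute()/.persist() calls so that graph inspection, dry runs, and client healing happen in one place. A minimal usage sketch, assuming a working sibi_flux installation whose get_persistent_client can reach or create a Dask client; the import path mirrors the file path, and the DataFrame, column names, and limit are illustrative only:

    import asyncio

    import dask.dataframe as dd
    import pandas as pd

    from sibi_flux.dask_cluster.core import (
        UniqueValuesExtractor,
        dask_is_empty,
        safe_compute,
        safe_persist,
    )

    # Illustrative data only.
    ddf = dd.from_pandas(
        pd.DataFrame({"city": ["SJO", "LIR", "SJO"], "qty": [1, 2, 3]}),
        npartitions=2,
    )

    # Log graph complexity without executing anything (returns None).
    safe_compute(ddf["qty"].sum(), dry_run=True)

    # Persist with auto-healing, then compute the reduction.
    ddf = safe_persist(ddf)
    total = safe_compute(ddf["qty"].sum())

    # Cheap multi-stage emptiness probe before paying for a full count.
    if not dask_is_empty(ddf, sample=2):
        print("total qty:", total)

    # Bounded unique-value extraction (async API).
    uniques = asyncio.run(
        UniqueValuesExtractor().extract_unique_values(ddf, "city", limit=1_000)
    )
    print(uniques["city"])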
sibi_flux/dask_cluster/exceptions.py
@@ -0,0 +1,34 @@
+"""
+Custom exceptions for the dask_cluster package.
+"""
+
+try:
+    # distributed >=2024 uses this location
+    from distributed.comm.core import CommClosedError  # type: ignore
+except ImportError:  # pragma: no cover
+
+    class CommClosedError(Exception):  # type: ignore[no-redef]
+        """Fallback CommClosedError for older distributed versions."""

+        pass
+
+
+try:
+    from tornado.iostream import StreamClosedError  # type: ignore
+except ImportError:  # pragma: no cover
+
+    class StreamClosedError(Exception):  # type: ignore[no-redef]
+        """Fallback StreamClosedError for missing tornado."""
+
+        pass
+
+
+# Common exception set considered recoverable by rebind-and-retry
+RECOVERABLE_COMMS = (
+    CommClosedError,
+    StreamClosedError,
+    TimeoutError,
+    ConnectionError,
+    OSError,
+    RuntimeError,
+)
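RECOVERABLE_COMMS casts a deliberately wide net (OSError and RuntimeError are included alongside the comm and stream errors), so it is intended for a bounded rebind-and-retry rather than an open-ended retry loop. A minimal sketch of that pattern, mirroring _execute_with_resilience in core.py; run_with_one_retry and submit_once are illustrative names, not part of the package:

    from sibi_flux.dask_cluster.client_manager import get_persistent_client
    from sibi_flux.dask_cluster.exceptions import RECOVERABLE_COMMS

    def run_with_one_retry(submit_once):
        """submit_once(client) wraps a single Dask submission or compute call."""
        client = get_persistent_client(logger=None)
        try:
            return submit_once(client)
        except RECOVERABLE_COMMS:
            # Re-resolve the persistent client (which may rebuild it) and retry once.
            client = get_persistent_client(logger=None)
            return submit_once(client)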
sibi_flux/dask_cluster/utils.py
@@ -0,0 +1,49 @@
+"""
+Utility functions for the dask_cluster package.
+"""
+
+import numpy as np  # type: ignore
+import pandas as pd
+from typing import Any
+
+
+def _to_int_safe(x: Any, default: int = 0) -> int:
+    """
+    Safely convert a value to integer with fallback defaults.
+    """
+    if x is None:
+        return default
+    if isinstance(x, (int, np.integer)) and not isinstance(x, bool):
+        return int(x)
+    if isinstance(x, (float, np.floating)):
+        try:
+            return int(x)
+        except Exception:
+            return default
+    if isinstance(x, np.generic):
+        try:
+            return int(x.item())
+        except Exception:
+            return default
+    if isinstance(x, (pd.Series, pd.Index, list, tuple, np.ndarray)):
+        try:
+            arr = np.asarray(x)
+            if arr.size == 0:
+                return default
+            return _to_int_safe(arr.ravel()[0], default=default)
+        except Exception:
+            return default
+    if hasattr(x, "item"):
+        try:
+            return _to_int_safe(x.item(), default=default)
+        except Exception:
+            return default
+    if hasattr(x, "iloc"):
+        try:
+            return _to_int_safe(x.iloc[0], default=default)
+        except Exception:
+            return default
+    try:
+        return int(x)
+    except Exception:
+        return default
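_to_int_safe exists so that whatever shape a Dask or pandas reduction returns (plain scalar, NumPy scalar, one-element Series) collapses to a plain int, with a default instead of an exception. A few illustrative calls, assuming the behaviour exactly as written above:

    import numpy as np
    import pandas as pd

    from sibi_flux.dask_cluster.utils import _to_int_safe

    _to_int_safe(None)                          # 0  (default)
    _to_int_safe(np.int64(7))                   # 7
    _to_int_safe(3.9)                           # 3  (int() truncates)
    _to_int_safe(pd.Series([5, 6]))             # 5  (first element of array-likes)
    _to_int_safe(pd.Series([], dtype="int64"))  # 0  (empty collapses to default)
    _to_int_safe("not a number", default=-1)    # -1 (final int() fails, default wins)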
sibi_flux/datacube/__init__.py
@@ -0,0 +1,3 @@
+from ._data_cube import Datacube, DatacubeConfig
+
+__all__ = ["Datacube", "DatacubeConfig"]