sibi-dst 2025.8.5.tar.gz → 2025.8.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/PKG-INFO +1 -1
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/pyproject.toml +1 -1
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/_df_helper.py +40 -6
- sibi_dst-2025.8.7/sibi_dst/utils/async_utils.py +12 -0
- sibi_dst-2025.8.7/sibi_dst/utils/clickhouse_writer.py +264 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/storage_config.py +2 -2
- sibi_dst-2025.8.7/sibi_dst/utils/storage_hive.py +195 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/storage_manager.py +3 -2
- sibi_dst-2025.8.5/sibi_dst/utils/clickhouse_writer.py +0 -501
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/README.md +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/business_days.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/data_wrapper.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/file_age_checker.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/periods.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/utils/log_utils.py +0 -0
sibi_dst/df_helper/_df_helper.py

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import asyncio
 from typing import Any, Dict, Optional, TypeVar, Union
 
 import dask.dataframe as dd
@@ -104,7 +105,6 @@ class HttpBackend(BaseBackend):
         return self.total_records, result
 
 
-# ---- Main DfHelper ----
 class DfHelper(ManagedResource):
     _BACKEND_STRATEGIES = {
         "sqlalchemy": SqlAlchemyBackend,
@@ -198,6 +198,37 @@ class DfHelper(ManagedResource):
         df = df.persist() if persist else df
         return df.compute() if as_pandas else df
 
+    async def load_async(
+        self,
+        *,
+        persist: bool = False,
+        as_pandas: bool = False,
+        prefer_native: bool = False,
+        **options,
+    ):
+        """
+        Async load that prefers native async backends when available,
+        otherwise runs the sync `load()` in a worker thread via asyncio.to_thread.
+
+        Args:
+            persist: same as `load`
+            as_pandas: same as `load`
+            prefer_native: if True and the backend overrides `aload`, use it.
+                otherwise force thread offload of `load()`.
+            **options: forwarded to `load` / `aload`
+        """
+        # If the backend provided an override for `aload`, use it
+        if prefer_native and type(self.backend_strategy).aload is not BaseBackend.aload:
+            return await self.aload(persist=persist, as_pandas=as_pandas, **options)
+
+        # Fall back to offloading the sync path to a thread
+        return await asyncio.to_thread(
+            self.load,
+            persist=persist,
+            as_pandas=as_pandas,
+            **options,
+        )
+
     # ---------- dataframe post-processing ----------
     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
         self.logger.debug("Post-processing DataFrame.")
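The new `load_async` wrapper above can be driven directly from an event loop. A minimal sketch, assuming an already-configured `DfHelper` instance (its construction is not part of this diff):

```python
import asyncio

async def refresh(helper):
    # With prefer_native=False the sync load() is offloaded via asyncio.to_thread;
    # with prefer_native=True a backend that overrides aload() is awaited directly.
    df = await helper.load_async(persist=False, as_pandas=True)
    return df

# asyncio.run(refresh(helper))  # 'helper' is an existing DfHelper instance
```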
@@ -240,9 +271,12 @@ class DfHelper(ManagedResource):
         return df
 
     # ---------- sinks ----------
-    def save_to_parquet(self, df: dd.DataFrame,
-        fs: AbstractFileSystem = kwargs.
-        path: str = kwargs.
+    def save_to_parquet(self, df: dd.DataFrame, **kwargs):
+        fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
+        path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
+        parquet_filename = kwargs.pop("parquet_filename" or self._backend_params.parquet_filename if self.backend_parquet else None)
+        if not parquet_filename:
+            raise ValueError("A 'parquet_filename' keyword argument must be provided.")
         if not fs:
             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
         if not path:
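The reworked `save_to_parquet` now takes its sink settings as keyword arguments and falls back to the helper's own `fs` and parquet backend path. A hedged call sketch; the `helper` instance, filesystem, and paths below are placeholders, not values from this diff:

```python
import fsspec

fs = fsspec.filesystem("file")            # any fsspec-compatible filesystem
df = helper.load()                        # dask DataFrame from an existing DfHelper

helper.save_to_parquet(
    df,
    fs=fs,                                # optional; defaults to helper.fs
    parquet_storage_path="/tmp/exports",  # optional; defaults to the parquet backend path
    parquet_filename="orders.parquet",    # required; a ValueError is raised if missing
)
```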
@@ -268,11 +302,11 @@ class DfHelper(ManagedResource):
         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
             return
-        with ClickHouseWriter(debug=self.debug, logger=self.logger,
+        with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
             writer.save_to_clickhouse(df)
         self.logger.debug("Save to ClickHouse completed.")
 
-    # ----------
+    # ---------- period loaders ----------
     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
         return self.load(**final_kwargs)
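The period loaders keep the same call shape as before; a small sketch, with an assumed datetime field name and an existing `DfHelper` instance:

```python
# 'helper' is an existing DfHelper; 'created_at' is an illustrative field name.
df = helper.load_period(dt_field="created_at", start="2025-08-01", end="2025-08-07")
```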
sibi_dst/utils/async_utils.py (new file)

@@ -0,0 +1,12 @@
+import asyncio
+import dask.dataframe as dd
+
+
+def is_dask_dataframe(df):
+    """Check if the given object is a Dask DataFrame."""
+    return isinstance(df, dd.DataFrame)
+
+async def to_thread(func, *args, **kwargs):
+    """Explicit helper to keep code clear where we hop off the event loop."""
+    return await asyncio.to_thread(func, *args, **kwargs)
+
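The two helpers in the new `async_utils.py` are small conveniences; a usage sketch, where the blocking loader is a stand-in for any synchronous callable (not part of the package):

```python
import asyncio
import pandas as pd
import dask.dataframe as dd

from sibi_dst.utils.async_utils import is_dask_dataframe, to_thread

def blocking_load() -> dd.DataFrame:
    # Placeholder for any CPU/IO-bound synchronous loader.
    return dd.from_pandas(pd.DataFrame({"id": [1, 2, 3]}), npartitions=1)

async def main():
    df = await to_thread(blocking_load)   # hop off the event loop for sync work
    assert is_dask_dataframe(df)

asyncio.run(main())
```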
sibi_dst/utils/clickhouse_writer.py (new file)

@@ -0,0 +1,264 @@
+from __future__ import annotations
+
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
+
+import pandas as pd
+import dask.dataframe as dd
+import clickhouse_connect
+
+from . import ManagedResource
+
+
+class ClickHouseWriter(ManagedResource):
+    """
+    Write a Dask DataFrame to ClickHouse with:
+      - Safe Dask checks (no df.empty)
+      - Nullable dtype mapping
+      - Optional overwrite (drop + recreate)
+      - Partitioned, batched inserts
+      - Per-thread clients to avoid session conflicts
+    """
+
+    # Default dtype mapping (pandas/dask → ClickHouse)
+    DTYPE_MAP: ClassVar[Dict[str, str]] = {
+        "int64": "Int64",
+        "Int64": "Int64",  # pandas nullable Int64
+        "int32": "Int32",
+        "Int32": "Int32",
+        "float64": "Float64",
+        "Float64": "Float64",
+        "float32": "Float32",
+        "bool": "UInt8",
+        "boolean": "UInt8",
+        "object": "String",
+        "string": "String",
+        "category": "String",
+        "datetime64[ns]": "DateTime",
+        "datetime64[ns, UTC]": "DateTime",
+    }
+
+    def __init__(
+        self,
+        *,
+        host: str = "localhost",
+        port: int = 8123,
+        database: str = "sibi_data",
+        user: str = "default",
+        password: str = "",
+        table: str = "test_sibi_table",
+        order_by: str = "id",
+        engine: Optional[str] = None,  # e.g. "ENGINE MergeTree ORDER BY (`id`)"
+        max_workers: int = 4,
+        insert_chunksize: int = 50_000,
+        overwrite: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.host = host
+        self.port = int(port)
+        self.database = database
+        self.user = user
+        self.password = password
+        self.table = table
+        self.order_by = order_by
+        self.engine = engine  # if None → default MergeTree ORDER BY
+        self.max_workers = int(max_workers)
+        self.insert_chunksize = int(insert_chunksize)
+        self.overwrite = bool(overwrite)
+
+        # one client per thread to avoid session contention
+        self._tlocal = threading.local()
+
+    # ------------- public -------------
+
+    def save_to_clickhouse(self, df: dd.DataFrame, *, overwrite: Optional[bool] = None) -> None:
+        """
+        Persist a Dask DataFrame into ClickHouse.
+
+        Args:
+            df: Dask DataFrame
+            overwrite: Optional override for dropping/recreating table
+        """
+        if not isinstance(df, dd.DataFrame):
+            raise TypeError("ClickHouseWriter.save_to_clickhouse expects a dask.dataframe.DataFrame.")
+
+        # small, cheap check: head(1) to detect empty
+        head = df.head(1, npartitions=-1, compute=True)
+        if head.empty:
+            self.logger.info("Dask DataFrame appears empty (head(1) returned 0 rows). Nothing to write.")
+            return
+
+        # lazily fill missing values per-partition (no global compute)
+        df = df.map_partitions(type(self)._fill_missing_partition, meta=df._meta)
+
+        # (re)create table
+        ow = self.overwrite if overwrite is None else bool(overwrite)
+        dtypes = df._meta_nonempty.dtypes  # metadata-only types (no compute)
+        schema_sql = self._generate_clickhouse_schema(dtypes)
+        engine_sql = self._default_engine_sql() if not self.engine else self.engine
+
+        if ow:
+            self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
+            self.logger.info(f"Dropped table {self.table} (overwrite=True)")
+
+        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql};"
+        self._command(create_sql)
+        self.logger.info(f"Ensured table {self.table} exists")
+
+        # write partitions concurrently
+        parts = list(df.to_delayed())
+        if not parts:
+            self.logger.info("No partitions to write.")
+            return
+
+        self.logger.info(f"Writing {len(parts)} partitions to ClickHouse (max_workers={self.max_workers})")
+        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
+            futures = {ex.submit(self._write_one_partition, part, idx): idx for idx, part in enumerate(parts)}
+            for fut in as_completed(futures):
+                idx = futures[fut]
+                try:
+                    fut.result()
+                except Exception as e:
+                    self.logger.error(f"Partition {idx} failed: {e}", exc_info=self.debug)
+                    raise
+
+        self.logger.info(f"Completed writing {len(parts)} partitions to {self.table}")
+
+    # ------------- schema & types -------------
+
+    def _generate_clickhouse_schema(self, dask_dtypes: pd.Series) -> str:
+        cols: Iterable[Tuple[str, Any]] = dask_dtypes.items()
+        pieces = []
+        for col, dtype in cols:
+            ch_type = self._map_dtype(dtype)
+            # Use Nullable for non-numeric/string columns that may carry NaN/None,
+            # and for datetimes to be safe with missing values.
+            if self._should_mark_nullable(dtype):
+                ch_type = f"Nullable({ch_type})"
+            pieces.append(f"{self._ident(col)} {ch_type}")
+        return ", ".join(pieces)
+
+    def _map_dtype(self, dtype: Any) -> str:
+        # Handle pandas extension dtypes explicitly
+        if isinstance(dtype, pd.Int64Dtype):
+            return "Int64"
+        if isinstance(dtype, pd.Int32Dtype):
+            return "Int32"
+        if isinstance(dtype, pd.BooleanDtype):
+            return "UInt8"
+        if isinstance(dtype, pd.Float64Dtype):
+            return "Float64"
+        if isinstance(dtype, pd.StringDtype):
+            return "String"
+        if "datetime64" in str(dtype):
+            return "DateTime"
+
+        return self.DTYPE_MAP.get(str(dtype), "String")
+
+    def _should_mark_nullable(self, dtype: Any) -> bool:
+        s = str(dtype)
+        if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
+            return True
+        if "datetime64" in s:
+            return True
+        # object/category almost always nullable
+        if s in ("object", "category", "string"):
+            return True
+        return False
+
+    def _default_engine_sql(self) -> str:
+        # minimal MergeTree clause; quote order_by safely
+        ob = self.order_by if self.order_by.startswith("(") else f"(`{self.order_by}`)"
+        return f"ENGINE = MergeTree ORDER BY {ob}"
+
+    # ------------- partition write -------------
+
+    def _write_one_partition(self, part, index: int) -> None:
+        # Compute partition → pandas
+        pdf: pd.DataFrame = part.compute()
+        if pdf.empty:
+            self.logger.debug(f"Partition {index} empty; skipping")
+            return
+
+        # Ensure column ordering is stable
+        cols = list(pdf.columns)
+
+        # Split into batches (to avoid giant single insert)
+        for start in range(0, len(pdf), self.insert_chunksize):
+            batch = pdf.iloc[start:start + self.insert_chunksize]
+            if batch.empty:
+                continue
+            self._insert_df(cols, batch)
+
+        self.logger.debug(f"Partition {index} inserted ({len(pdf)} rows)")
+
+    def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
+        client = self._get_client()
+        # clickhouse-connect supports insert_df
+        client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
+
+    # ------------- missing values (lazy) -------------
+
+    @staticmethod
+    def _fill_missing_partition(pdf: pd.DataFrame) -> pd.DataFrame:
+        # (unchanged body)
+        for col in pdf.columns:
+            s = pdf[col]
+            if pd.api.types.is_integer_dtype(s.dtype):
+                if pd.api.types.is_extension_array_dtype(s.dtype):
+                    pdf[col] = s.fillna(pd.NA)
+                else:
+                    pdf[col] = s.fillna(0)
+            elif pd.api.types.is_bool_dtype(s.dtype):
+                pdf[col] = s.fillna(pd.NA)
+            elif pd.api.types.is_float_dtype(s.dtype):
+                pdf[col] = s.fillna(0.0)
+            elif pd.api.types.is_datetime64_any_dtype(s.dtype):
+                pass
+            else:
+                pdf[col] = s.fillna("")
+        return pdf
+
+    # ------------- low-level helpers -------------
+
+    def _get_client(self):
+        cli = getattr(self._tlocal, "client", None)
+        if cli is not None:
+            return cli
+        cli = clickhouse_connect.get_client(
+            host=self.host,
+            port=self.port,
+            database=self.database,
+            username=self.user,  # clickhouse-connect uses 'username'
+            password=self.password,
+        )
+        self._tlocal.client = cli
+        return cli
+
+    def _command(self, sql: str) -> None:
+        client = self._get_client()
+        client.command(sql)
+
+    @staticmethod
+    def _ident(name: str) -> str:
+        # minimal identifier quoting
+        if name.startswith("`") and name.endswith("`"):
+            return name
+        return f"`{name}`"
+
+    # ------------- context cleanup -------------
+
+    def _cleanup(self):
+        # close client in this thread (the manager calls _cleanup in the owning thread)
+        cli = getattr(self._tlocal, "client", None)
+        try:
+            if cli is not None:
+                cli.close()
+        except Exception:
+            pass
+        finally:
+            if hasattr(self._tlocal, "client"):
+                delattr(self._tlocal, "client")
+
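Since `ClickHouseWriter` is a `ManagedResource`, `DfHelper` drives it as a context manager (see the `_df_helper.py` hunk above). A standalone sketch; the connection and table values below are placeholders, not values taken from this diff:

```python
import pandas as pd
import dask.dataframe as dd

from sibi_dst.utils.clickhouse_writer import ClickHouseWriter

ddf = dd.from_pandas(
    pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]}),
    npartitions=2,
)

with ClickHouseWriter(
    host="localhost",
    port=8123,
    database="sibi_data",
    user="default",
    password="",
    table="demo_events",
    order_by="id",
    overwrite=True,           # drop + recreate the table before writing
    max_workers=4,            # partitions written concurrently, one client per thread
    insert_chunksize=50_000,  # rows per insert batch
) as writer:
    writer.save_to_clickhouse(ddf)
```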
sibi_dst/utils/storage_config.py

@@ -6,13 +6,13 @@ from .storage_manager import StorageManager
 from .credentials import ConfigManager
 
 class StorageConfig:
-    def __init__(self, config:ConfigManager, depots:dict=None):
+    def __init__(self, config:ConfigManager, depots:dict=None, clear_existing=False, write_mode="full-access"):
         self.conf = config
         self.depots = depots
         self._initialize_storage()
         self.storage_manager = StorageManager(self.base_storage, self.filesystem_type, self.filesystem_options)
         if self.depots is not None:
-            self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots)
+            self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots, clear_existing=clear_existing, write_mode=write_mode)
         else:
             self.depot_paths = None
             self.depot_names = None
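`StorageConfig` now forwards the two new options straight to `StorageManager.rebuild_depot_paths` (see the `storage_manager.py` hunks below). A hedged sketch; the `ConfigManager` setup and the depot layout are assumptions made for illustration:

```python
from sibi_dst.utils.credentials import ConfigManager
from sibi_dst.utils.storage_config import StorageConfig

config = ConfigManager()                        # assumed to carry the filesystem settings
depots = {"sales_depot": ["bronze", "silver"]}  # hypothetical depot -> sub-directories mapping

storage = StorageConfig(
    config,
    depots=depots,
    clear_existing=False,      # forwarded to rebuild_depot_paths
    write_mode="full-access",  # any other value skips directory setup
)
```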
sibi_dst/utils/storage_hive.py (new file)

@@ -0,0 +1,195 @@
+from __future__ import annotations
+import pandas as pd
+import dask.dataframe as dd
+from typing import Iterable, Optional, List, Tuple, Union
+import fsspec
+
+DNFFilter = List[List[Tuple[str, str, Union[str, int]]]]
+
+
+class HiveDatePartitionedStore:
+    """
+    Dask-only Parquet store with Hive-style yyyy=…/mm=…/dd=… partitions.
+
+    - `write(...)` safely "overwrites" S3 prefixes via per-object deletes (no bulk DeleteObjects).
+    - `read_range(...)` builds DNF filters and auto-matches partition types (string vs int).
+    """
+
+    def __init__(
+        self,
+        path: str,
+        *,
+        filesystem=None,  # fsspec filesystem or None to infer from path
+        date_col: str = "tracking_dt",
+        compression: str = "zstd",
+        partition_values_as_strings: bool = True,  # keep mm=07, dd=01 folder names
+        logger=None,
+    ) -> None:
+        self.path = path
+        self.fs = filesystem or fsspec.open(path).fs
+        self.date_col = date_col
+        self.compression = compression
+        self.partition_values_as_strings = partition_values_as_strings
+        self.log = logger
+
+    # ----------------- public API -----------------
+
+    def write(
+        self,
+        df: dd.DataFrame,
+        *,
+        repartition: Optional[int] = None,
+        overwrite: bool = False,
+    ) -> None:
+        """Write Dask DataFrame to Hive-style yyyy/mm/dd partitions."""
+        self._require_col(df, self.date_col)
+        ser = dd.to_datetime(df[self.date_col], errors="coerce")
+
+        if self.partition_values_as_strings:
+            parts = {
+                "yyyy": ser.dt.strftime("%Y"),
+                "mm": ser.dt.strftime("%m"),
+                "dd": ser.dt.strftime("%d"),
+            }
+        else:
+            parts = {
+                "yyyy": ser.dt.year.astype("int32"),
+                "mm": ser.dt.month.astype("int8"),
+                "dd": ser.dt.day.astype("int8"),
+            }
+
+        df = df.assign(**{self.date_col: ser}, **parts)
+
+        if repartition:
+            df = df.repartition(npartitions=repartition)
+
+        if overwrite:
+            self._safe_rm_prefix(self.path)
+
+        if self.log:
+            self.log.info(f"Writing parquet to {self.path} (hive yyyy/mm/dd)…")
+
+        df.to_parquet(
+            self.path,
+            engine="pyarrow",
+            write_index=False,
+            filesystem=self.fs,
+            partition_on=["yyyy", "mm", "dd"],
+            compression=self.compression,
+            overwrite=False,  # we pre-cleaned if overwrite=True
+        )
+
+    def read_range(
+        self,
+        start: Union[str, pd.Timestamp],
+        end: Union[str, pd.Timestamp],
+        *,
+        columns: Optional[Iterable[str]] = None,
+    ) -> dd.DataFrame:
+        """
+        Read a date window with partition pruning. Tries string filters first,
+        falls back to integer filters if Arrow infers partition types as ints.
+        """
+        str_filters = self._dnf_filters_for_range_str(start, end)
+        try:
+            return dd.read_parquet(
+                self.path,
+                engine="pyarrow",
+                filesystem=self.fs,
+                columns=list(columns) if columns else None,
+                filters=str_filters,
+            )
+        except Exception:
+            int_filters = self._dnf_filters_for_range_int(start, end)
+            return dd.read_parquet(
+                self.path,
+                engine="pyarrow",
+                filesystem=self.fs,
+                columns=list(columns) if columns else None,
+                filters=int_filters,
+            )
+
+    # Convenience: full month / single day
+    def read_month(self, year: int, month: int, *, columns=None) -> dd.DataFrame:
+        start = pd.Timestamp(year=year, month=month, day=1)
+        end = (start + pd.offsets.MonthEnd(0))
+        return self.read_range(start, end, columns=columns)
+
+    def read_day(self, year: int, month: int, day: int, *, columns=None) -> dd.DataFrame:
+        ts = pd.Timestamp(year=year, month=month, day=day)
+        return self.read_range(ts, ts, columns=columns)
+
+    # ----------------- internals -----------------
+
+    @staticmethod
+    def _pad2(n: int) -> str:
+        return f"{n:02d}"
+
+    def _safe_rm_prefix(self, path: str) -> None:
+        """Per-object delete to avoid S3 bulk DeleteObjects (and Content-MD5 issues)."""
+        if not self.fs.exists(path):
+            return
+        if self.log:
+            self.log.info(f"Cleaning prefix (safe delete): {path}")
+        for k in self.fs.find(path):
+            try:
+                (self.fs.rm_file(k) if hasattr(self.fs, "rm_file") else self.fs.rm(k, recursive=False))
+            except Exception as e:
+                if self.log:
+                    self.log.warning(f"Could not delete {k}: {e}")
+
+    @staticmethod
+    def _require_col(df: dd.DataFrame, col: str) -> None:
+        if col not in df.columns:
+            raise KeyError(f"'{col}' not in DataFrame")
+
+    # ---- DNF builders (string vs int) ----
+    def _dnf_filters_for_range_str(self, start, end) -> DNFFilter:
+        s, e = pd.Timestamp(start), pd.Timestamp(end)
+        if s > e:
+            raise ValueError("start > end")
+        sY, sM, sD = s.year, s.month, s.day
+        eY, eM, eD = e.year, e.month, e.day
+        p2 = self._pad2
+        if sY == eY and sM == eM:
+            return [[("yyyy","==",str(sY)),("mm","==",p2(sM)),("dd",">=",p2(sD)),("dd","<=",p2(eD))]]
+        clauses: DNFFilter = [
+            [("yyyy","==",str(sY)),("mm","==",p2(sM)),("dd",">=",p2(sD))],
+            [("yyyy","==",str(eY)),("mm","==",p2(eM)),("dd","<=",p2(eD))]
+        ]
+        if sY == eY:
+            for m in range(sM+1, eM):
+                clauses.append([("yyyy","==",str(sY)),("mm","==",p2(m))])
+            return clauses
+        for m in range(sM+1, 13):
+            clauses.append([("yyyy","==",str(sY)),("mm","==",p2(m))])
+        for y in range(sY+1, eY):
+            clauses.append([("yyyy","==",str(y))])
+        for m in range(1, eM):
+            clauses.append([("yyyy","==",str(eY)),("mm","==",p2(m))])
+        return clauses
+
+    @staticmethod
+    def _dnf_filters_for_range_int(start, end) -> DNFFilter:
+        s, e = pd.Timestamp(start), pd.Timestamp(end)
+        if s > e:
+            raise ValueError("start > end")
+        sY, sM, sD = s.year, s.month, s.day
+        eY, eM, eD = e.year, e.month, e.day
+        if sY == eY and sM == eM:
+            return [[("yyyy","==",sY),("mm","==",sM),("dd",">=",sD),("dd","<=",eD)]]
+        clauses: DNFFilter = [
+            [("yyyy","==",sY),("mm","==",sM),("dd",">=",sD)],
+            [("yyyy","==",eY),("mm","==",eM),("dd","<=",eD)],
+        ]
+        if sY == eY:
+            for m in range(sM+1, eM):
+                clauses.append([("yyyy","==",sY),("mm","==",m)])
+            return clauses
+        for m in range(sM+1, 13):
+            clauses.append([("yyyy","==",sY),("mm","==",m)])
+        for y in range(sY+1, eY):
+            clauses.append([("yyyy","==",y)])
+        for m in range(1, eM):
+            clauses.append([("yyyy","==",eY),("mm","==",m)])
+        return clauses
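An end-to-end sketch of the new store: write a small frame into yyyy/mm/dd partitions, then read a window back with partition pruning. The local path and column values are placeholders chosen for the example:

```python
import pandas as pd
import dask.dataframe as dd

from sibi_dst.utils.storage_hive import HiveDatePartitionedStore

pdf = pd.DataFrame({
    "tracking_dt": pd.to_datetime(["2025-07-30", "2025-08-01", "2025-08-02"]),
    "qty": [1, 2, 3],
})
ddf = dd.from_pandas(pdf, npartitions=1)

store = HiveDatePartitionedStore("/tmp/hive_demo", date_col="tracking_dt")
store.write(ddf, overwrite=True)

# A window spanning a month boundary expands into DNF clauses roughly like
# [yyyy==2025, mm==07, dd>=30] OR [yyyy==2025, mm==08, dd<=02].
window = store.read_range("2025-07-30", "2025-08-02", columns=["tracking_dt", "qty"])
print(window.compute())
```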
sibi_dst/utils/storage_manager.py

@@ -83,7 +83,7 @@ class StorageManager:
             self.fs.rm(sub_path, recursive=True)
         self.fs.mkdirs(sub_path, exist_ok=True)
 
-    def rebuild_depot_paths(self, depots, clear_existing=False):
+    def rebuild_depot_paths(self, depots, clear_existing=False, write_mode="full-access"):
        """
        Rebuilds depot_paths (dictionary) and depot_name (SimpleNamespace).
        Handles clear_existing scenario by resetting directories when required.
@@ -96,7 +96,8 @@ class StorageManager:
             depot_path = self.join_paths(self.storage_path, depot)
             if self.debug:
                 print(f"Rebuilding depot at: {depot_path}")
-
+            if write_mode == "full-access":
+                self.setup_directories(depot_path, sub_directories, clear_existing=clear_existing)
 
         # Generate depot_paths dictionary
         self.depot_paths = {
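With the new `write_mode` gate, only `"full-access"` touches the filesystem; any other value leaves existing directories alone and only rebuilds the path mapping. A hedged sketch; the constructor arguments, the depot layout, and the `"read-only"` value are assumptions for illustration:

```python
from sibi_dst.utils.storage_manager import StorageManager

# Assumed argument order: (base_storage, filesystem_type, filesystem_options).
manager = StorageManager("/tmp/depots", "file", {})

# Creates/clears depot sub-directories because write_mode is "full-access".
paths, names = manager.rebuild_depot_paths(
    {"sales_depot": ["bronze"]}, clear_existing=True, write_mode="full-access"
)

# Any other value (e.g. a hypothetical "read-only") skips setup_directories.
paths, names = manager.rebuild_depot_paths({"sales_depot": ["bronze"]}, write_mode="read-only")
```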