sibi-dst 2025.9.5.tar.gz → 2025.9.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/PKG-INFO +1 -1
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/pyproject.toml +1 -1
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/__init__.py +2 -0
- sibi_dst-2025.9.7/sibi_dst/utils/boilerplate/base_pipeline.py +178 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/clickhouse_writer.py +138 -13
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/dask_utils.py +1 -1
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/README.md +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/tests/test_baseclass.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/async_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/hybrid_data_loader.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/business_days.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/data_wrapper.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/file_age_checker.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/iceberg_saver.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/periods.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/progress/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/progress/jobs.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/progress/sse_runner.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/storage_hive.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/write_gatekeeper.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/utils/log_utils.py +0 -0
sibi_dst/utils/boilerplate/__init__.py (+2 -0)

```diff
@@ -3,6 +3,7 @@ from .base_data_cube import BaseDataCube
 from .base_attacher import make_attacher
 from .base_parquet_reader import BaseParquetReader
 from .hybrid_data_loader import HybridDataLoader
+from .base_pipeline import BasePipeline
 
 __all__ = [
     "BaseDataCube",
@@ -10,5 +11,6 @@ __all__ = [
     "make_attacher",
     "BaseParquetReader",
     "HybridDataLoader",
+    "BasePipeline",
 ]
 
```
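With this change, `BasePipeline` is re-exported from the boilerplate package next to the existing helpers. A minimal import sketch, limited to the names listed in `__all__` above:

```python
# The boilerplate package now exposes BasePipeline alongside the existing helpers.
from sibi_dst.utils.boilerplate import (
    BaseDataCube,
    BaseParquetReader,
    BasePipeline,
    HybridDataLoader,
    make_attacher,
)
```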
sibi_dst-2025.9.7/sibi_dst/utils/boilerplate/base_pipeline.py (new file, +178 lines)

```python
from __future__ import annotations

import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import Type, Any, Callable, List

import pandas as pd
import dask.dataframe as dd

from sibi_dst.utils import ManagedResource, ParquetSaver
from sibi_dst.df_helper import ParquetReader
from sibi_dst.utils.dask_utils import dask_is_empty


class DateRangeHelper:
    @staticmethod
    def generate_daily_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[str]:
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        return [d.strftime(date_format) for d in pd.date_range(start, end, freq="D")]

    @staticmethod
    def generate_monthly_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[tuple[str, str]]:
        """
        Generate (start_date, end_date) tuples for each calendar month in range.
        Always includes the first and last month, even if partial.
        """
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        ranges = []
        current = start.replace(day=1)
        while current <= end:
            month_end = (current + pd.offsets.MonthEnd(0)).normalize()
            ranges.append((
                current.strftime(date_format),
                min(month_end, end).strftime(date_format)
            ))
            current += pd.DateOffset(months=1)
        return ranges


class BasePipeline(ManagedResource):
    def __init__(
        self,
        start_date: str,
        end_date: str,
        dataset_cls: Type,
        parquet_storage_path: str,
        *,
        fs: Any,
        filename: str = "dataset",
        date_field: str = "date",
        max_workers: int = 4,
        dataset_kwargs: dict = None,
        **kwargs,
    ):
        kwargs["fs"] = fs
        super().__init__(**kwargs)

        self.start_date = start_date
        self.end_date = end_date
        self.fs = fs
        self.filename = filename
        self.date_field = date_field
        self.max_workers = max_workers
        self.storage_path = parquet_storage_path.rstrip("/")
        self.df: dd.DataFrame | None = None

        self.ds = dataset_cls(
            start_date=self.start_date,
            end_date=self.end_date,
            debug=self.debug,
            logger=self.logger,
            **(dataset_kwargs or {}),
        )

    def _get_storage_path_for_date(self, date: pd.Timestamp) -> str:
        return f"{self.storage_path}/{date.year}/{date.month:02d}/{date.day:02d}"

    def _get_output_filename(self, fmt: str = "parquet") -> str:
        return f"{self.filename}.{fmt}"

    async def aload(self, **kwargs) -> dd.DataFrame:
        await self.emit("status", message="Loading dataset...", progress=5)
        self.df = await self.ds.aload(**kwargs)
        return self.df

    async def to_parquet(self, **kwargs) -> None:
        df = await self.aload(**kwargs)
        if dask_is_empty(df):
            self.logger.warning("No data to save.")
            return

        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
        dates = DateRangeHelper.generate_daily_ranges(self.start_date, self.end_date)

        tasks = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for date_str in dates:
                date_obj = pd.to_datetime(date_str).date()
                df_day = df[df[self.date_field].dt.date == date_obj]
                if dask_is_empty(df_day):
                    self.logger.info(f"No data for {date_obj}, skipping.")
                    continue

                path = self._get_storage_path_for_date(pd.Timestamp(date_obj))
                await self.emit("status", message=f"Saving data for {date_obj}")

                saver = ParquetSaver(
                    df_result=df_day,
                    parquet_storage_path=path,
                    fs=self.fs,
                    debug=self.debug,
                    logger=self.logger,
                )

                tasks.append(
                    asyncio.get_running_loop().run_in_executor(
                        executor, saver.save_to_parquet, self._get_output_filename()
                    )
                )

            await asyncio.gather(*tasks)
        await self.emit("complete", message="All partitions written.")

    async def from_parquet(self, **kwargs) -> dd.DataFrame:
        reader = ParquetReader(
            parquet_start_date=self.start_date,
            parquet_end_date=self.end_date,
            parquet_storage_path=self.storage_path,
            parquet_filename=self._get_output_filename(),
            fs=self.fs,
            debug=self.debug,
            logger=self.logger,
        )
        return await reader.aload(**kwargs)

    async def to_clickhouse(self, clk_conf: dict, **kwargs):
        """
        Writes daily-partitioned data to ClickHouse using concurrent threads.
        """
        from sibi_dst.utils import ClickHouseWriter

        df = await self.from_parquet(**kwargs)
        if dask_is_empty(df):
            self.logger.warning("No data to write to ClickHouse.")
            return

        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
        df = df.persist()

        unique_dates = df[self.date_field].dt.date.dropna().unique().compute()
        if len(unique_dates) == 0:
            self.logger.warning("No valid dates found for partitioning.")
            return

        clk = ClickHouseWriter(**clk_conf)
        loop = asyncio.get_running_loop()
        tasks = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for date in unique_dates:
                df_day = df[df[self.date_field].dt.date == date]
                if dask_is_empty(df_day):
                    self.logger.info(f"[ClickHouse] No data for {date}, skipping.")
                    continue

                self.logger.info(f"[ClickHouse] Writing {len(df_day)} rows for {date}")

                tasks.append(
                    loop.run_in_executor(executor, clk.save_to_clickhouse, df_day)
                )

            await asyncio.gather(*tasks)

        self.logger.info(f"ClickHouse write complete for {len(unique_dates)} daily partitions.")


__all__ = ["BasePipeline"]
```
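For orientation, a hedged usage sketch of the new class follows. `OrdersDataset`, the paths, and the fsspec local filesystem are illustrative assumptions; only the constructor keywords and the `to_parquet` behaviour (one parquet file per day under `<path>/YYYY/MM/DD/`) come from the file above, and `ManagedResource` may expect additional keyword arguments not shown here.

```python
# Hypothetical driver for BasePipeline. OrdersDataset and the paths are invented;
# fs is assumed to be an fsspec-style filesystem because ParquetSaver/ParquetReader
# receive it unchanged.
import asyncio

import dask.dataframe as dd
import fsspec
import pandas as pd

from sibi_dst.utils.boilerplate import BasePipeline


class OrdersDataset:
    """Hypothetical dataset: any class accepting these kwargs whose aload()
    coroutine returns a dask DataFrame satisfies the dataset_cls contract."""

    def __init__(self, start_date, end_date, debug=False, logger=None, **kwargs):
        self.start_date, self.end_date = start_date, end_date

    async def aload(self, **kwargs) -> dd.DataFrame:
        pdf = pd.DataFrame({
            "date": pd.date_range("2025-09-01", "2025-09-07", freq="D"),
            "amount": range(7),
        })
        return dd.from_pandas(pdf, npartitions=2)


async def main() -> None:
    pipeline = BasePipeline(
        start_date="2025-09-01",
        end_date="2025-09-07",
        dataset_cls=OrdersDataset,
        parquet_storage_path="/tmp/orders",
        fs=fsspec.filesystem("file"),
        filename="orders",
        date_field="date",
        max_workers=4,
    )
    # Writes one parquet file per day under /tmp/orders/YYYY/MM/DD/orders.parquet
    await pipeline.to_parquet()


asyncio.run(main())
```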
sibi_dst/utils/clickhouse_writer.py (+138 -13)

```diff
@@ -7,6 +7,7 @@ from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
 import pandas as pd
 import dask.dataframe as dd
 import clickhouse_connect
+import numpy as np
 
 from . import ManagedResource
 
@@ -27,6 +28,7 @@ class ClickHouseWriter(ManagedResource):
     - Optional overwrite (drop + recreate)
     - Partitioned, batched inserts
     - Per-thread clients to avoid session conflicts
+    - Proper PyArrow dtype handling
     """
 
     # Default dtype mapping (pandas/dask → ClickHouse)
```
```diff
@@ -109,7 +111,11 @@ class ClickHouseWriter(ManagedResource):
             return
 
         # lazily fill missing values per-partition (no global compute)
-
+        # Use the new method that ensures correct types for ClickHouse
+        df = df.map_partitions(
+            type(self)._process_partition_for_clickhouse_compatible,
+            meta=df._meta
+        )
 
         # (re)create table
         ow = self.overwrite if overwrite is None else bool(overwrite)
```
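The cleanup stays lazy: `map_partitions` schedules the function against each pandas partition and `meta` declares the expected output schema without computing anything. A generic, self-contained illustration of the same pattern, with an invented `_fill` helper standing in for the writer's partition processor:

```python
# Generic map_partitions pattern; _fill is a stand-in for the real per-partition function.
import dask.dataframe as dd
import pandas as pd


def _fill(pdf: pd.DataFrame) -> pd.DataFrame:
    # Runs once per partition on a plain pandas DataFrame.
    return pdf.fillna(0)


ddf = dd.from_pandas(pd.DataFrame({"x": [1.0, None, 3.0]}), npartitions=2)
ddf = ddf.map_partitions(_fill, meta=ddf._meta)  # schema is unchanged, so reuse _meta
print(ddf.compute())
```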
```diff
@@ -121,7 +127,7 @@ class ClickHouseWriter(ManagedResource):
             self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
             self.logger.info(f"Dropped table {self.table} (overwrite=True)")
 
-        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}
+        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}"
         self._command(create_sql)
         self.logger.info(f"Ensured table {self.table} exists")
 
```
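The change above restores the closing quote on the f-string, so `create_sql` renders as a complete statement. A hypothetical rendering with invented identifier, schema, and engine fragments (in the writer these come from `self._ident(...)`, the schema builder, and `engine_sql`):

```python
# Illustrative only: the fragments below are made up for the example.
table_ident = "`analytics`.`orders`"
schema_sql = "`id` Int64, `order_date` Nullable(DateTime), `amount` Float64"
engine_sql = "ENGINE = MergeTree ORDER BY tuple()"

create_sql = f"CREATE TABLE IF NOT EXISTS {table_ident} ({schema_sql}) {engine_sql}"
print(create_sql)
```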
```diff
@@ -159,6 +165,26 @@ class ClickHouseWriter(ManagedResource):
         return ", ".join(pieces)
 
     def _map_dtype(self, dtype: Any) -> str:
+        dtype_str = str(dtype).lower()
+        # Handle PyArrow dtypes
+        if "[pyarrow]" in dtype_str:
+            if "int64" in dtype_str:
+                return "Int64"
+            elif "int32" in dtype_str:
+                return "Int32"
+            elif "float64" in dtype_str or "double" in dtype_str:
+                return "Float64"
+            elif "float32" in dtype_str:
+                return "Float32"
+            elif "bool" in dtype_str:
+                return "UInt8"
+            elif "timestamp" in dtype_str:  # PyArrow timestamp
+                return "DateTime"
+            elif "string" in dtype_str:  # PyArrow string
+                return "String"
+            else:
+                return "String"  # fallback
+
         # Handle pandas extension dtypes explicitly
         if isinstance(dtype, pd.Int64Dtype):
             return "Int64"
```
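The new branch keys off the string form of the dtype because pandas renders Arrow-backed dtypes with a `[pyarrow]` suffix. A quick check of that assumption (requires pandas ≥ 2.0 with pyarrow installed):

```python
# str(dtype) of an Arrow-backed pandas column carries a "[pyarrow]" suffix,
# which is exactly what the new _map_dtype branch matches on.
import pandas as pd
import pyarrow as pa

ints = pd.Series([1, 2, None], dtype=pd.ArrowDtype(pa.int64()))
texts = pd.Series(["a", None], dtype=pd.ArrowDtype(pa.string()))

print(str(ints.dtype).lower())   # int64[pyarrow]  -> mapped to ClickHouse Int64
print(str(texts.dtype).lower())  # string[pyarrow] -> mapped to ClickHouse String
```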
```diff
@@ -170,19 +196,29 @@ class ClickHouseWriter(ManagedResource):
             return "Float64"
         if isinstance(dtype, pd.StringDtype):
             return "String"
-        if "datetime64" in
+        if "datetime64" in dtype_str:
             return "DateTime"
 
         return self.DTYPE_MAP.get(str(dtype), "String")
 
     def _should_mark_nullable(self, dtype: Any) -> bool:
-
+        dtype_str = str(dtype).lower()
+        # PyArrow types are generally nullable, but let's be specific
+        if "[pyarrow]" in dtype_str:
+            # For PyArrow, make strings and timestamps nullable, numbers usually not unless data has nulls
+            base_type = dtype_str.replace("[pyarrow]", "")
+            if base_type in ["string", "large_string"] or "timestamp" in base_type:
+                return True
+            # For numeric PyArrow, check if the actual data contains nulls (hard to do here)
+            # Let's default to not nullable for numeric unless explicitly needed
+            return False  # Conservative for PyArrow numerics
+
         if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
             return True
-        if "datetime64" in
+        if "datetime64" in dtype_str:
             return True
         # object/category almost always nullable
-        if
+        if dtype_str in ("object", "category", "string"):
             return True
         return False
 
```
```diff
@@ -203,6 +239,10 @@ class ClickHouseWriter(ManagedResource):
         # Ensure column ordering is stable
         cols = list(pdf.columns)
 
+        # --- CRITICAL FIX: Ensure datetime columns are compatible BEFORE insertion ---
+        # This is the key step to prevent the numpy.datetime64 error
+        pdf = self._ensure_clickhouse_compatible_datetime_types(pdf)
+
         # Split into batches (to avoid giant single insert)
         for start in range(0, len(pdf), self.insert_chunksize):
             batch = pdf.iloc[start:start + self.insert_chunksize]
```
```diff
@@ -215,30 +255,116 @@ class ClickHouseWriter(ManagedResource):
     def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
         client = self._get_client()
         # clickhouse-connect supports insert_df
+        # The df passed here should now have compatible datetime types
         client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
 
-    # ------------- missing values (lazy) -------------
+    # ------------- missing values & type conversion (lazy) -------------
 
     @staticmethod
-    def
-
+    def _process_partition_for_clickhouse_compatible(pdf: pd.DataFrame) -> pd.DataFrame:
+        """
+        Process a partition to fill missing values and ensure initial data types are consistent.
+        This is the first step of data preparation.
+        """
+        pdf = pdf.copy()  # Avoid modifying original
+
         for col in pdf.columns:
             s = pdf[col]
-
+            dtype_str = str(s.dtype).lower()
+
+            # --- Handle PyArrow dtypes ---
+            if "[pyarrow]" in dtype_str:
+                try:
+                    if "string" in dtype_str:
+                        # Convert PyArrow string to object, fillna with empty string
+                        pdf[col] = s.astype('object').fillna("")
+                    elif "timestamp" in dtype_str:
+                        # Convert PyArrow timestamp to pandas datetime, NaT for nulls
+                        pdf[col] = pd.to_datetime(s, errors='coerce')  # errors='coerce' handles conversion issues
+                    elif "int" in dtype_str:
+                        # Convert PyArrow int to pandas int, fillna with 0 for non-nullable
+                        pdf[col] = s.fillna(0)
+                    elif "float" in dtype_str or "double" in dtype_str:
+                        pdf[col] = s.fillna(0.0)
+                    elif "bool" in dtype_str:
+                        pdf[col] = s.fillna(False)  # Or pd.NA if you prefer
+                    else:
+                        # Fallback: convert to object and then to string
+                        pdf[col] = s.astype('object').astype(str).fillna("")
+                except Exception as e:
+                    # If conversion fails, fall back to object and string
+                    pdf[col] = s.astype('object').astype(str).fillna("")
+
+            # --- Handle standard pandas dtypes ---
+            elif pd.api.types.is_integer_dtype(s.dtype):
                 if pd.api.types.is_extension_array_dtype(s.dtype):
                     pdf[col] = s.fillna(pd.NA)
                 else:
                     pdf[col] = s.fillna(0)
             elif pd.api.types.is_bool_dtype(s.dtype):
-                pdf[col] = s.fillna(pd.NA)
+                pdf[col] = s.fillna(pd.NA)  # Or False
             elif pd.api.types.is_float_dtype(s.dtype):
                 pdf[col] = s.fillna(0.0)
             elif pd.api.types.is_datetime64_any_dtype(s.dtype):
+                # Datetimes - leave as is for now, will be handled in final step
                 pass
             else:
-
+                # For object/string/category columns, ensure they're strings
+                pdf[col] = s.astype(str).fillna("")
+
         return pdf
 
+    def _ensure_clickhouse_compatible_datetime_types(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Final conversion step: Ensure datetime columns are in a format compatible
+        with clickhouse-connect driver. Specifically, convert numpy.datetime64 to
+        pandas.Timestamp or Python datetime objects.
+        This is called just before insertion.
+        """
+        df = df.copy()
+        for col in df.columns:
+            s = df[col]
+            # Check if the column is datetime-like
+            if pd.api.types.is_datetime64_any_dtype(s.dtype):
+                # --- Robust conversion to ensure compatibility ---
+                # 1. Convert to pandas datetime explicitly
+                df[col] = pd.to_datetime(s, utc=True)  # Ensures timezone handling
+
+                # 2. Replace NaT with None for nullable columns (clickhouse-connect handles this)
+                # 3. Ensure the underlying objects are pandas.Timestamp (which have .timestamp());
+                #    pd.to_datetime should handle this, and accessing the .dt accessor reinforces it.
+                #    If there are still issues, object conversion can be forced with
+                #    df[col] = df[col].dt.to_pydatetime(), but pd.Timestamp is better.
+                try:
+                    _ = df[col].dt  # Accessing .dt confirms it's datetime-like
+                except:
+                    # If .dt fails, it means conversion wasn't clean, force it
+                    self.logger.debug(f"Forcing datetime conversion for column {col}")
+                    df[col] = pd.to_datetime(df[col].astype('object'), utc=True)
+
+                # --- Final check and explicit conversion if needed ---
+                # If numpy.datetime64 values persist, convert the array elements explicitly
+                # by checking the first non-null element in a sample:
+                sample_series = df[col].dropna()
+                if len(sample_series) > 0:
+                    first_val = sample_series.iloc[0]
+                    if isinstance(first_val, np.datetime64):
+                        self.logger.warning(f"Column {col} still contains numpy.datetime64 after conversion. Forcing object conversion.")
+                        # Force conversion to object array of pandas.Timestamp or None
+                        def convert_val(v):
+                            if pd.isna(v):
+                                return None
+                            if isinstance(v, np.datetime64):
+                                # Convert numpy.datetime64 to pandas.Timestamp
+                                return pd.Timestamp(v)
+                            return v
+
+                        df[col] = df[col].apply(convert_val)
+
+        return df
+
+
     # ------------- low-level helpers -------------
 
     def _get_client(self):
```
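The helper exists because plain `numpy.datetime64` values do not carry the `pandas.Timestamp` interface the comments above refer to; the element-wise fallback promotes them. A standalone illustration of that promotion:

```python
# numpy.datetime64 values are promoted to pandas.Timestamp (NaT becomes None),
# mirroring the convert_val fallback above.
import numpy as np
import pandas as pd

raw = np.datetime64("2025-09-07T12:00:00")
print(type(pd.Timestamp(raw)).__name__)   # Timestamp
print(pd.isna(np.datetime64("NaT")))      # True -> written as None before insertion
```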
```diff
@@ -284,4 +410,3 @@ class ClickHouseWriter(ManagedResource):
         finally:
             if hasattr(self._tlocal, "client"):
                 delattr(self._tlocal, "client")
-
```
sibi_dst/utils/dask_utils.py (+1 -1)

```diff
@@ -31,7 +31,7 @@ def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
     k = min(max(sample, 1), ddf.npartitions)
     probes = dask.compute(*[
         ddf.get_partition(i).map_partitions(len) for i in range(k)
-    ])
+    ], scheduler="threads")
 
     if any(_to_int_safe(n) > 0 for n in probes):
         return False
```
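The only change to `dask_is_empty` is pinning the probe computation to the threaded scheduler. A self-contained sketch of the probing pattern with an invented sample frame:

```python
# Probe the first few partition lengths with the threaded scheduler, as
# dask_is_empty now does; any positive length means the frame is not empty.
import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=4)
k = min(4, ddf.npartitions)
probes = dask.compute(*[
    ddf.get_partition(i).map_partitions(len) for i in range(k)
], scheduler="threads")
print(probes)  # one small length result per probed partition
```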