sibi-dst 2025.9.5-py3-none-any.whl → 2025.9.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/sibi_dst/utils/clickhouse_writer.py
+++ b/sibi_dst/utils/clickhouse_writer.py
@@ -7,6 +7,7 @@ from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
 import pandas as pd
 import dask.dataframe as dd
 import clickhouse_connect
+import numpy as np
 
 from . import ManagedResource
 
@@ -27,6 +28,7 @@ class ClickHouseWriter(ManagedResource):
     - Optional overwrite (drop + recreate)
     - Partitioned, batched inserts
     - Per-thread clients to avoid session conflicts
+    - Proper PyArrow dtype handling
     """
 
     # Default dtype mapping (pandas/dask → ClickHouse)
@@ -109,7 +111,11 @@ class ClickHouseWriter(ManagedResource):
             return
 
         # lazily fill missing values per-partition (no global compute)
-        df = df.map_partitions(type(self)._fill_missing_partition, meta=df._meta)
+        # Use the new method that ensures correct types for ClickHouse
+        df = df.map_partitions(
+            type(self)._process_partition_for_clickhouse_compatible,
+            meta=df._meta
+        )
 
         # (re)create table
         ow = self.overwrite if overwrite is None else bool(overwrite)
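
Note: map_partitions applies a plain-pandas function to every partition lazily, and the meta argument declares the output schema up front, so swapping in the new per-partition processor still avoids any global compute. A minimal standalone sketch of the pattern (illustrative names, not code from the package):

    import dask.dataframe as dd
    import pandas as pd

    def _clean_partition(pdf: pd.DataFrame) -> pd.DataFrame:
        # Runs once per partition, on a plain pandas DataFrame.
        return pdf.fillna(0)

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, None, 3]}), npartitions=2)
    # meta tells Dask the output schema; the call itself stays lazy.
    ddf = ddf.map_partitions(_clean_partition, meta=ddf._meta)
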
@@ -121,7 +127,7 @@ class ClickHouseWriter(ManagedResource):
             self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
             self.logger.info(f"Dropped table {self.table} (overwrite=True)")
 
-        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql};"
+        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}"
         self._command(create_sql)
         self.logger.info(f"Ensured table {self.table} exists")
 
@@ -159,6 +165,26 @@ class ClickHouseWriter(ManagedResource):
         return ", ".join(pieces)
 
     def _map_dtype(self, dtype: Any) -> str:
+        dtype_str = str(dtype).lower()
+        # Handle PyArrow dtypes
+        if "[pyarrow]" in dtype_str:
+            if "int64" in dtype_str:
+                return "Int64"
+            elif "int32" in dtype_str:
+                return "Int32"
+            elif "float64" in dtype_str or "double" in dtype_str:
+                return "Float64"
+            elif "float32" in dtype_str:
+                return "Float32"
+            elif "bool" in dtype_str:
+                return "UInt8"
+            elif "timestamp" in dtype_str:  # PyArrow timestamp
+                return "DateTime"
+            elif "string" in dtype_str:  # PyArrow string
+                return "String"
+            else:
+                return "String"  # fallback
+
         # Handle pandas extension dtypes explicitly
         if isinstance(dtype, pd.Int64Dtype):
             return "Int64"
@@ -170,19 +196,29 @@ class ClickHouseWriter(ManagedResource):
             return "Float64"
         if isinstance(dtype, pd.StringDtype):
             return "String"
-        if "datetime64" in str(dtype):
+        if "datetime64" in dtype_str:
             return "DateTime"
 
         return self.DTYPE_MAP.get(str(dtype), "String")
 
     def _should_mark_nullable(self, dtype: Any) -> bool:
-        s = str(dtype)
+        dtype_str = str(dtype).lower()
+        # PyArrow types are generally nullable, but let's be specific
+        if "[pyarrow]" in dtype_str:
+            # For PyArrow, make strings and timestamps nullable, numbers usually not unless data has nulls
+            base_type = dtype_str.replace("[pyarrow]", "")
+            if base_type in ["string", "large_string"] or "timestamp" in base_type:
+                return True
+            # For numeric PyArrow, check if the actual data contains nulls (hard to do here)
+            # Let's default to not nullable for numeric unless explicitly needed
+            return False  # Conservative for PyArrow numerics
+
         if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
             return True
-        if "datetime64" in s:
+        if "datetime64" in dtype_str:
             return True
         # object/category almost always nullable
-        if s in ("object", "category", "string"):
+        if dtype_str in ("object", "category", "string"):
             return True
         return False
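
Note: in ClickHouse DDL a nullable column wraps its base type, so the two helpers above presumably combine into fragments like the following when the schema string is assembled (hypothetical sketch; the actual assembly code sits outside this hunk):

    # Hypothetical rendering of a single column piece.
    name, ch_type, nullable = "created_at", "DateTime", True
    piece = f"`{name}` Nullable({ch_type})" if nullable else f"`{name}` {ch_type}"
    print(piece)  # `created_at` Nullable(DateTime)
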
@@ -203,6 +239,10 @@ class ClickHouseWriter(ManagedResource):
         # Ensure column ordering is stable
         cols = list(pdf.columns)
 
+        # --- CRITICAL FIX: Ensure datetime columns are compatible BEFORE insertion ---
+        # This is the key step to prevent the numpy.datetime64 error
+        pdf = self._ensure_clickhouse_compatible_datetime_types(pdf)
+
         # Split into batches (to avoid giant single insert)
         for start in range(0, len(pdf), self.insert_chunksize):
             batch = pdf.iloc[start:start + self.insert_chunksize]
@@ -215,30 +255,116 @@ class ClickHouseWriter(ManagedResource):
     def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
         client = self._get_client()
         # clickhouse-connect supports insert_df
+        # The df passed here should now have compatible datetime types
         client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
 
-    # ------------- missing values (lazy) -------------
+    # ------------- missing values & type conversion (lazy) -------------
 
     @staticmethod
-    def _fill_missing_partition(pdf: pd.DataFrame) -> pd.DataFrame:
-        # (unchanged body)
+    def _process_partition_for_clickhouse_compatible(pdf: pd.DataFrame) -> pd.DataFrame:
+        """
+        Process a partition to fill missing values and ensure initial data types are consistent.
+        This is the first step of data preparation.
+        """
+        pdf = pdf.copy()  # Avoid modifying original
+
         for col in pdf.columns:
             s = pdf[col]
-            if pd.api.types.is_integer_dtype(s.dtype):
+            dtype_str = str(s.dtype).lower()
+
+            # --- Handle PyArrow dtypes ---
+            if "[pyarrow]" in dtype_str:
+                try:
+                    if "string" in dtype_str:
+                        # Convert PyArrow string to object, fillna with empty string
+                        pdf[col] = s.astype('object').fillna("")
+                    elif "timestamp" in dtype_str:
+                        # Convert PyArrow timestamp to pandas datetime, NaT for nulls
+                        pdf[col] = pd.to_datetime(s, errors='coerce')  # errors='coerce' handles conversion issues
+                    elif "int" in dtype_str:
+                        # Convert PyArrow int to pandas int, fillna with 0 for non-nullable
+                        pdf[col] = s.fillna(0)
+                    elif "float" in dtype_str or "double" in dtype_str:
+                        pdf[col] = s.fillna(0.0)
+                    elif "bool" in dtype_str:
+                        pdf[col] = s.fillna(False)  # Or pd.NA if you prefer
+                    else:
+                        # Fallback: convert to object and then to string
+                        pdf[col] = s.astype('object').astype(str).fillna("")
+                except Exception as e:
+                    # If conversion fails, fall back to object and string
+                    pdf[col] = s.astype('object').astype(str).fillna("")
+
+            # --- Handle standard pandas dtypes ---
+            elif pd.api.types.is_integer_dtype(s.dtype):
                 if pd.api.types.is_extension_array_dtype(s.dtype):
                     pdf[col] = s.fillna(pd.NA)
                 else:
                     pdf[col] = s.fillna(0)
             elif pd.api.types.is_bool_dtype(s.dtype):
-                pdf[col] = s.fillna(pd.NA)
+                pdf[col] = s.fillna(pd.NA)  # Or False
             elif pd.api.types.is_float_dtype(s.dtype):
                 pdf[col] = s.fillna(0.0)
             elif pd.api.types.is_datetime64_any_dtype(s.dtype):
+                # Datetimes - leave as is for now, will be handled in final step
                 pass
             else:
-                pdf[col] = s.fillna("")
+                # For object/string/category columns, ensure they're strings
+                pdf[col] = s.astype(str).fillna("")
+
         return pdf
 
+    def _ensure_clickhouse_compatible_datetime_types(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Final conversion step: Ensure datetime columns are in a format compatible
+        with the clickhouse-connect driver. Specifically, convert numpy.datetime64 to
+        pandas.Timestamp or Python datetime objects.
+        This is called just before insertion.
+        """
+        df = df.copy()
+        for col in df.columns:
+            s = df[col]
+            # Check if the column is datetime-like
+            if pd.api.types.is_datetime64_any_dtype(s.dtype):
+                # --- Robust conversion to ensure compatibility ---
+                # 1. Convert to pandas datetime explicitly
+                df[col] = pd.to_datetime(s, utc=True)  # Ensures timezone handling
+
+                # 2. Replace NaT with None for nullable columns (clickhouse-connect handles this)
+                # This is often sufficient, but let's be extra sure about the object type
+                # 3. Ensure the underlying objects are pandas.Timestamp (which have .timestamp())
+                # The pd.to_datetime should handle this, but accessing the .dt accessor reinforces it.
+                # If there are still issues, we can force object conversion:
+                # df[col] = df[col].dt.to_pydatetime()  # Converts to numpy array of datetime64 or None
+                # But pd.Timestamp is better. Let's try accessing .dt to ensure it's proper:
+                try:
+                    _ = df[col].dt  # Accessing .dt confirms it's datetime-like
+                except:
+                    # If .dt fails, it means conversion wasn't clean, force it
+                    self.logger.debug(f"Forcing datetime conversion for column {col}")
+                    df[col] = pd.to_datetime(df[col].astype('object'), utc=True)
+
+                # --- Final check and explicit conversion if needed ---
+                # If the error persists, we might need to explicitly convert the array elements.
+                # Let's add a check for the first non-null element in a sample:
+                sample_series = df[col].dropna()
+                if len(sample_series) > 0:
+                    first_val = sample_series.iloc[0]
+                    if isinstance(first_val, np.datetime64):
+                        self.logger.warning(f"Column {col} still contains numpy.datetime64 after conversion. Forcing object conversion.")
+                        # Force conversion to object array of pandas.Timestamp or None
+                        def convert_val(v):
+                            if pd.isna(v):
+                                return None
+                            if isinstance(v, np.datetime64):
+                                # Convert numpy.datetime64 to pandas.Timestamp
+                                return pd.Timestamp(v)
+                            return v
+                        df[col] = df[col].apply(convert_val)
+
+        return df
+
+
     # ------------- low-level helpers -------------
 
     def _get_client(self):
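
Note: the numpy.datetime64 guard above is easy to reproduce outside the package. Values pulled from a datetime64[ns] Series normally come back as pd.Timestamp, which, unlike a raw numpy.datetime64, carries the methods a driver may call (such as .timestamp()); converting first makes the value safe. A small demonstration (illustrative, not from the diff):

    import numpy as np
    import pandas as pd

    s = pd.Series(pd.to_datetime(["2025-01-01", None]))
    print(type(s.dropna().iloc[0]))         # pandas Timestamp, not numpy.datetime64

    v = np.datetime64("2025-01-01")
    print(hasattr(v, "timestamp"))          # False: the raw numpy value lacks the method
    print(pd.Timestamp(v).timestamp() > 0)  # True once converted
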
@@ -284,4 +410,3 @@ class ClickHouseWriter(ManagedResource):
         finally:
             if hasattr(self._tlocal, "client"):
                 delattr(self._tlocal, "client")
-
--- a/sibi_dst/utils/dask_utils.py
+++ b/sibi_dst/utils/dask_utils.py
@@ -31,7 +31,7 @@ def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
     k = min(max(sample, 1), ddf.npartitions)
     probes = dask.compute(*[
         ddf.get_partition(i).map_partitions(len) for i in range(k)
-    ])
+    ], scheduler="threads")
 
     if any(_to_int_safe(n) > 0 for n in probes):
         return False
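
Note: dask.compute accepts a scheduler argument; pinning these small length probes to "threads" runs them on the local threaded scheduler regardless of any globally configured scheduler (for example, a distributed cluster client), so trivial tasks are not shipped over the network. A minimal sketch of the same pattern:

    import dask
    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=4)
    probes = dask.compute(
        *[ddf.get_partition(i).map_partitions(len) for i in range(2)],
        scheduler="threads",  # force the local threaded scheduler
    )
    print(probes)  # one length result per sampled partition
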
--- a/sibi_dst-2025.9.5.dist-info/METADATA
+++ b/sibi_dst-2025.9.6.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.9.5
+Version: 2025.9.6
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
--- a/sibi_dst-2025.9.5.dist-info/RECORD
+++ b/sibi_dst-2025.9.6.dist-info/RECORD
@@ -47,9 +47,9 @@ sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh
 sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
 sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
 sibi_dst/utils/business_days.py,sha256=dP0Xj4FhTBIvZZrZYLOHZl5zOpDAgWkD4p_1a7BOT7I,8461
-sibi_dst/utils/clickhouse_writer.py,sha256=JCjLfPfsDDAvoMJeh0uVqVL5Je6mPcZn-G_EL9Pk6ms,10364
+sibi_dst/utils/clickhouse_writer.py,sha256=AOv0bYFzAI0u4dEkEBoUqtHekwPMISdNT5-ywCtDe4I,17049
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
-sibi_dst/utils/dask_utils.py,sha256=FURwrNqij6ptxFhI4v7yaGkyOIIyW9lSPpMfE9-kxHY,1970
+sibi_dst/utils/dask_utils.py,sha256=QhFcmpH4fXAy6b3DugIX5JvH4h-P3M3hXKnBYTLRkq0,1991
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
 sibi_dst/utils/data_utils.py,sha256=7bLidEjppieNoozDFb6OuRY0W995cxg4tiGAlkGfePI,7768
 sibi_dst/utils/data_wrapper.py,sha256=axHOmCG9cBJgjf5m8jpzsCCZzXJgynGs44rGe6FUrzk,29906
@@ -93,6 +93,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.9.5.dist-info/METADATA,sha256=HjRtVuHQj3IFf2ABuponSz4ahNMtbTRetLqQC7TSJjc,2710
-sibi_dst-2025.9.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-sibi_dst-2025.9.5.dist-info/RECORD,,
+sibi_dst-2025.9.6.dist-info/METADATA,sha256=e9vt1wbHivyTJhyubiEjJcMFBNDF1m9nERTlBgYvq9o,2710
+sibi_dst-2025.9.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.9.6.dist-info/RECORD,,