sibi-dst 2025.9.5-py3-none-any.whl → 2025.9.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/sibi_dst/utils/clickhouse_writer.py
+++ b/sibi_dst/utils/clickhouse_writer.py
@@ -7,6 +7,7 @@ from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
 import pandas as pd
 import dask.dataframe as dd
 import clickhouse_connect
+import numpy as np
 
 from . import ManagedResource
 
@@ -27,6 +28,7 @@ class ClickHouseWriter(ManagedResource):
     - Optional overwrite (drop + recreate)
     - Partitioned, batched inserts
     - Per-thread clients to avoid session conflicts
+    - Proper PyArrow dtype handling
     """
 
     # Default dtype mapping (pandas/dask → ClickHouse)
@@ -109,7 +111,11 @@ class ClickHouseWriter(ManagedResource):
             return
 
         # lazily fill missing values per-partition (no global compute)
-        df = df.map_partitions(type(self)._fill_missing_partition, meta=df._meta)
+        # Use the new method that ensures correct types for ClickHouse
+        df = df.map_partitions(
+            type(self)._process_partition_for_clickhouse_compatible,
+            meta=df._meta
+        )
 
         # (re)create table
         ow = self.overwrite if overwrite is None else bool(overwrite)
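
Note: map_partitions applies a plain-pandas function to every partition lazily, and the meta argument declares the output schema up front, so swapping in the new per-partition processor still avoids any global compute. A minimal standalone sketch of the pattern (illustrative names, not code from the package):

    import dask.dataframe as dd
    import pandas as pd

    def _clean_partition(pdf: pd.DataFrame) -> pd.DataFrame:
        # Runs once per partition, on a plain pandas DataFrame.
        return pdf.fillna(0)

    ddf = dd.from_pandas(pd.DataFrame({"a": [1, None, 3]}), npartitions=2)
    # meta tells Dask the output schema; the call itself stays lazy.
    ddf = ddf.map_partitions(_clean_partition, meta=ddf._meta)
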
@@ -121,7 +127,7 @@ class ClickHouseWriter(ManagedResource):
             self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
             self.logger.info(f"Dropped table {self.table} (overwrite=True)")
 
-        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql};"
+        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}"
         self._command(create_sql)
         self.logger.info(f"Ensured table {self.table} exists")
 
@@ -159,6 +165,26 @@ class ClickHouseWriter(ManagedResource):
         return ", ".join(pieces)
 
     def _map_dtype(self, dtype: Any) -> str:
+        dtype_str = str(dtype).lower()
+        # Handle PyArrow dtypes
+        if "[pyarrow]" in dtype_str:
+            if "int64" in dtype_str:
+                return "Int64"
+            elif "int32" in dtype_str:
+                return "Int32"
+            elif "float64" in dtype_str or "double" in dtype_str:
+                return "Float64"
+            elif "float32" in dtype_str:
+                return "Float32"
+            elif "bool" in dtype_str:
+                return "UInt8"
+            elif "timestamp" in dtype_str:  # PyArrow timestamp
+                return "DateTime"
+            elif "string" in dtype_str:  # PyArrow string
+                return "String"
+            else:
+                return "String"  # fallback
+
         # Handle pandas extension dtypes explicitly
         if isinstance(dtype, pd.Int64Dtype):
             return "Int64"
@@ -170,19 +196,29 @@ class ClickHouseWriter(ManagedResource):
             return "Float64"
         if isinstance(dtype, pd.StringDtype):
             return "String"
-        if "datetime64" in str(dtype):
+        if "datetime64" in dtype_str:
             return "DateTime"
 
         return self.DTYPE_MAP.get(str(dtype), "String")
 
     def _should_mark_nullable(self, dtype: Any) -> bool:
-        s = str(dtype)
+        dtype_str = str(dtype).lower()
+        # PyArrow types are generally nullable, but let's be specific
+        if "[pyarrow]" in dtype_str:
+            # For PyArrow, make strings and timestamps nullable, numbers usually not unless data has nulls
+            base_type = dtype_str.replace("[pyarrow]", "")
+            if base_type in ["string", "large_string"] or "timestamp" in base_type:
+                return True
+            # For numeric PyArrow, check if the actual data contains nulls (hard to do here)
+            # Let's default to not nullable for numeric unless explicitly needed
+            return False  # Conservative for PyArrow numerics
+
         if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
             return True
-        if "datetime64" in s:
+        if "datetime64" in dtype_str:
             return True
         # object/category almost always nullable
-        if s in ("object", "category", "string"):
+        if dtype_str in ("object", "category", "string"):
             return True
         return False
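
Note: in ClickHouse DDL a nullable column wraps its base type, so the two helpers above presumably combine into fragments like the following when the schema string is assembled (hypothetical sketch; the actual assembly code sits outside this hunk):

    # Hypothetical rendering of a single column piece.
    name, ch_type, nullable = "created_at", "DateTime", True
    piece = f"`{name}` Nullable({ch_type})" if nullable else f"`{name}` {ch_type}"
    print(piece)  # `created_at` Nullable(DateTime)
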
@@ -203,6 +239,10 @@ class ClickHouseWriter(ManagedResource):
         # Ensure column ordering is stable
         cols = list(pdf.columns)
 
+        # --- CRITICAL FIX: Ensure datetime columns are compatible BEFORE insertion ---
+        # This is the key step to prevent the numpy.datetime64 error
+        pdf = self._ensure_clickhouse_compatible_datetime_types(pdf)
+
         # Split into batches (to avoid giant single insert)
         for start in range(0, len(pdf), self.insert_chunksize):
             batch = pdf.iloc[start:start + self.insert_chunksize]
@@ -215,30 +255,116 @@ class ClickHouseWriter(ManagedResource):
     def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
         client = self._get_client()
         # clickhouse-connect supports insert_df
+        # The df passed here should now have compatible datetime types
         client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
 
-    # ------------- missing values (lazy) -------------
+    # ------------- missing values & type conversion (lazy) -------------
 
     @staticmethod
-    def _fill_missing_partition(pdf: pd.DataFrame) -> pd.DataFrame:
-        # (unchanged body)
+    def _process_partition_for_clickhouse_compatible(pdf: pd.DataFrame) -> pd.DataFrame:
+        """
+        Process a partition to fill missing values and ensure initial data types are consistent.
+        This is the first step of data preparation.
+        """
+        pdf = pdf.copy()  # Avoid modifying original
+
         for col in pdf.columns:
             s = pdf[col]
-            if pd.api.types.is_integer_dtype(s.dtype):
+            dtype_str = str(s.dtype).lower()
+
+            # --- Handle PyArrow dtypes ---
+            if "[pyarrow]" in dtype_str:
+                try:
+                    if "string" in dtype_str:
+                        # Convert PyArrow string to object, fillna with empty string
+                        pdf[col] = s.astype('object').fillna("")
+                    elif "timestamp" in dtype_str:
+                        # Convert PyArrow timestamp to pandas datetime, NaT for nulls
+                        pdf[col] = pd.to_datetime(s, errors='coerce')  # errors='coerce' handles conversion issues
+                    elif "int" in dtype_str:
+                        # Convert PyArrow int to pandas int, fillna with 0 for non-nullable
+                        pdf[col] = s.fillna(0)
+                    elif "float" in dtype_str or "double" in dtype_str:
+                        pdf[col] = s.fillna(0.0)
+                    elif "bool" in dtype_str:
+                        pdf[col] = s.fillna(False)  # Or pd.NA if you prefer
+                    else:
+                        # Fallback: convert to object and then to string
+                        pdf[col] = s.astype('object').astype(str).fillna("")
+                except Exception as e:
+                    # If conversion fails, fall back to object and string
+                    pdf[col] = s.astype('object').astype(str).fillna("")
+
+            # --- Handle standard pandas dtypes ---
+            elif pd.api.types.is_integer_dtype(s.dtype):
                 if pd.api.types.is_extension_array_dtype(s.dtype):
                     pdf[col] = s.fillna(pd.NA)
                 else:
                     pdf[col] = s.fillna(0)
             elif pd.api.types.is_bool_dtype(s.dtype):
-                pdf[col] = s.fillna(pd.NA)
+                pdf[col] = s.fillna(pd.NA)  # Or False
             elif pd.api.types.is_float_dtype(s.dtype):
                 pdf[col] = s.fillna(0.0)
             elif pd.api.types.is_datetime64_any_dtype(s.dtype):
+                # Datetimes - leave as is for now, will be handled in final step
                 pass
             else:
-                pdf[col] = s.fillna("")
+                # For object/string/category columns, ensure they're strings
+                pdf[col] = s.astype(str).fillna("")
+
         return pdf
 
+    def _ensure_clickhouse_compatible_datetime_types(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Final conversion step: Ensure datetime columns are in a format compatible
+        with the clickhouse-connect driver. Specifically, convert numpy.datetime64 to
+        pandas.Timestamp or Python datetime objects.
+        This is called just before insertion.
+        """
+        df = df.copy()
+        for col in df.columns:
+            s = df[col]
+            # Check if the column is datetime-like
+            if pd.api.types.is_datetime64_any_dtype(s.dtype):
+                # --- Robust conversion to ensure compatibility ---
+                # 1. Convert to pandas datetime explicitly
+                df[col] = pd.to_datetime(s, utc=True)  # Ensures timezone handling
+
+                # 2. Replace NaT with None for nullable columns (clickhouse-connect handles this)
+                # This is often sufficient, but let's be extra sure about the object type
+                # 3. Ensure the underlying objects are pandas.Timestamp (which have .timestamp())
+                # The pd.to_datetime should handle this, but accessing the .dt accessor reinforces it.
+                # If there are still issues, we can force object conversion:
+                # df[col] = df[col].dt.to_pydatetime()  # Converts to numpy array of datetime64 or None
+                # But pd.Timestamp is better. Let's try accessing .dt to ensure it's proper:
+                try:
+                    _ = df[col].dt  # Accessing .dt confirms it's datetime-like
+                except:
+                    # If .dt fails, it means conversion wasn't clean, force it
+                    self.logger.debug(f"Forcing datetime conversion for column {col}")
+                    df[col] = pd.to_datetime(df[col].astype('object'), utc=True)
+
+                # --- Final check and explicit conversion if needed ---
+                # If the error persists, we might need to explicitly convert the array elements.
+                # Let's add a check for the first non-null element in a sample:
+                sample_series = df[col].dropna()
+                if len(sample_series) > 0:
+                    first_val = sample_series.iloc[0]
+                    if isinstance(first_val, np.datetime64):
+                        self.logger.warning(f"Column {col} still contains numpy.datetime64 after conversion. Forcing object conversion.")
+                        # Force conversion to object array of pandas.Timestamp or None
+                        def convert_val(v):
+                            if pd.isna(v):
+                                return None
+                            if isinstance(v, np.datetime64):
+                                # Convert numpy.datetime64 to pandas.Timestamp
+                                return pd.Timestamp(v)
+                            return v
+                        df[col] = df[col].apply(convert_val)
+
+        return df
+
+
     # ------------- low-level helpers -------------
 
     def _get_client(self):
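
Note: the numpy.datetime64 guard above is easy to reproduce outside the package. Values pulled from a datetime64[ns] Series normally come back as pd.Timestamp, which, unlike a raw numpy.datetime64, carries the methods a driver may call (such as .timestamp()); converting first makes the value safe. A small demonstration (illustrative, not from the diff):

    import numpy as np
    import pandas as pd

    s = pd.Series(pd.to_datetime(["2025-01-01", None]))
    print(type(s.dropna().iloc[0]))         # pandas Timestamp, not numpy.datetime64

    v = np.datetime64("2025-01-01")
    print(hasattr(v, "timestamp"))          # False: the raw numpy value lacks the method
    print(pd.Timestamp(v).timestamp() > 0)  # True once converted
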
@@ -284,4 +410,3 @@ class ClickHouseWriter(ManagedResource):
         finally:
             if hasattr(self._tlocal, "client"):
                 delattr(self._tlocal, "client")
-
--- a/sibi_dst/utils/dask_utils.py
+++ b/sibi_dst/utils/dask_utils.py
@@ -31,7 +31,7 @@ def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
     k = min(max(sample, 1), ddf.npartitions)
     probes = dask.compute(*[
         ddf.get_partition(i).map_partitions(len) for i in range(k)
-    ])
+    ], scheduler="threads")
 
     if any(_to_int_safe(n) > 0 for n in probes):
         return False
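
Note: dask.compute accepts a scheduler argument; pinning these small length probes to "threads" runs them on the local threaded scheduler regardless of any globally configured scheduler (for example, a distributed cluster client), so trivial tasks are not shipped over the network. A minimal sketch of the same pattern:

    import dask
    import dask.dataframe as dd
    import pandas as pd

    ddf = dd.from_pandas(pd.DataFrame({"a": range(10)}), npartitions=4)
    probes = dask.compute(
        *[ddf.get_partition(i).map_partitions(len) for i in range(2)],
        scheduler="threads",  # force the local threaded scheduler
    )
    print(probes)  # one length result per sampled partition
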
--- a/sibi_dst-2025.9.5.dist-info/METADATA
+++ b/sibi_dst-2025.9.6.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.9.5
+Version: 2025.9.6
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
--- a/sibi_dst-2025.9.5.dist-info/RECORD
+++ b/sibi_dst-2025.9.6.dist-info/RECORD
@@ -47,9 +47,9 @@ sibi_dst/utils/boilerplate/base_parquet_artifact.py,sha256=oqPbjHFfChA9j1WL-eDAh
 sibi_dst/utils/boilerplate/base_parquet_reader.py,sha256=3kN9_bbxyX-WuJLMBWejeApW2V_BDArSljhSUOAOhVU,692
 sibi_dst/utils/boilerplate/hybrid_data_loader.py,sha256=Tazn7QL3FmWKVMXxzkvxPrG_2ucsPHvSotIW9dBLoNc,6018
 sibi_dst/utils/business_days.py,sha256=dP0Xj4FhTBIvZZrZYLOHZl5zOpDAgWkD4p_1a7BOT7I,8461
-sibi_dst/utils/clickhouse_writer.py,sha256=JCjLfPfsDDAvoMJeh0uVqVL5Je6mPcZn-G_EL9Pk6ms,10364
+sibi_dst/utils/clickhouse_writer.py,sha256=AOv0bYFzAI0u4dEkEBoUqtHekwPMISdNT5-ywCtDe4I,17049
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
-sibi_dst/utils/dask_utils.py,sha256=FURwrNqij6ptxFhI4v7yaGkyOIIyW9lSPpMfE9-kxHY,1970
+sibi_dst/utils/dask_utils.py,sha256=QhFcmpH4fXAy6b3DugIX5JvH4h-P3M3hXKnBYTLRkq0,1991
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
 sibi_dst/utils/data_utils.py,sha256=7bLidEjppieNoozDFb6OuRY0W995cxg4tiGAlkGfePI,7768
 sibi_dst/utils/data_wrapper.py,sha256=axHOmCG9cBJgjf5m8jpzsCCZzXJgynGs44rGe6FUrzk,29906
@@ -93,6 +93,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.9.5.dist-info/METADATA,sha256=HjRtVuHQj3IFf2ABuponSz4ahNMtbTRetLqQC7TSJjc,2710
-sibi_dst-2025.9.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-sibi_dst-2025.9.5.dist-info/RECORD,,
+sibi_dst-2025.9.6.dist-info/METADATA,sha256=e9vt1wbHivyTJhyubiEjJcMFBNDF1m9nERTlBgYvq9o,2710
+sibi_dst-2025.9.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.9.6.dist-info/RECORD,,