sibi-dst 2025.8.5.tar.gz → 2025.8.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/PKG-INFO +1 -1
  2. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/pyproject.toml +1 -1
  3. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/_df_helper.py +40 -6
  4. sibi_dst-2025.8.7/sibi_dst/utils/async_utils.py +12 -0
  5. sibi_dst-2025.8.7/sibi_dst/utils/clickhouse_writer.py +264 -0
  6. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/storage_config.py +2 -2
  7. sibi_dst-2025.8.7/sibi_dst/utils/storage_hive.py +195 -0
  8. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/storage_manager.py +3 -2
  9. sibi_dst-2025.8.5/sibi_dst/utils/clickhouse_writer.py +0 -501
  10. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/README.md +0 -0
  11. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/__init__.py +0 -0
  12. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/__init__.py +0 -0
  13. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
  14. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  15. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  16. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  17. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/__init__.py +0 -0
  18. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  19. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  20. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  21. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  22. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  23. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  24. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  25. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  26. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  27. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  28. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  29. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/core/__init__.py +0 -0
  30. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/core/_defaults.py +0 -0
  31. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  32. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/core/_params_config.py +0 -0
  33. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/core/_query_config.py +0 -0
  34. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/df_helper/data_cleaner.py +0 -0
  35. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/geopy_helper/__init__.py +0 -0
  36. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  37. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/geopy_helper/utils.py +0 -0
  38. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/__init__.py +0 -0
  39. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  40. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  41. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  42. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  43. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  44. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  45. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/osmnx_helper/utils.py +0 -0
  46. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/tests/__init__.py +0 -0
  47. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  48. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/__init__.py +0 -0
  49. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/base.py +0 -0
  50. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/business_days.py +0 -0
  51. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/credentials.py +0 -0
  52. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/data_from_http_source.py +0 -0
  53. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/data_utils.py +0 -0
  54. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/data_wrapper.py +0 -0
  55. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/date_utils.py +0 -0
  56. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/df_utils.py +0 -0
  57. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/file_age_checker.py +0 -0
  58. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/file_utils.py +0 -0
  59. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/filepath_generator.py +0 -0
  60. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/log_utils.py +0 -0
  61. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/manifest_manager.py +0 -0
  62. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/parquet_saver.py +0 -0
  63. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/periods.py +0 -0
  64. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/phone_formatter.py +0 -0
  65. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/update_planner.py +0 -0
  66. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/utils/webdav_client.py +0 -0
  67. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/__init__.py +0 -0
  68. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/__init__.py +0 -0
  69. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  70. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  71. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  72. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  73. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  74. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  75. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  76. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  77. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  78. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  79. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  80. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  81. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  82. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  83. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  84. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  85. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/utils/__init__.py +0 -0
  86. {sibi_dst-2025.8.5 → sibi_dst-2025.8.7}/sibi_dst/v2/utils/log_utils.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.8.5
+Version: 2025.8.7
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sibi-dst"
-version = "2025.8.5"
+version = "2025.8.7"
 description = "Data Science Toolkit"
 authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
 readme = "README.md"
sibi_dst/df_helper/_df_helper.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import asyncio
 from typing import Any, Dict, Optional, TypeVar, Union

 import dask.dataframe as dd
@@ -104,7 +105,6 @@ class HttpBackend(BaseBackend):
         return self.total_records, result


-# ---- Main DfHelper ----
 class DfHelper(ManagedResource):
     _BACKEND_STRATEGIES = {
         "sqlalchemy": SqlAlchemyBackend,
@@ -198,6 +198,37 @@ class DfHelper(ManagedResource):
         df = df.persist() if persist else df
         return df.compute() if as_pandas else df

+    async def load_async(
+        self,
+        *,
+        persist: bool = False,
+        as_pandas: bool = False,
+        prefer_native: bool = False,
+        **options,
+    ):
+        """
+        Async load that prefers native async backends when available,
+        otherwise runs the sync `load()` in a worker thread via asyncio.to_thread.
+
+        Args:
+            persist: same as `load`
+            as_pandas: same as `load`
+            prefer_native: if True and the backend overrides `aload`, use it;
+                otherwise force thread offload of `load()`.
+            **options: forwarded to `load` / `aload`
+        """
+        # If the backend provided an override for `aload`, use it
+        if prefer_native and type(self.backend_strategy).aload is not BaseBackend.aload:
+            return await self.aload(persist=persist, as_pandas=as_pandas, **options)
+
+        # Fall back to offloading the sync path to a thread
+        return await asyncio.to_thread(
+            self.load,
+            persist=persist,
+            as_pandas=as_pandas,
+            **options,
+        )
+
     # ---------- dataframe post-processing ----------
     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
         self.logger.debug("Post-processing DataFrame.")
@@ -240,9 +271,12 @@ class DfHelper(ManagedResource):
         return df

     # ---------- sinks ----------
-    def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
-        fs: AbstractFileSystem = kwargs.get("fs", self.fs)
-        path: str = kwargs.get("parquet_storage_path")
+    def save_to_parquet(self, df: dd.DataFrame, **kwargs):
+        fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
+        path: str = kwargs.pop("parquet_storage_path", self.backend_parquet.parquet_storage_path if self.backend_parquet else None)
+        parquet_filename = kwargs.pop("parquet_filename" or self._backend_params.parquet_filename if self.backend_parquet else None)
+        if not parquet_filename:
+            raise ValueError("A 'parquet_filename' keyword argument must be provided.")
         if not fs:
             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
         if not path:
@@ -268,11 +302,11 @@ class DfHelper(ManagedResource):
         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
             return
-        with ClickHouseWriter(debug=self.debug, logger=self.logger, fs=self.fs, verbose=self.verbose, **credentials) as writer:
+        with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
             writer.save_to_clickhouse(df)
         self.logger.debug("Save to ClickHouse completed.")

-    # ---------- convenience period loaders ----------
+    # ---------- period loaders ----------
     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
         return self.load(**final_kwargs)
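Usage sketch (not part of the package diff): calling the new `load_async` from async code. The helper construction is omitted here; only the `load_async` signature and behavior come from the hunk above.

import asyncio

async def refresh(helper):
    # With prefer_native=True, a backend that overrides `aload` is awaited
    # directly; otherwise the sync load() runs in a worker thread.
    return await helper.load_async(as_pandas=False, prefer_native=True)

# asyncio.run(refresh(my_df_helper))  # `my_df_helper` is an already-configured DfHelper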
sibi_dst/utils/async_utils.py (new file)
@@ -0,0 +1,12 @@
+import asyncio
+import dask.dataframe as dd
+
+
+def is_dask_dataframe(df):
+    """Check if the given object is a Dask DataFrame."""
+    return isinstance(df, dd.DataFrame)
+
+async def to_thread(func, *args, **kwargs):
+    """Explicit helper to keep code clear where we hop off the event loop."""
+    return await asyncio.to_thread(func, *args, **kwargs)
+
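A quick sketch of how these helpers can be used (illustrative, not part of the package):

import asyncio
import pandas as pd
import dask.dataframe as dd
from sibi_dst.utils.async_utils import is_dask_dataframe, to_thread

def build_frame():
    # Blocking work we want to keep off the event loop.
    return dd.from_pandas(pd.DataFrame({"id": [1, 2, 3]}), npartitions=1)

async def main():
    df = await to_thread(build_frame)   # hop off the event loop
    assert is_dask_dataframe(df)

asyncio.run(main())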
sibi_dst/utils/clickhouse_writer.py (new file)
@@ -0,0 +1,264 @@
+from __future__ import annotations
+
+import threading
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
+
+import pandas as pd
+import dask.dataframe as dd
+import clickhouse_connect
+
+from . import ManagedResource
+
+
+class ClickHouseWriter(ManagedResource):
+    """
+    Write a Dask DataFrame to ClickHouse with:
+      - Safe Dask checks (no df.empty)
+      - Nullable dtype mapping
+      - Optional overwrite (drop + recreate)
+      - Partitioned, batched inserts
+      - Per-thread clients to avoid session conflicts
+    """
+
+    # Default dtype mapping (pandas/dask → ClickHouse)
+    DTYPE_MAP: ClassVar[Dict[str, str]] = {
+        "int64": "Int64",
+        "Int64": "Int64",  # pandas nullable Int64
+        "int32": "Int32",
+        "Int32": "Int32",
+        "float64": "Float64",
+        "Float64": "Float64",
+        "float32": "Float32",
+        "bool": "UInt8",
+        "boolean": "UInt8",
+        "object": "String",
+        "string": "String",
+        "category": "String",
+        "datetime64[ns]": "DateTime",
+        "datetime64[ns, UTC]": "DateTime",
+    }
+
+    def __init__(
+        self,
+        *,
+        host: str = "localhost",
+        port: int = 8123,
+        database: str = "sibi_data",
+        user: str = "default",
+        password: str = "",
+        table: str = "test_sibi_table",
+        order_by: str = "id",
+        engine: Optional[str] = None,  # e.g. "ENGINE MergeTree ORDER BY (`id`)"
+        max_workers: int = 4,
+        insert_chunksize: int = 50_000,
+        overwrite: bool = False,
+        **kwargs: Any,
+    ):
+        super().__init__(**kwargs)
+        self.host = host
+        self.port = int(port)
+        self.database = database
+        self.user = user
+        self.password = password
+        self.table = table
+        self.order_by = order_by
+        self.engine = engine  # if None → default MergeTree ORDER BY
+        self.max_workers = int(max_workers)
+        self.insert_chunksize = int(insert_chunksize)
+        self.overwrite = bool(overwrite)
+
+        # one client per thread to avoid session contention
+        self._tlocal = threading.local()
+
+    # ------------- public -------------
+
+    def save_to_clickhouse(self, df: dd.DataFrame, *, overwrite: Optional[bool] = None) -> None:
+        """
+        Persist a Dask DataFrame into ClickHouse.
+
+        Args:
+            df: Dask DataFrame
+            overwrite: Optional override for dropping/recreating table
+        """
+        if not isinstance(df, dd.DataFrame):
+            raise TypeError("ClickHouseWriter.save_to_clickhouse expects a dask.dataframe.DataFrame.")
+
+        # small, cheap check: head(1) to detect empty
+        head = df.head(1, npartitions=-1, compute=True)
+        if head.empty:
+            self.logger.info("Dask DataFrame appears empty (head(1) returned 0 rows). Nothing to write.")
+            return
+
+        # lazily fill missing values per-partition (no global compute)
+        df = df.map_partitions(type(self)._fill_missing_partition, meta=df._meta)
+
+        # (re)create table
+        ow = self.overwrite if overwrite is None else bool(overwrite)
+        dtypes = df._meta_nonempty.dtypes  # metadata-only types (no compute)
+        schema_sql = self._generate_clickhouse_schema(dtypes)
+        engine_sql = self._default_engine_sql() if not self.engine else self.engine
+
+        if ow:
+            self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
+            self.logger.info(f"Dropped table {self.table} (overwrite=True)")
+
+        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql};"
+        self._command(create_sql)
+        self.logger.info(f"Ensured table {self.table} exists")
+
+        # write partitions concurrently
+        parts = list(df.to_delayed())
+        if not parts:
+            self.logger.info("No partitions to write.")
+            return
+
+        self.logger.info(f"Writing {len(parts)} partitions to ClickHouse (max_workers={self.max_workers})")
+        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
+            futures = {ex.submit(self._write_one_partition, part, idx): idx for idx, part in enumerate(parts)}
+            for fut in as_completed(futures):
+                idx = futures[fut]
+                try:
+                    fut.result()
+                except Exception as e:
+                    self.logger.error(f"Partition {idx} failed: {e}", exc_info=self.debug)
+                    raise
+
+        self.logger.info(f"Completed writing {len(parts)} partitions to {self.table}")
+
+    # ------------- schema & types -------------
+
+    def _generate_clickhouse_schema(self, dask_dtypes: pd.Series) -> str:
+        cols: Iterable[Tuple[str, Any]] = dask_dtypes.items()
+        pieces = []
+        for col, dtype in cols:
+            ch_type = self._map_dtype(dtype)
+            # Use Nullable for non-numeric/string columns that may carry NaN/None,
+            # and for datetimes to be safe with missing values.
+            if self._should_mark_nullable(dtype):
+                ch_type = f"Nullable({ch_type})"
+            pieces.append(f"{self._ident(col)} {ch_type}")
+        return ", ".join(pieces)
+
+    def _map_dtype(self, dtype: Any) -> str:
+        # Handle pandas extension dtypes explicitly
+        if isinstance(dtype, pd.Int64Dtype):
+            return "Int64"
+        if isinstance(dtype, pd.Int32Dtype):
+            return "Int32"
+        if isinstance(dtype, pd.BooleanDtype):
+            return "UInt8"
+        if isinstance(dtype, pd.Float64Dtype):
+            return "Float64"
+        if isinstance(dtype, pd.StringDtype):
+            return "String"
+        if "datetime64" in str(dtype):
+            return "DateTime"
+
+        return self.DTYPE_MAP.get(str(dtype), "String")
+
+    def _should_mark_nullable(self, dtype: Any) -> bool:
+        s = str(dtype)
+        if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
+            return True
+        if "datetime64" in s:
+            return True
+        # object/category almost always nullable
+        if s in ("object", "category", "string"):
+            return True
+        return False
+
+    def _default_engine_sql(self) -> str:
+        # minimal MergeTree clause; quote order_by safely
+        ob = self.order_by if self.order_by.startswith("(") else f"(`{self.order_by}`)"
+        return f"ENGINE = MergeTree ORDER BY {ob}"
+
+    # ------------- partition write -------------
+
+    def _write_one_partition(self, part, index: int) -> None:
+        # Compute partition → pandas
+        pdf: pd.DataFrame = part.compute()
+        if pdf.empty:
+            self.logger.debug(f"Partition {index} empty; skipping")
+            return
+
+        # Ensure column ordering is stable
+        cols = list(pdf.columns)
+
+        # Split into batches (to avoid giant single insert)
+        for start in range(0, len(pdf), self.insert_chunksize):
+            batch = pdf.iloc[start:start + self.insert_chunksize]
+            if batch.empty:
+                continue
+            self._insert_df(cols, batch)
+
+        self.logger.debug(f"Partition {index} inserted ({len(pdf)} rows)")
+
+    def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
+        client = self._get_client()
+        # clickhouse-connect supports insert_df
+        client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
+
+    # ------------- missing values (lazy) -------------
+
+    @staticmethod
+    def _fill_missing_partition(pdf: pd.DataFrame) -> pd.DataFrame:
+        # (unchanged body)
+        for col in pdf.columns:
+            s = pdf[col]
+            if pd.api.types.is_integer_dtype(s.dtype):
+                if pd.api.types.is_extension_array_dtype(s.dtype):
+                    pdf[col] = s.fillna(pd.NA)
+                else:
+                    pdf[col] = s.fillna(0)
+            elif pd.api.types.is_bool_dtype(s.dtype):
+                pdf[col] = s.fillna(pd.NA)
+            elif pd.api.types.is_float_dtype(s.dtype):
+                pdf[col] = s.fillna(0.0)
+            elif pd.api.types.is_datetime64_any_dtype(s.dtype):
+                pass
+            else:
+                pdf[col] = s.fillna("")
+        return pdf
+
+    # ------------- low-level helpers -------------
+
+    def _get_client(self):
+        cli = getattr(self._tlocal, "client", None)
+        if cli is not None:
+            return cli
+        cli = clickhouse_connect.get_client(
+            host=self.host,
+            port=self.port,
+            database=self.database,
+            username=self.user,  # clickhouse-connect uses 'username'
+            password=self.password,
+        )
+        self._tlocal.client = cli
+        return cli
+
+    def _command(self, sql: str) -> None:
+        client = self._get_client()
+        client.command(sql)
+
+    @staticmethod
+    def _ident(name: str) -> str:
+        # minimal identifier quoting
+        if name.startswith("`") and name.endswith("`"):
+            return name
+        return f"`{name}`"
+
+    # ------------- context cleanup -------------
+
+    def _cleanup(self):
+        # close client in this thread (the manager calls _cleanup in the owning thread)
+        cli = getattr(self._tlocal, "client", None)
+        try:
+            if cli is not None:
+                cli.close()
+        except Exception:
+            pass
+        finally:
+            if hasattr(self._tlocal, "client"):
+                delattr(self._tlocal, "client")
+
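A minimal usage sketch for the new writer (not part of the diff). The connection settings mirror the constructor defaults above; the table name and sample data are placeholders, and a reachable ClickHouse server is assumed.

import pandas as pd
import dask.dataframe as dd
from sibi_dst.utils.clickhouse_writer import ClickHouseWriter

ddf = dd.from_pandas(
    pd.DataFrame({"id": [1, 2, 3], "amount": [10.0, 20.5, 30.25]}),
    npartitions=2,
)

with ClickHouseWriter(
    host="localhost",
    port=8123,
    database="sibi_data",
    user="default",
    password="",
    table="demo_table",      # placeholder table name
    order_by="id",
    overwrite=True,          # drop + recreate before writing
) as writer:
    writer.save_to_clickhouse(ddf)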
sibi_dst/utils/storage_config.py
@@ -6,13 +6,13 @@ from .storage_manager import StorageManager
 from .credentials import ConfigManager

 class StorageConfig:
-    def __init__(self, config:ConfigManager, depots:dict=None):
+    def __init__(self, config:ConfigManager, depots:dict=None, clear_existing=False, write_mode="full-access"):
         self.conf = config
         self.depots = depots
         self._initialize_storage()
         self.storage_manager = StorageManager(self.base_storage, self.filesystem_type, self.filesystem_options)
         if self.depots is not None:
-            self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots)
+            self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots, clear_existing=clear_existing, write_mode=write_mode)
         else:
             self.depot_paths = None
             self.depot_names = None
sibi_dst/utils/storage_hive.py (new file)
@@ -0,0 +1,195 @@
+from __future__ import annotations
+import pandas as pd
+import dask.dataframe as dd
+from typing import Iterable, Optional, List, Tuple, Union
+import fsspec
+
+DNFFilter = List[List[Tuple[str, str, Union[str, int]]]]
+
+
+class HiveDatePartitionedStore:
+    """
+    Dask-only Parquet store with Hive-style yyyy=…/mm=…/dd=… partitions.
+
+    - `write(...)` safely "overwrites" S3 prefixes via per-object deletes (no bulk DeleteObjects).
+    - `read_range(...)` builds DNF filters and auto-matches partition types (string vs int).
+    """
+
+    def __init__(
+        self,
+        path: str,
+        *,
+        filesystem=None,  # fsspec filesystem or None to infer from path
+        date_col: str = "tracking_dt",
+        compression: str = "zstd",
+        partition_values_as_strings: bool = True,  # keep mm=07, dd=01 folder names
+        logger=None,
+    ) -> None:
+        self.path = path
+        self.fs = filesystem or fsspec.open(path).fs
+        self.date_col = date_col
+        self.compression = compression
+        self.partition_values_as_strings = partition_values_as_strings
+        self.log = logger
+
+    # ----------------- public API -----------------
+
+    def write(
+        self,
+        df: dd.DataFrame,
+        *,
+        repartition: Optional[int] = None,
+        overwrite: bool = False,
+    ) -> None:
+        """Write Dask DataFrame to Hive-style yyyy/mm/dd partitions."""
+        self._require_col(df, self.date_col)
+        ser = dd.to_datetime(df[self.date_col], errors="coerce")
+
+        if self.partition_values_as_strings:
+            parts = {
+                "yyyy": ser.dt.strftime("%Y"),
+                "mm": ser.dt.strftime("%m"),
+                "dd": ser.dt.strftime("%d"),
+            }
+        else:
+            parts = {
+                "yyyy": ser.dt.year.astype("int32"),
+                "mm": ser.dt.month.astype("int8"),
+                "dd": ser.dt.day.astype("int8"),
+            }
+
+        df = df.assign(**{self.date_col: ser}, **parts)
+
+        if repartition:
+            df = df.repartition(npartitions=repartition)
+
+        if overwrite:
+            self._safe_rm_prefix(self.path)
+
+        if self.log:
+            self.log.info(f"Writing parquet to {self.path} (hive yyyy/mm/dd)…")
+
+        df.to_parquet(
+            self.path,
+            engine="pyarrow",
+            write_index=False,
+            filesystem=self.fs,
+            partition_on=["yyyy", "mm", "dd"],
+            compression=self.compression,
+            overwrite=False,  # we pre-cleaned if overwrite=True
+        )
+
+    def read_range(
+        self,
+        start: Union[str, pd.Timestamp],
+        end: Union[str, pd.Timestamp],
+        *,
+        columns: Optional[Iterable[str]] = None,
+    ) -> dd.DataFrame:
+        """
+        Read a date window with partition pruning. Tries string filters first,
+        falls back to integer filters if Arrow infers partition types as ints.
+        """
+        str_filters = self._dnf_filters_for_range_str(start, end)
+        try:
+            return dd.read_parquet(
+                self.path,
+                engine="pyarrow",
+                filesystem=self.fs,
+                columns=list(columns) if columns else None,
+                filters=str_filters,
+            )
+        except Exception:
+            int_filters = self._dnf_filters_for_range_int(start, end)
+            return dd.read_parquet(
+                self.path,
+                engine="pyarrow",
+                filesystem=self.fs,
+                columns=list(columns) if columns else None,
+                filters=int_filters,
+            )
+
+    # Convenience: full month / single day
+    def read_month(self, year: int, month: int, *, columns=None) -> dd.DataFrame:
+        start = pd.Timestamp(year=year, month=month, day=1)
+        end = (start + pd.offsets.MonthEnd(0))
+        return self.read_range(start, end, columns=columns)
+
+    def read_day(self, year: int, month: int, day: int, *, columns=None) -> dd.DataFrame:
+        ts = pd.Timestamp(year=year, month=month, day=day)
+        return self.read_range(ts, ts, columns=columns)
+
+    # ----------------- internals -----------------
+
+    @staticmethod
+    def _pad2(n: int) -> str:
+        return f"{n:02d}"
+
+    def _safe_rm_prefix(self, path: str) -> None:
+        """Per-object delete to avoid S3 bulk DeleteObjects (and Content-MD5 issues)."""
+        if not self.fs.exists(path):
+            return
+        if self.log:
+            self.log.info(f"Cleaning prefix (safe delete): {path}")
+        for k in self.fs.find(path):
+            try:
+                (self.fs.rm_file(k) if hasattr(self.fs, "rm_file") else self.fs.rm(k, recursive=False))
+            except Exception as e:
+                if self.log:
+                    self.log.warning(f"Could not delete {k}: {e}")
+
+    @staticmethod
+    def _require_col(df: dd.DataFrame, col: str) -> None:
+        if col not in df.columns:
+            raise KeyError(f"'{col}' not in DataFrame")
+
+    # ---- DNF builders (string vs int) ----
+    def _dnf_filters_for_range_str(self, start, end) -> DNFFilter:
+        s, e = pd.Timestamp(start), pd.Timestamp(end)
+        if s > e:
+            raise ValueError("start > end")
+        sY, sM, sD = s.year, s.month, s.day
+        eY, eM, eD = e.year, e.month, e.day
+        p2 = self._pad2
+        if sY == eY and sM == eM:
+            return [[("yyyy","==",str(sY)),("mm","==",p2(sM)),("dd",">=",p2(sD)),("dd","<=",p2(eD))]]
+        clauses: DNFFilter = [
+            [("yyyy","==",str(sY)),("mm","==",p2(sM)),("dd",">=",p2(sD))],
+            [("yyyy","==",str(eY)),("mm","==",p2(eM)),("dd","<=",p2(eD))]
+        ]
+        if sY == eY:
+            for m in range(sM+1, eM):
+                clauses.append([("yyyy","==",str(sY)),("mm","==",p2(m))])
+            return clauses
+        for m in range(sM+1, 13):
+            clauses.append([("yyyy","==",str(sY)),("mm","==",p2(m))])
+        for y in range(sY+1, eY):
+            clauses.append([("yyyy","==",str(y))])
+        for m in range(1, eM):
+            clauses.append([("yyyy","==",str(eY)),("mm","==",p2(m))])
+        return clauses
+
+    @staticmethod
+    def _dnf_filters_for_range_int(start, end) -> DNFFilter:
+        s, e = pd.Timestamp(start), pd.Timestamp(end)
+        if s > e:
+            raise ValueError("start > end")
+        sY, sM, sD = s.year, s.month, s.day
+        eY, eM, eD = e.year, e.month, e.day
+        if sY == eY and sM == eM:
+            return [[("yyyy","==",sY),("mm","==",sM),("dd",">=",sD),("dd","<=",eD)]]
+        clauses: DNFFilter = [
+            [("yyyy","==",sY),("mm","==",sM),("dd",">=",sD)],
+            [("yyyy","==",eY),("mm","==",eM),("dd","<=",eD)],
+        ]
+        if sY == eY:
+            for m in range(sM+1, eM):
+                clauses.append([("yyyy","==",sY),("mm","==",m)])
+            return clauses
+        for m in range(sM+1, 13):
+            clauses.append([("yyyy","==",sY),("mm","==",m)])
+        for y in range(sY+1, eY):
+            clauses.append([("yyyy","==",y)])
+        for m in range(1, eM):
+            clauses.append([("yyyy","==",eY),("mm","==",m)])
+        return clauses
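A usage sketch for the new store (not part of the diff). The local path and sample data are placeholders; an S3 prefix with an s3fs filesystem works the same way.

import pandas as pd
import dask.dataframe as dd
import fsspec
from sibi_dst.utils.storage_hive import HiveDatePartitionedStore

store = HiveDatePartitionedStore(
    "/tmp/warehouse/events",                 # placeholder path
    filesystem=fsspec.filesystem("file"),
    date_col="tracking_dt",
)

ddf = dd.from_pandas(
    pd.DataFrame({
        "tracking_dt": pd.to_datetime(["2025-07-01", "2025-07-02"]),
        "value": [1, 2],
    }),
    npartitions=1,
)

store.write(ddf, overwrite=True)                      # lands under yyyy=2025/mm=07/dd=...
july = store.read_range("2025-07-01", "2025-07-31")   # partition-pruned Dask read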
sibi_dst/utils/storage_manager.py
@@ -83,7 +83,7 @@ class StorageManager:
            self.fs.rm(sub_path, recursive=True)
            self.fs.mkdirs(sub_path, exist_ok=True)

-    def rebuild_depot_paths(self, depots, clear_existing=False):
+    def rebuild_depot_paths(self, depots, clear_existing=False, write_mode="full-access"):
        """
        Rebuilds depot_paths (dictionary) and depot_name (SimpleNamespace).
        Handles clear_existing scenario by resetting directories when required.
@@ -96,7 +96,8 @@
            depot_path = self.join_paths(self.storage_path, depot)
            if self.debug:
                print(f"Rebuilding depot at: {depot_path}")
-            self.setup_directories(depot_path, sub_directories, clear_existing=clear_existing)
+            if write_mode == "full-access":
+                self.setup_directories(depot_path, sub_directories, clear_existing=clear_existing)

        # Generate depot_paths dictionary
        self.depot_paths = {
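A sketch of the new write_mode gate (not part of the diff). The StorageManager constructor arguments and the depots layout shown here are assumptions; only the rebuild_depot_paths keyword arguments come from the hunks above, and any value other than "full-access" skips setup_directories.

from sibi_dst.utils.storage_manager import StorageManager

# Assumed arguments: a local base path, the "file" protocol, and no extra filesystem options.
sm = StorageManager("/tmp/storage", "file", {})

depots = {"sales_depot": ["raw", "curated"]}   # assumed depot -> sub-directory layout

# With write_mode != "full-access", only the path mapping is rebuilt;
# no directories are created or cleared.
depot_paths, depot_names = sm.rebuild_depot_paths(
    depots, clear_existing=False, write_mode="read-only"
)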