sibi-dst 2025.8.5__tar.gz → 2025.8.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/PKG-INFO +1 -1
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/pyproject.toml +1 -1
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/_df_helper.py +3 -3
- sibi_dst-2025.8.6/sibi_dst/utils/clickhouse_writer.py +264 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/storage_config.py +2 -2
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/storage_manager.py +3 -2
- sibi_dst-2025.8.5/sibi_dst/utils/clickhouse_writer.py +0 -501
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/README.md +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/business_days.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/data_wrapper.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/file_age_checker.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/periods.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -241,8 +241,8 @@ class DfHelper(ManagedResource):
|
|
241
241
|
|
242
242
|
# ---------- sinks ----------
|
243
243
|
def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
|
244
|
-
fs: AbstractFileSystem = kwargs.
|
245
|
-
path: str = kwargs.
|
244
|
+
fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
|
245
|
+
path: str = kwargs.pop("parquet_storage_path")
|
246
246
|
if not fs:
|
247
247
|
raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
|
248
248
|
if not path:
|
@@ -268,7 +268,7 @@ class DfHelper(ManagedResource):
|
|
268
268
|
if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
|
269
269
|
self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
|
270
270
|
return
|
271
|
-
with ClickHouseWriter(debug=self.debug, logger=self.logger,
|
271
|
+
with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
|
272
272
|
writer.save_to_clickhouse(df)
|
273
273
|
self.logger.debug("Save to ClickHouse completed.")
|
274
274
|
|
@@ -0,0 +1,264 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import threading
|
4
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
5
|
+
from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
|
6
|
+
|
7
|
+
import pandas as pd
|
8
|
+
import dask.dataframe as dd
|
9
|
+
import clickhouse_connect
|
10
|
+
|
11
|
+
from . import ManagedResource
|
12
|
+
|
13
|
+
|
14
|
+
class ClickHouseWriter(ManagedResource):
    """
    Write a Dask DataFrame to ClickHouse with:
    - Safe Dask checks (no df.empty)
    - Nullable dtype mapping
    - Optional overwrite (drop + recreate)
    - Partitioned, batched inserts
    - Per-thread clients to avoid session conflicts; every client that gets
      created (including those of pool worker threads) is tracked so that
      ``_cleanup`` can close all of them.
    """

    # Default dtype mapping (pandas/dask → ClickHouse), keyed by ``str(dtype)``.
    # Anything not listed here falls back to "String" in ``_map_dtype``.
    DTYPE_MAP: ClassVar[Dict[str, str]] = {
        "int8": "Int8",
        "Int8": "Int8",  # capitalised names are the pandas nullable dtypes
        "int16": "Int16",
        "Int16": "Int16",
        "int32": "Int32",
        "Int32": "Int32",
        "int64": "Int64",
        "Int64": "Int64",
        "uint8": "UInt8",
        "UInt8": "UInt8",
        "uint16": "UInt16",
        "UInt16": "UInt16",
        "uint32": "UInt32",
        "UInt32": "UInt32",
        "uint64": "UInt64",
        "UInt64": "UInt64",
        "float32": "Float32",
        "Float32": "Float32",
        "float64": "Float64",
        "Float64": "Float64",
        "bool": "UInt8",
        "boolean": "UInt8",
        "object": "String",
        "string": "String",
        "category": "String",
        "datetime64[ns]": "DateTime",
        "datetime64[ns, UTC]": "DateTime",
    }

    # ``str(dtype)`` of the pandas masked extension dtypes; these can hold
    # pd.NA and are therefore declared as Nullable(...) in the schema.
    _NULLABLE_EXTENSION_NAMES: ClassVar[frozenset] = frozenset({
        "Int8", "Int16", "Int32", "Int64",
        "UInt8", "UInt16", "UInt32", "UInt64",
        "Float32", "Float64",
        "boolean",
    })

    def __init__(
        self,
        *,
        host: str = "localhost",
        port: int = 8123,
        database: str = "sibi_data",
        user: str = "default",
        password: str = "",
        table: str = "test_sibi_table",
        order_by: str = "id",
        engine: Optional[str] = None,  # e.g. "ENGINE MergeTree ORDER BY (`id`)"
        max_workers: int = 4,
        insert_chunksize: int = 50_000,
        overwrite: bool = False,
        **kwargs: Any,
    ):
        """
        Store connection and write settings; no connection is opened here.

        Args:
            host: ClickHouse host name.
            port: ClickHouse port (coerced with ``int``).
            database: Database passed to ``clickhouse_connect.get_client``.
            user: User name (passed as ``username`` to clickhouse-connect).
            password: Password for ``user``.
            table: Target table name.
            order_by: Column name, comma-separated column names, or a
                pre-formatted parenthesised expression for the default
                ``ORDER BY`` clause. Ignored when ``engine`` is given.
            engine: Full engine clause; when falsy a default MergeTree clause
                is built from ``order_by``.
            max_workers: Number of threads writing partitions (>= 1).
            insert_chunksize: Maximum rows per insert call (>= 1).
            overwrite: Default for dropping the table before writing.
            **kwargs: Forwarded unchanged to ``ManagedResource.__init__``.

        Raises:
            ValueError: If ``max_workers`` or ``insert_chunksize`` is < 1.
        """
        super().__init__(**kwargs)

        # Internal state is set up before validation so that ``_cleanup``
        # stays safe even if construction fails below.
        # one client per thread to avoid session contention
        self._tlocal = threading.local()
        # every client ever created, so worker-thread clients can be closed too
        self._clients = []
        self._clients_lock = threading.Lock()

        self.host = host
        self.port = int(port)
        self.database = database
        self.user = user
        self.password = password
        self.table = table
        self.order_by = order_by
        self.engine = engine  # if None → default MergeTree ORDER BY
        self.max_workers = int(max_workers)
        self.insert_chunksize = int(insert_chunksize)
        self.overwrite = bool(overwrite)

        if self.max_workers < 1:
            raise ValueError("max_workers must be >= 1")
        # A zero step makes range() raise and a negative one yields an empty
        # range (nothing inserted, yet reported as success) — reject both early.
        if self.insert_chunksize < 1:
            raise ValueError("insert_chunksize must be >= 1")

    # ------------- public -------------

    def save_to_clickhouse(self, df: dd.DataFrame, *, overwrite: Optional[bool] = None) -> None:
        """
        Persist a Dask DataFrame into ClickHouse.

        The table is (re)created from the DataFrame metadata, then every
        partition is computed and inserted in batches by a thread pool.

        Args:
            df: Dask DataFrame
            overwrite: Optional override for dropping/recreating table;
                ``None`` means "use the value given to the constructor".

        Raises:
            TypeError: If ``df`` is not a Dask DataFrame.
            Exception: The first partition failure is logged and re-raised.
        """
        if not isinstance(df, dd.DataFrame):
            raise TypeError("ClickHouseWriter.save_to_clickhouse expects a dask.dataframe.DataFrame.")

        # Emptiness probe. npartitions=-1 asks Dask to consider all
        # partitions, so this is not necessarily cheap for expensive graphs.
        head = df.head(1, npartitions=-1, compute=True)
        if head.empty:
            self.logger.info("Dask DataFrame appears empty (head(1) returned 0 rows). Nothing to write.")
            return

        # lazily fill missing values per-partition (no global compute);
        # the staticmethod is passed via the class so ``self`` is not captured
        df = df.map_partitions(type(self)._fill_missing_partition, meta=df._meta)

        # (re)create table
        ow = self.overwrite if overwrite is None else bool(overwrite)
        dtypes = df._meta_nonempty.dtypes  # metadata-only types (no compute)
        schema_sql = self._generate_clickhouse_schema(dtypes)
        engine_sql = self._default_engine_sql() if not self.engine else self.engine

        if ow:
            self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
            self.logger.info(f"Dropped table {self.table} (overwrite=True)")

        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql};"
        self._command(create_sql)
        self.logger.info(f"Ensured table {self.table} exists")

        # write partitions concurrently
        parts = list(df.to_delayed())
        if not parts:
            self.logger.info("No partitions to write.")
            return

        self.logger.info(f"Writing {len(parts)} partitions to ClickHouse (max_workers={self.max_workers})")
        with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
            futures = {ex.submit(self._write_one_partition, part, idx): idx for idx, part in enumerate(parts)}
            for fut in as_completed(futures):
                idx = futures[fut]
                try:
                    fut.result()
                except Exception as e:
                    self.logger.error(f"Partition {idx} failed: {e}", exc_info=self.debug)
                    # Stop queued partitions from being written after a
                    # failure; cancel() is a no-op on running/finished ones.
                    for pending in futures:
                        pending.cancel()
                    raise

        self.logger.info(f"Completed writing {len(parts)} partitions to {self.table}")

    # ------------- schema & types -------------

    def _generate_clickhouse_schema(self, dask_dtypes: pd.Series) -> str:
        """Return the column list of a CREATE TABLE statement for ``dask_dtypes``.

        ``dask_dtypes`` maps column label → dtype (e.g. ``df.dtypes``).
        """
        cols: Iterable[Tuple[str, Any]] = dask_dtypes.items()
        pieces = []
        for col, dtype in cols:
            ch_type = self._map_dtype(dtype)
            # Use Nullable for columns that may carry NaN/None/NA,
            # and for datetimes to be safe with missing values.
            if self._should_mark_nullable(dtype):
                ch_type = f"Nullable({ch_type})"
            pieces.append(f"{self._ident(col)} {ch_type}")
        return ", ".join(pieces)

    def _map_dtype(self, dtype: Any) -> str:
        """Map a pandas/dask dtype to a ClickHouse type name ("String" if unknown)."""
        # StringDtype's str() depends on its storage backend, so test the type.
        if isinstance(dtype, pd.StringDtype):
            return "String"
        name = str(dtype)
        if "datetime64" in name:
            return "DateTime"
        return self.DTYPE_MAP.get(name, "String")

    def _should_mark_nullable(self, dtype: Any) -> bool:
        """Return True when the column type should be wrapped in Nullable(...)."""
        name = str(dtype)
        if isinstance(dtype, pd.StringDtype) or name in self._NULLABLE_EXTENSION_NAMES:
            return True
        if "datetime64" in name:
            return True
        # object/category almost always nullable
        return name in ("object", "category", "string")

    def _default_engine_sql(self) -> str:
        """Build the default ``ENGINE = MergeTree ORDER BY (...)`` clause.

        A value starting with "(" is used verbatim; otherwise ``order_by`` is
        split on commas and each column is quoted individually.
        """
        order_by = (self.order_by or "").strip()
        if order_by.startswith("("):
            key = order_by
        else:
            columns = [part.strip() for part in order_by.split(",") if part.strip()]
            if columns:
                key = "(" + ", ".join(self._ident(c) for c in columns) + ")"
            else:
                # no sorting key requested
                key = "tuple()"
        return f"ENGINE = MergeTree ORDER BY {key}"

    # ------------- partition write -------------

    def _write_one_partition(self, part, index: int) -> None:
        """Compute one delayed partition and insert it in batches.

        Runs inside a pool worker thread; ``index`` is used for logging only.
        """
        # Compute partition → pandas
        pdf: pd.DataFrame = part.compute()
        if pdf.empty:
            self.logger.debug(f"Partition {index} empty; skipping")
            return

        # Ensure column ordering is stable
        cols = list(pdf.columns)

        # Split into batches (to avoid giant single insert)
        for start in range(0, len(pdf), self.insert_chunksize):
            batch = pdf.iloc[start:start + self.insert_chunksize]
            if batch.empty:
                continue
            self._insert_df(cols, batch)

        self.logger.debug(f"Partition {index} inserted ({len(pdf)} rows)")

    def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
        """Insert ``df`` (restricted to ``cols``) using this thread's client."""
        client = self._get_client()
        # clickhouse-connect supports insert_df
        client.insert_df(self.table, df[list(cols)], settings={"async_insert": 1, "wait_end_of_query": 1})

    # ------------- missing values (lazy) -------------

    @staticmethod
    def _fill_missing_partition(pdf: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``pdf`` with missing values filled by dtype family.

        - integer / boolean columns: left as they are (numpy-backed ones
          cannot hold NaN; nullable ones keep NA for Nullable columns)
        - float columns: NaN → 0.0
        - datetime columns: NaT is kept
        - everything else: missing → ""
        """
        # Work on a copy: functions passed to map_partitions should not
        # modify the partition they receive.
        pdf = pdf.copy()
        for col in pdf.columns:
            s = pdf[col]
            dtype = s.dtype
            if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_bool_dtype(dtype):
                continue
            if pd.api.types.is_float_dtype(dtype):
                pdf[col] = s.fillna(0.0)
            elif pd.api.types.is_datetime64_any_dtype(dtype):
                continue
            else:
                # A categorical only accepts fill values that are categories.
                if (
                    isinstance(dtype, pd.CategoricalDtype)
                    and "" not in dtype.categories
                    and s.isna().any()
                ):
                    s = s.cat.add_categories([""])
                pdf[col] = s.fillna("")
        return pdf

    # ------------- low-level helpers -------------

    def _get_client(self):
        """Return the calling thread's client, creating and tracking it on first use."""
        cli = getattr(self._tlocal, "client", None)
        if cli is not None:
            return cli
        cli = clickhouse_connect.get_client(
            host=self.host,
            port=self.port,
            database=self.database,
            username=self.user,  # clickhouse-connect uses 'username'
            password=self.password,
        )
        self._tlocal.client = cli
        with self._clients_lock:
            self._clients.append(cli)
        return cli

    def _command(self, sql: str) -> None:
        """Execute a statement with the calling thread's client."""
        client = self._get_client()
        client.command(sql)

    @staticmethod
    def _ident(name: str) -> str:
        """Backtick-quote an identifier.

        A value that is already wrapped in backticks is returned unchanged.
        Backslashes and backticks inside the name are backslash-escaped;
        non-string labels are converted with ``str``.
        """
        name = str(name)
        if len(name) >= 2 and name.startswith("`") and name.endswith("`"):
            return name
        escaped = name.replace("\\", "\\\\").replace("`", "\\`")
        return f"`{escaped}`"

    # ------------- context cleanup -------------

    def _cleanup(self):
        """Close every client this writer created, in any thread.

        NOTE(review): presumably invoked by ManagedResource on close/context
        exit — confirm against the base class.
        """
        lock = getattr(self, "_clients_lock", None)
        if lock is None:
            # __init__ did not get far enough to create any client
            return
        with lock:
            clients, self._clients = self._clients, []
        for cli in clients:
            try:
                cli.close()
            except Exception as exc:
                # Best-effort: one failing close must not prevent closing the rest.
                logger = getattr(self, "logger", None)
                if logger is not None:
                    logger.debug(f"Ignoring error while closing ClickHouse client: {exc}")
        # Drop all thread-local references to the now-closed clients.
        self._tlocal = threading.local()
|
264
|
+
|
@@ -6,13 +6,13 @@ from .storage_manager import StorageManager
|
|
6
6
|
from .credentials import ConfigManager
|
7
7
|
|
8
8
|
class StorageConfig:
|
9
|
-
def __init__(self, config:ConfigManager, depots:dict=None):
|
9
|
+
def __init__(self, config:ConfigManager, depots:dict=None, clear_existing=False, write_mode="full-access"):
|
10
10
|
self.conf = config
|
11
11
|
self.depots = depots
|
12
12
|
self._initialize_storage()
|
13
13
|
self.storage_manager = StorageManager(self.base_storage, self.filesystem_type, self.filesystem_options)
|
14
14
|
if self.depots is not None:
|
15
|
-
self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots)
|
15
|
+
self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots, clear_existing=clear_existing, write_mode=write_mode)
|
16
16
|
else:
|
17
17
|
self.depot_paths = None
|
18
18
|
self.depot_names = None
|
@@ -83,7 +83,7 @@ class StorageManager:
|
|
83
83
|
self.fs.rm(sub_path, recursive=True)
|
84
84
|
self.fs.mkdirs(sub_path, exist_ok=True)
|
85
85
|
|
86
|
-
def rebuild_depot_paths(self, depots, clear_existing=False):
|
86
|
+
def rebuild_depot_paths(self, depots, clear_existing=False, write_mode="full-access"):
|
87
87
|
"""
|
88
88
|
Rebuilds depot_paths (dictionary) and depot_name (SimpleNamespace).
|
89
89
|
Handles clear_existing scenario by resetting directories when required.
|
@@ -96,7 +96,8 @@ class StorageManager:
|
|
96
96
|
depot_path = self.join_paths(self.storage_path, depot)
|
97
97
|
if self.debug:
|
98
98
|
print(f"Rebuilding depot at: {depot_path}")
|
99
|
-
|
99
|
+
if write_mode == "full-access":
|
100
|
+
self.setup_directories(depot_path, sub_directories, clear_existing=clear_existing)
|
100
101
|
|
101
102
|
# Generate depot_paths dictionary
|
102
103
|
self.depot_paths = {
|
@@ -1,501 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import threading
|
4
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
5
|
-
from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
|
6
|
-
|
7
|
-
import pandas as pd
|
8
|
-
import dask.dataframe as dd
|
9
|
-
import clickhouse_connect
|
10
|
-
|
11
|
-
from . import ManagedResource
|
12
|
-
|
13
|
-
|
14
|
-
class ClickHouseWriter(ManagedResource):
|
15
|
-
"""
|
16
|
-
Write a Dask DataFrame to ClickHouse with:
|
17
|
-
- Safe Dask checks (no df.empty)
|
18
|
-
- Nullable dtype mapping
|
19
|
-
- Optional overwrite (drop + recreate)
|
20
|
-
- Partitioned, batched inserts
|
21
|
-
- Per-thread clients to avoid session conflicts
|
22
|
-
"""
|
23
|
-
|
24
|
-
# Default dtype mapping (pandas/dask → ClickHouse)
|
25
|
-
DTYPE_MAP: ClassVar[Dict[str, str]] = {
|
26
|
-
"int64": "Int64",
|
27
|
-
"Int64": "Int64", # pandas nullable Int64
|
28
|
-
"int32": "Int32",
|
29
|
-
"Int32": "Int32",
|
30
|
-
"float64": "Float64",
|
31
|
-
"Float64": "Float64",
|
32
|
-
"float32": "Float32",
|
33
|
-
"bool": "UInt8",
|
34
|
-
"boolean": "UInt8",
|
35
|
-
"object": "String",
|
36
|
-
"string": "String",
|
37
|
-
"category": "String",
|
38
|
-
"datetime64[ns]": "DateTime",
|
39
|
-
"datetime64[ns, UTC]": "DateTime",
|
40
|
-
}
|
41
|
-
|
42
|
-
def __init__(
|
43
|
-
self,
|
44
|
-
*,
|
45
|
-
host: str = "localhost",
|
46
|
-
port: int = 8123,
|
47
|
-
database: str = "sibi_data",
|
48
|
-
user: str = "default",
|
49
|
-
password: str = "",
|
50
|
-
table: str = "test_sibi_table",
|
51
|
-
order_by: str = "id",
|
52
|
-
engine: Optional[str] = None, # e.g. "ENGINE MergeTree ORDER BY (`id`)"
|
53
|
-
max_workers: int = 4,
|
54
|
-
insert_chunksize: int = 50_000,
|
55
|
-
overwrite: bool = False,
|
56
|
-
**kwargs: Any,
|
57
|
-
):
|
58
|
-
super().__init__(**kwargs)
|
59
|
-
self.host = host
|
60
|
-
self.port = int(port)
|
61
|
-
self.database = database
|
62
|
-
self.user = user
|
63
|
-
self.password = password
|
64
|
-
self.table = table
|
65
|
-
self.order_by = order_by
|
66
|
-
self.engine = engine # if None → default MergeTree ORDER BY
|
67
|
-
self.max_workers = int(max_workers)
|
68
|
-
self.insert_chunksize = int(insert_chunksize)
|
69
|
-
self.overwrite = bool(overwrite)
|
70
|
-
|
71
|
-
# one client per thread to avoid session contention
|
72
|
-
self._tlocal = threading.local()
|
73
|
-
|
74
|
-
# ------------- public -------------
|
75
|
-
|
76
|
-
def save_to_clickhouse(self, df: dd.DataFrame, *, overwrite: Optional[bool] = None) -> None:
|
77
|
-
"""
|
78
|
-
Persist a Dask DataFrame into ClickHouse.
|
79
|
-
|
80
|
-
Args:
|
81
|
-
df: Dask DataFrame
|
82
|
-
overwrite: Optional override for dropping/recreating table
|
83
|
-
"""
|
84
|
-
if not isinstance(df, dd.DataFrame):
|
85
|
-
raise TypeError("ClickHouseWriter.save_to_clickhouse expects a dask.dataframe.DataFrame.")
|
86
|
-
|
87
|
-
# small, cheap check: head(1) to detect empty
|
88
|
-
head = df.head(1, npartitions=-1, compute=True)
|
89
|
-
if head.empty:
|
90
|
-
self.logger.info("Dask DataFrame appears empty (head(1) returned 0 rows). Nothing to write.")
|
91
|
-
return
|
92
|
-
|
93
|
-
# lazily fill missing values per-partition (no global compute)
|
94
|
-
df = df.map_partitions(self._fill_missing_partition, meta=df)
|
95
|
-
|
96
|
-
# (re)create table
|
97
|
-
ow = self.overwrite if overwrite is None else bool(overwrite)
|
98
|
-
dtypes = df._meta_nonempty.dtypes # metadata-only types (no compute)
|
99
|
-
schema_sql = self._generate_clickhouse_schema(dtypes)
|
100
|
-
engine_sql = self._default_engine_sql() if not self.engine else self.engine
|
101
|
-
|
102
|
-
if ow:
|
103
|
-
self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
|
104
|
-
self.logger.info(f"Dropped table {self.table} (overwrite=True)")
|
105
|
-
|
106
|
-
create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql};"
|
107
|
-
self._command(create_sql)
|
108
|
-
self.logger.info(f"Ensured table {self.table} exists")
|
109
|
-
|
110
|
-
# write partitions concurrently
|
111
|
-
parts = list(df.to_delayed())
|
112
|
-
if not parts:
|
113
|
-
self.logger.info("No partitions to write.")
|
114
|
-
return
|
115
|
-
|
116
|
-
self.logger.info(f"Writing {len(parts)} partitions to ClickHouse (max_workers={self.max_workers})")
|
117
|
-
with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
|
118
|
-
futures = {ex.submit(self._write_one_partition, part, idx): idx for idx, part in enumerate(parts)}
|
119
|
-
for fut in as_completed(futures):
|
120
|
-
idx = futures[fut]
|
121
|
-
try:
|
122
|
-
fut.result()
|
123
|
-
except Exception as e:
|
124
|
-
self.logger.error(f"Partition {idx} failed: {e}", exc_info=self.debug)
|
125
|
-
raise
|
126
|
-
|
127
|
-
self.logger.info(f"Completed writing {len(parts)} partitions to {self.table}")
|
128
|
-
|
129
|
-
# ------------- schema & types -------------
|
130
|
-
|
131
|
-
def _generate_clickhouse_schema(self, dask_dtypes: pd.Series) -> str:
|
132
|
-
cols: Iterable[Tuple[str, Any]] = dask_dtypes.items()
|
133
|
-
pieces = []
|
134
|
-
for col, dtype in cols:
|
135
|
-
ch_type = self._map_dtype(dtype)
|
136
|
-
# Use Nullable for non-numeric/string columns that may carry NaN/None,
|
137
|
-
# and for datetimes to be safe with missing values.
|
138
|
-
if self._should_mark_nullable(dtype):
|
139
|
-
ch_type = f"Nullable({ch_type})"
|
140
|
-
pieces.append(f"{self._ident(col)} {ch_type}")
|
141
|
-
return ", ".join(pieces)
|
142
|
-
|
143
|
-
def _map_dtype(self, dtype: Any) -> str:
|
144
|
-
# Handle pandas extension dtypes explicitly
|
145
|
-
if isinstance(dtype, pd.Int64Dtype):
|
146
|
-
return "Int64"
|
147
|
-
if isinstance(dtype, pd.Int32Dtype):
|
148
|
-
return "Int32"
|
149
|
-
if isinstance(dtype, pd.BooleanDtype):
|
150
|
-
return "UInt8"
|
151
|
-
if isinstance(dtype, pd.Float64Dtype):
|
152
|
-
return "Float64"
|
153
|
-
if isinstance(dtype, pd.StringDtype):
|
154
|
-
return "String"
|
155
|
-
if "datetime64" in str(dtype):
|
156
|
-
return "DateTime"
|
157
|
-
|
158
|
-
return self.DTYPE_MAP.get(str(dtype), "String")
|
159
|
-
|
160
|
-
def _should_mark_nullable(self, dtype: Any) -> bool:
|
161
|
-
s = str(dtype)
|
162
|
-
if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
|
163
|
-
return True
|
164
|
-
if "datetime64" in s:
|
165
|
-
return True
|
166
|
-
# object/category almost always nullable
|
167
|
-
if s in ("object", "category", "string"):
|
168
|
-
return True
|
169
|
-
return False
|
170
|
-
|
171
|
-
def _default_engine_sql(self) -> str:
|
172
|
-
# minimal MergeTree clause; quote order_by safely
|
173
|
-
ob = self.order_by if self.order_by.startswith("(") else f"(`{self.order_by}`)"
|
174
|
-
return f"ENGINE = MergeTree ORDER BY {ob}"
|
175
|
-
|
176
|
-
# ------------- partition write -------------
|
177
|
-
|
178
|
-
def _write_one_partition(self, part, index: int) -> None:
|
179
|
-
# Compute partition → pandas
|
180
|
-
pdf: pd.DataFrame = part.compute()
|
181
|
-
if pdf.empty:
|
182
|
-
self.logger.debug(f"Partition {index} empty; skipping")
|
183
|
-
return
|
184
|
-
|
185
|
-
# Ensure column ordering is stable
|
186
|
-
cols = list(pdf.columns)
|
187
|
-
|
188
|
-
# Split into batches (to avoid giant single insert)
|
189
|
-
for start in range(0, len(pdf), self.insert_chunksize):
|
190
|
-
batch = pdf.iloc[start:start + self.insert_chunksize]
|
191
|
-
if batch.empty:
|
192
|
-
continue
|
193
|
-
self._insert_df(cols, batch)
|
194
|
-
|
195
|
-
self.logger.debug(f"Partition {index} inserted ({len(pdf)} rows)")
|
196
|
-
|
197
|
-
def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
|
198
|
-
client = self._get_client()
|
199
|
-
# clickhouse-connect supports insert_df
|
200
|
-
client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
|
201
|
-
|
202
|
-
# ------------- missing values (lazy) -------------
|
203
|
-
|
204
|
-
def _fill_missing_partition(self, pdf: pd.DataFrame) -> pd.DataFrame:
|
205
|
-
# Fill by dtype family; leave real NaT for datetimes so Nullable(DateTime) accepts NULL
|
206
|
-
for col in pdf.columns:
|
207
|
-
s = pdf[col]
|
208
|
-
if pd.api.types.is_integer_dtype(s.dtype):
|
209
|
-
# pandas nullable IntX supports NA → fill where needed
|
210
|
-
if pd.api.types.is_extension_array_dtype(s.dtype):
|
211
|
-
pdf[col] = s.fillna(pd.NA)
|
212
|
-
else:
|
213
|
-
pdf[col] = s.fillna(0)
|
214
|
-
elif pd.api.types.is_bool_dtype(s.dtype):
|
215
|
-
# boolean pandas extension supports NA, ClickHouse uses UInt8; keep NA → Nullable
|
216
|
-
pdf[col] = s.fillna(pd.NA)
|
217
|
-
elif pd.api.types.is_float_dtype(s.dtype):
|
218
|
-
pdf[col] = s.fillna(0.0)
|
219
|
-
elif pd.api.types.is_datetime64_any_dtype(s.dtype):
|
220
|
-
# keep NaT; ClickHouse Nullable(DateTime) will take NULL
|
221
|
-
pass
|
222
|
-
else:
|
223
|
-
pdf[col] = s.fillna("")
|
224
|
-
return pdf
|
225
|
-
|
226
|
-
# ------------- low-level helpers -------------
|
227
|
-
|
228
|
-
def _get_client(self):
|
229
|
-
cli = getattr(self._tlocal, "client", None)
|
230
|
-
if cli is not None:
|
231
|
-
return cli
|
232
|
-
cli = clickhouse_connect.get_client(
|
233
|
-
host=self.host,
|
234
|
-
port=self.port,
|
235
|
-
database=self.database,
|
236
|
-
username=self.user, # clickhouse-connect uses 'username'
|
237
|
-
password=self.password,
|
238
|
-
)
|
239
|
-
self._tlocal.client = cli
|
240
|
-
return cli
|
241
|
-
|
242
|
-
def _command(self, sql: str) -> None:
|
243
|
-
client = self._get_client()
|
244
|
-
client.command(sql)
|
245
|
-
|
246
|
-
@staticmethod
|
247
|
-
def _ident(name: str) -> str:
|
248
|
-
# minimal identifier quoting
|
249
|
-
if name.startswith("`") and name.endswith("`"):
|
250
|
-
return name
|
251
|
-
return f"`{name}`"
|
252
|
-
|
253
|
-
# ------------- context cleanup -------------
|
254
|
-
|
255
|
-
def _cleanup(self):
    """Close the client cached on ``self._tlocal`` (if any) and forget it.

    Closing is best-effort: an ``Exception`` raised by ``close()`` is
    swallowed, and the cached attribute is removed either way.  Per the
    original note, the manager calls this in the owning thread.
    """
    handle = getattr(self._tlocal, "client", None)
    try:
        if handle is not None:
            handle.close()
    except Exception:
        # Deliberately ignored: a failed close must not break teardown.
        pass
    finally:
        # Drop the cached reference whether or not close() succeeded.
        try:
            del self._tlocal.client
        except AttributeError:
            pass
|
266
|
-
|
267
|
-
# from concurrent.futures import ThreadPoolExecutor
|
268
|
-
# from typing import ClassVar, Dict
|
269
|
-
#
|
270
|
-
# import clickhouse_connect
|
271
|
-
# import pandas as pd
|
272
|
-
# from clickhouse_driver import Client
|
273
|
-
# import dask.dataframe as dd
|
274
|
-
#
|
275
|
-
# from . import ManagedResource
|
276
|
-
#
|
277
|
-
#
|
278
|
-
# class ClickHouseWriter(ManagedResource):
|
279
|
-
# """
|
280
|
-
# Provides functionality to write a Dask DataFrame to a ClickHouse database using
|
281
|
-
# a specified schema. This class handles the creation of tables, schema generation,
|
282
|
-
# data transformation, and data insertion. It ensures compatibility between Dask
|
283
|
-
# data types and ClickHouse types.
|
284
|
-
#
|
285
|
-
# :ivar clickhouse_host: Host address of the ClickHouse database.
|
286
|
-
# :type clickhouse_host: str
|
287
|
-
# :ivar clickhouse_port: Port of the ClickHouse database.
|
288
|
-
# :type clickhouse_port: int
|
289
|
-
# :ivar clickhouse_dbname: Name of the database to connect to in ClickHouse.
|
290
|
-
# :type clickhouse_dbname: str
|
291
|
-
# :ivar clickhouse_user: Username for database authentication.
|
292
|
-
# :type clickhouse_user: str
|
293
|
-
# :ivar clickhouse_password: Password for database authentication.
|
294
|
-
# :type clickhouse_password: str
|
295
|
-
# :ivar clickhouse_table: Name of the table to store the data in.
|
296
|
-
# :type clickhouse_table: str
|
297
|
-
# :ivar logger: Logger instance for logging messages.
|
298
|
-
# :type logger: logging.Logger
|
299
|
-
# :ivar client: Instance of the ClickHouse database client.
|
300
|
-
# :type client: clickhouse_connect.Client or None
|
301
|
-
# :ivar df: Dask DataFrame to be written into ClickHouse.
|
302
|
-
# :type df: dask.dataframe.DataFrame
|
303
|
-
# :ivar order_by: Field or column name to use for table ordering.
|
304
|
-
# :type order_by: str
|
305
|
-
# """
|
306
|
-
# dtype_to_clickhouse: ClassVar[Dict[str, str]] = {
|
307
|
-
# 'int64': 'Int64',
|
308
|
-
# 'int32': 'Int32',
|
309
|
-
# 'float64': 'Float64',
|
310
|
-
# 'float32': 'Float32',
|
311
|
-
# 'bool': 'UInt8',
|
312
|
-
# 'datetime64[ns]': 'DateTime',
|
313
|
-
# 'object': 'String',
|
314
|
-
# 'category': 'String',
|
315
|
-
# }
|
316
|
-
# df: dd.DataFrame
|
317
|
-
#
|
318
|
-
# def __init__(self, **kwargs):
|
319
|
-
# super().__init__(**kwargs)
|
320
|
-
# self.clickhouse_host = kwargs.setdefault('host', "localhost")
|
321
|
-
# self.clickhouse_port = kwargs.setdefault('port', 8123)
|
322
|
-
# self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
|
323
|
-
# self.clickhouse_user = kwargs.setdefault('user', 'default')
|
324
|
-
# self.clickhouse_password = kwargs.setdefault('password', '')
|
325
|
-
# self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
|
326
|
-
#
|
327
|
-
# #self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
328
|
-
# self.client = None
|
329
|
-
# self.order_by = kwargs.setdefault('order_by', 'id')
|
330
|
-
#
|
331
|
-
# def save_to_clickhouse(self, df, **kwargs):
|
332
|
-
# self.df = df.copy()
|
333
|
-
# self.order_by = kwargs.setdefault('order_by', self.order_by)
|
334
|
-
# if len(self.df.head().index) == 0:
|
335
|
-
# self.logger.debug("Dataframe is empty")
|
336
|
-
# return
|
337
|
-
# self._handle_missing_values()
|
338
|
-
# self._connect()
|
339
|
-
# self._drop_table()
|
340
|
-
# self._create_table_from_dask()
|
341
|
-
# self._write_data()
|
342
|
-
#
|
343
|
-
# def _connect(self):
|
344
|
-
# try:
|
345
|
-
# self.client = clickhouse_connect.get_client(
|
346
|
-
# host=self.clickhouse_host,
|
347
|
-
# port=self.clickhouse_port,
|
348
|
-
# database=self.clickhouse_dbname,
|
349
|
-
# user=self.clickhouse_user,
|
350
|
-
# password=self.clickhouse_password
|
351
|
-
# )
|
352
|
-
# self.logger.debug("Connected to ClickHouse")
|
353
|
-
# except Exception as e:
|
354
|
-
# self.logger.error(e)
|
355
|
-
# raise
|
356
|
-
#
|
357
|
-
# @staticmethod
|
358
|
-
# def _generate_clickhouse_schema(dask_dtypes, dtype_map):
|
359
|
-
# schema = []
|
360
|
-
# for col, dtype in dask_dtypes.items():
|
361
|
-
# # Handle pandas nullable types explicitly
|
362
|
-
# if isinstance(dtype, pd.Int64Dtype): # pandas nullable Int64
|
363
|
-
# clickhouse_type = 'Int64'
|
364
|
-
# elif isinstance(dtype, pd.Float64Dtype): # pandas nullable Float64
|
365
|
-
# clickhouse_type = 'Float64'
|
366
|
-
# elif isinstance(dtype, pd.BooleanDtype): # pandas nullable Boolean
|
367
|
-
# clickhouse_type = 'UInt8'
|
368
|
-
# elif isinstance(dtype, pd.DatetimeTZDtype) or 'datetime' in str(dtype): # Nullable datetime
|
369
|
-
# clickhouse_type = 'Nullable(DateTime)'
|
370
|
-
# elif isinstance(dtype, pd.StringDtype): # pandas nullable String
|
371
|
-
# clickhouse_type = 'String'
|
372
|
-
# else:
|
373
|
-
# # Default mapping using the provided dtype_map
|
374
|
-
# clickhouse_type = dtype_map.get(str(dtype), 'String')
|
375
|
-
# schema.append(f"`{col}` {clickhouse_type}")
|
376
|
-
# return ', '.join(schema)
|
377
|
-
#
|
378
|
-
# def _drop_table(self):
|
379
|
-
# if self.client:
|
380
|
-
# self.client.command('DROP TABLE IF EXISTS {}'.format(self.clickhouse_table))
|
381
|
-
# self.logger.debug(f"Dropped table {self.clickhouse_table}")
|
382
|
-
#
|
383
|
-
# def _create_table_from_dask(self, engine=None):
|
384
|
-
# if engine is None:
|
385
|
-
# engine = f"ENGINE = MergeTree() order by {self.order_by}"
|
386
|
-
# dtypes = self.df.dtypes
|
387
|
-
# clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
|
388
|
-
# create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
|
389
|
-
# self.logger.debug(f"Creating table SQL:{create_table_sql}")
|
390
|
-
# if self.client:
|
391
|
-
# self.client.command(create_table_sql)
|
392
|
-
# self.logger.debug("Created table '{}'".format(self.clickhouse_table))
|
393
|
-
#
|
394
|
-
# def _handle_missing_values(self):
|
395
|
-
# """
|
396
|
-
# Handle missing values in the Dask DataFrame before writing to ClickHouse.
|
397
|
-
# """
|
398
|
-
# self.logger.debug("Checking for missing values...")
|
399
|
-
# missing_counts = self.df.isnull().sum().compute()
|
400
|
-
# self.logger.debug(f"Missing values per column:\n{missing_counts}")
|
401
|
-
#
|
402
|
-
# # Replace missing values based on column types
|
403
|
-
# def replace_missing_values(df):
|
404
|
-
# for col in df.columns:
|
405
|
-
# if pd.api.types.is_integer_dtype(df[col]):
|
406
|
-
# df[col] = df[col].fillna(0) # Replace NA with 0 for integers
|
407
|
-
# elif pd.api.types.is_float_dtype(df[col]):
|
408
|
-
# df[col] = df[col].fillna(0.0) # Replace NA with 0.0 for floats
|
409
|
-
# elif pd.api.types.is_bool_dtype(df[col]):
|
410
|
-
# df[col] = df[col].fillna(False) # Replace NA with False for booleans
|
411
|
-
# else:
|
412
|
-
# df[col] = df[col].fillna('') # Replace NA with empty string for other types
|
413
|
-
# return df
|
414
|
-
#
|
415
|
-
# # Apply replacement
|
416
|
-
# self.df = replace_missing_values(self.df)
|
417
|
-
# self.logger.debug("Missing values replaced.")
|
418
|
-
#
|
419
|
-
# def _write_data(self):
|
420
|
-
# """
|
421
|
-
# Writes the Dask DataFrame to a ClickHouse table partition by partition.
|
422
|
-
# """
|
423
|
-
# if len(self.df.index) == 0:
|
424
|
-
# self.logger.debug("No data found. Nothing written.")
|
425
|
-
# return
|
426
|
-
#
|
427
|
-
# for i, partition in enumerate(self.df.to_delayed()):
|
428
|
-
# try:
|
429
|
-
# # Compute the current partition into a pandas DataFrame
|
430
|
-
# df = partition.compute()
|
431
|
-
#
|
432
|
-
# if df.empty:
|
433
|
-
# self.logger.debug(f"Partition {i} is empty. Skipping...")
|
434
|
-
# continue
|
435
|
-
#
|
436
|
-
# self.logger.debug(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
|
437
|
-
#
|
438
|
-
# # Write the partition to the ClickHouse table
|
439
|
-
# self.client.insert_df(self.clickhouse_table, df)
|
440
|
-
# except Exception as e:
|
441
|
-
# self.logger.error(f"Error writing partition {i}: {e}")
|
442
|
-
#
|
443
|
-
# def _write_data_multi_not_working_yet(self):
|
444
|
-
# """
|
445
|
-
# Writes the Dask DataFrame to a ClickHouse table partition by partition.
|
446
|
-
# Ensures a separate client instance is used per thread to avoid session conflicts.
|
447
|
-
# """
|
448
|
-
# if len(self.df.index) == 0:
|
449
|
-
# self.logger.debug("No data found. Nothing written.")
|
450
|
-
# return
|
451
|
-
#
|
452
|
-
# def create_client():
|
453
|
-
# client = Client(
|
454
|
-
# host=self.clickhouse_host,
|
455
|
-
# port=self.clickhouse_port,
|
456
|
-
# database=self.clickhouse_dbname,
|
457
|
-
# user=self.clickhouse_user,
|
458
|
-
# password=self.clickhouse_password
|
459
|
-
# )
|
460
|
-
# """
|
461
|
-
# Create a new instance of the ClickHouse client for each thread.
|
462
|
-
# This avoids session conflicts during concurrent writes.
|
463
|
-
# """
|
464
|
-
# return client
|
465
|
-
#
|
466
|
-
# def write_partition(partition, index):
|
467
|
-
# """
|
468
|
-
# Write a single partition to ClickHouse using a separate client instance.
|
469
|
-
# """
|
470
|
-
# try:
|
471
|
-
# self.logger.debug(f"Starting to process partition {index}")
|
472
|
-
# client = create_client() # Create a new client for the thread
|
473
|
-
#
|
474
|
-
# # Compute the Dask partition into a Pandas DataFrame
|
475
|
-
# df = partition.compute()
|
476
|
-
# if df.empty:
|
477
|
-
# self.logger.debug(f"Partition {index} is empty. Skipping...")
|
478
|
-
# return
|
479
|
-
#
|
480
|
-
# # Convert DataFrame to list of tuples
|
481
|
-
# data = [tuple(row) for row in df.to_numpy()]
|
482
|
-
# columns = df.columns.tolist()
|
483
|
-
#
|
484
|
-
# # Perform the insert
|
485
|
-
# self.logger.debug(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
|
486
|
-
# client.execute(f"INSERT INTO {self.clickhouse_table} ({', '.join(columns)}) VALUES", data)
|
487
|
-
#
|
488
|
-
# except Exception as e:
|
489
|
-
# self.logger.error(f"Error writing partition {index}: {e}")
|
490
|
-
# finally:
|
491
|
-
# if 'client' in locals() and hasattr(client, 'close'):
|
492
|
-
# client.close()
|
493
|
-
# self.logger.debug(f"Closed client for partition {index}")
|
494
|
-
#
|
495
|
-
# try:
|
496
|
-
# # Get delayed partitions and enumerate them
|
497
|
-
# partitions = self.df.to_delayed()
|
498
|
-
# with ThreadPoolExecutor() as executor:
|
499
|
-
# executor.map(write_partition, partitions, range(len(partitions)))
|
500
|
-
# except Exception as e:
|
501
|
-
# self.logger.error(f"Error during multi-partition write: {e}")
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
RENAMED
File without changes
|
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py
RENAMED
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py
RENAMED
File without changes
|
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py
RENAMED
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py
RENAMED
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py
RENAMED
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py
RENAMED
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py
RENAMED
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py
RENAMED
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py
RENAMED
File without changes
|
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py
RENAMED
File without changes
|
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py
RENAMED
File without changes
|
{sibi_dst-2025.8.5 → sibi_dst-2025.8.6}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|