sibi-dst 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
  3. sibi_dst/df_helper/_df_helper.py +417 -117
  4. sibi_dst/df_helper/_parquet_artifact.py +255 -283
  5. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  7. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  8. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  9. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  10. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  12. sibi_dst/osmnx_helper/__init__.py +1 -0
  13. sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +203 -0
  14. sibi_dst/osmnx_helper/route_path_builder.py +97 -0
  15. sibi_dst/osmnx_helper/utils.py +2 -0
  16. sibi_dst/utils/base.py +302 -96
  17. sibi_dst/utils/clickhouse_writer.py +472 -206
  18. sibi_dst/utils/data_utils.py +139 -186
  19. sibi_dst/utils/data_wrapper.py +317 -73
  20. sibi_dst/utils/date_utils.py +1 -0
  21. sibi_dst/utils/df_utils.py +193 -213
  22. sibi_dst/utils/file_utils.py +3 -2
  23. sibi_dst/utils/filepath_generator.py +314 -152
  24. sibi_dst/utils/log_utils.py +581 -242
  25. sibi_dst/utils/manifest_manager.py +60 -76
  26. sibi_dst/utils/parquet_saver.py +33 -27
  27. sibi_dst/utils/phone_formatter.py +88 -95
  28. sibi_dst/utils/update_planner.py +180 -178
  29. sibi_dst/utils/webdav_client.py +116 -166
  30. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
  31. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +32 -28
  32. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/utils/clickhouse_writer.py
@@ -1,235 +1,501 @@
- from concurrent.futures import ThreadPoolExecutor
- from typing import ClassVar, Dict
+ from __future__ import annotations
+
+ import threading
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple

- import clickhouse_connect
  import pandas as pd
- from clickhouse_driver import Client
  import dask.dataframe as dd
+ import clickhouse_connect

  from . import ManagedResource


  class ClickHouseWriter(ManagedResource):
  """
- Provides functionality to write a Dask DataFrame to a ClickHouse database using
- a specified schema. This class handles the creation of tables, schema generation,
- data transformation, and data insertion. It ensures compatibility between Dask
- data types and ClickHouse types.
-
- :ivar clickhouse_host: Host address of the ClickHouse database.
- :type clickhouse_host: str
- :ivar clickhouse_port: Port of the ClickHouse database.
- :type clickhouse_port: int
- :ivar clickhouse_dbname: Name of the database to connect to in ClickHouse.
- :type clickhouse_dbname: str
- :ivar clickhouse_user: Username for database authentication.
- :type clickhouse_user: str
- :ivar clickhouse_password: Password for database authentication.
- :type clickhouse_password: str
- :ivar clickhouse_table: Name of the table to store the data in.
- :type clickhouse_table: str
- :ivar logger: Logger instance for logging messages.
- :type logger: logging.Logger
- :ivar client: Instance of the ClickHouse database client.
- :type client: clickhouse_connect.Client or None
- :ivar df: Dask DataFrame to be written into ClickHouse.
- :type df: dask.dataframe.DataFrame
- :ivar order_by: Field or column name to use for table ordering.
- :type order_by: str
+ Write a Dask DataFrame to ClickHouse with:
+ - Safe Dask checks (no df.empty)
+ - Nullable dtype mapping
+ - Optional overwrite (drop + recreate)
+ - Partitioned, batched inserts
+ - Per-thread clients to avoid session conflicts
  """
- dtype_to_clickhouse: ClassVar[Dict[str, str]] = {
- 'int64': 'Int64',
- 'int32': 'Int32',
- 'float64': 'Float64',
- 'float32': 'Float32',
- 'bool': 'UInt8',
- 'datetime64[ns]': 'DateTime',
- 'object': 'String',
- 'category': 'String',
+
+ # Default dtype mapping (pandas/dask → ClickHouse)
+ DTYPE_MAP: ClassVar[Dict[str, str]] = {
+ "int64": "Int64",
+ "Int64": "Int64", # pandas nullable Int64
+ "int32": "Int32",
+ "Int32": "Int32",
+ "float64": "Float64",
+ "Float64": "Float64",
+ "float32": "Float32",
+ "bool": "UInt8",
+ "boolean": "UInt8",
+ "object": "String",
+ "string": "String",
+ "category": "String",
+ "datetime64[ns]": "DateTime",
+ "datetime64[ns, UTC]": "DateTime",
  }
- df: dd.DataFrame

- def __init__(self, **kwargs):
+ def __init__(
+ self,
+ *,
+ host: str = "localhost",
+ port: int = 8123,
+ database: str = "sibi_data",
+ user: str = "default",
+ password: str = "",
+ table: str = "test_sibi_table",
+ order_by: str = "id",
+ engine: Optional[str] = None, # e.g. "ENGINE MergeTree ORDER BY (`id`)"
+ max_workers: int = 4,
+ insert_chunksize: int = 50_000,
+ overwrite: bool = False,
+ **kwargs: Any,
+ ):
  super().__init__(**kwargs)
- self.clickhouse_host = kwargs.setdefault('host', "localhost")
- self.clickhouse_port = kwargs.setdefault('port', 8123)
- self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
- self.clickhouse_user = kwargs.setdefault('user', 'default')
- self.clickhouse_password = kwargs.setdefault('password', '')
- self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
-
- #self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
- self.client = None
- self.order_by = kwargs.setdefault('order_by', 'id')
-
- def save_to_clickhouse(self, df, **kwargs):
- self.df = df.copy()
- self.order_by = kwargs.setdefault('order_by', self.order_by)
- if len(self.df.head().index) == 0:
- self.logger.debug("Dataframe is empty")
- return
- self._handle_missing_values()
- self._connect()
- self._drop_table()
- self._create_table_from_dask()
- self._write_data()
+ self.host = host
+ self.port = int(port)
+ self.database = database
+ self.user = user
+ self.password = password
+ self.table = table
+ self.order_by = order_by
+ self.engine = engine # if None → default MergeTree ORDER BY
+ self.max_workers = int(max_workers)
+ self.insert_chunksize = int(insert_chunksize)
+ self.overwrite = bool(overwrite)

- def _connect(self):
- try:
- self.client = clickhouse_connect.get_client(
- host=self.clickhouse_host,
- port=self.clickhouse_port,
- database=self.clickhouse_dbname,
- user=self.clickhouse_user,
- password=self.clickhouse_password
- )
- self.logger.debug("Connected to ClickHouse")
- except Exception as e:
- self.logger.error(e)
- raise
+ # one client per thread to avoid session contention
+ self._tlocal = threading.local()

- @staticmethod
- def _generate_clickhouse_schema(dask_dtypes, dtype_map):
- schema = []
- for col, dtype in dask_dtypes.items():
- # Handle pandas nullable types explicitly
- if isinstance(dtype, pd.Int64Dtype): # pandas nullable Int64
- clickhouse_type = 'Int64'
- elif isinstance(dtype, pd.Float64Dtype): # pandas nullable Float64
- clickhouse_type = 'Float64'
- elif isinstance(dtype, pd.BooleanDtype): # pandas nullable Boolean
- clickhouse_type = 'UInt8'
- elif isinstance(dtype, pd.DatetimeTZDtype) or 'datetime' in str(dtype): # Nullable datetime
- clickhouse_type = 'Nullable(DateTime)'
- elif isinstance(dtype, pd.StringDtype): # pandas nullable String
- clickhouse_type = 'String'
- else:
- # Default mapping using the provided dtype_map
- clickhouse_type = dtype_map.get(str(dtype), 'String')
- schema.append(f"`{col}` {clickhouse_type}")
- return ', '.join(schema)
-
- def _drop_table(self):
- if self.client:
- self.client.command('DROP TABLE IF EXISTS {}'.format(self.clickhouse_table))
- self.logger.debug(f"Dropped table {self.clickhouse_table}")
-
- def _create_table_from_dask(self, engine=None):
- if engine is None:
- engine = f"ENGINE = MergeTree() order by {self.order_by}"
- dtypes = self.df.dtypes
- clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
- create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
- self.logger.debug(f"Creating table SQL:{create_table_sql}")
- if self.client:
- self.client.command(create_table_sql)
- self.logger.debug("Created table '{}'".format(self.clickhouse_table))
-
- def _handle_missing_values(self):
+ # ------------- public -------------
+
+ def save_to_clickhouse(self, df: dd.DataFrame, *, overwrite: Optional[bool] = None) -> None:
  """
- Handle missing values in the Dask DataFrame before writing to ClickHouse.
+ Persist a Dask DataFrame into ClickHouse.
+
+ Args:
+ df: Dask DataFrame
+ overwrite: Optional override for dropping/recreating table
  """
- self.logger.debug("Checking for missing values...")
- missing_counts = self.df.isnull().sum().compute()
- self.logger.debug(f"Missing values per column:\n{missing_counts}")
-
- # Replace missing values based on column types
- def replace_missing_values(df):
- for col in df.columns:
- if pd.api.types.is_integer_dtype(df[col]):
- df[col] = df[col].fillna(0) # Replace NA with 0 for integers
- elif pd.api.types.is_float_dtype(df[col]):
- df[col] = df[col].fillna(0.0) # Replace NA with 0.0 for floats
- elif pd.api.types.is_bool_dtype(df[col]):
- df[col] = df[col].fillna(False) # Replace NA with False for booleans
- else:
- df[col] = df[col].fillna('') # Replace NA with empty string for other types
- return df
+ if not isinstance(df, dd.DataFrame):
+ raise TypeError("ClickHouseWriter.save_to_clickhouse expects a dask.dataframe.DataFrame.")

- # Apply replacement
- self.df = replace_missing_values(self.df)
- self.logger.debug("Missing values replaced.")
+ # small, cheap check: head(1) to detect empty
+ head = df.head(1, npartitions=-1, compute=True)
+ if head.empty:
+ self.logger.info("Dask DataFrame appears empty (head(1) returned 0 rows). Nothing to write.")
+ return

- def _write_data(self):
- """
- Writes the Dask DataFrame to a ClickHouse table partition by partition.
- """
- if len(self.df.index) == 0:
- self.logger.debug("No data found. Nothing written.")
+ # lazily fill missing values per-partition (no global compute)
+ df = df.map_partitions(self._fill_missing_partition, meta=df)
+
+ # (re)create table
+ ow = self.overwrite if overwrite is None else bool(overwrite)
+ dtypes = df._meta_nonempty.dtypes # metadata-only types (no compute)
+ schema_sql = self._generate_clickhouse_schema(dtypes)
+ engine_sql = self._default_engine_sql() if not self.engine else self.engine
+
+ if ow:
+ self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
+ self.logger.info(f"Dropped table {self.table} (overwrite=True)")
+
+ create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql};"
+ self._command(create_sql)
+ self.logger.info(f"Ensured table {self.table} exists")
+
+ # write partitions concurrently
+ parts = list(df.to_delayed())
+ if not parts:
+ self.logger.info("No partitions to write.")
  return

- for i, partition in enumerate(self.df.to_delayed()):
- try:
- # Compute the current partition into a pandas DataFrame
- df = partition.compute()
+ self.logger.info(f"Writing {len(parts)} partitions to ClickHouse (max_workers={self.max_workers})")
+ with ThreadPoolExecutor(max_workers=self.max_workers) as ex:
+ futures = {ex.submit(self._write_one_partition, part, idx): idx for idx, part in enumerate(parts)}
+ for fut in as_completed(futures):
+ idx = futures[fut]
+ try:
+ fut.result()
+ except Exception as e:
+ self.logger.error(f"Partition {idx} failed: {e}", exc_info=self.debug)
+ raise

- if df.empty:
- self.logger.debug(f"Partition {i} is empty. Skipping...")
- continue
+ self.logger.info(f"Completed writing {len(parts)} partitions to {self.table}")

- self.logger.debug(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
+ # ------------- schema & types -------------

- # Write the partition to the ClickHouse table
- self.client.insert_df(self.clickhouse_table, df)
- except Exception as e:
- self.logger.error(f"Error writing partition {i}: {e}")
+ def _generate_clickhouse_schema(self, dask_dtypes: pd.Series) -> str:
+ cols: Iterable[Tuple[str, Any]] = dask_dtypes.items()
+ pieces = []
+ for col, dtype in cols:
+ ch_type = self._map_dtype(dtype)
+ # Use Nullable for non-numeric/string columns that may carry NaN/None,
+ # and for datetimes to be safe with missing values.
+ if self._should_mark_nullable(dtype):
+ ch_type = f"Nullable({ch_type})"
+ pieces.append(f"{self._ident(col)} {ch_type}")
+ return ", ".join(pieces)

- def _write_data_multi_not_working_yet(self):
- """
- Writes the Dask DataFrame to a ClickHouse table partition by partition.
- Ensures a separate client instance is used per thread to avoid session conflicts.
- """
- if len(self.df.index) == 0:
- self.logger.debug("No data found. Nothing written.")
+ def _map_dtype(self, dtype: Any) -> str:
+ # Handle pandas extension dtypes explicitly
+ if isinstance(dtype, pd.Int64Dtype):
+ return "Int64"
+ if isinstance(dtype, pd.Int32Dtype):
+ return "Int32"
+ if isinstance(dtype, pd.BooleanDtype):
+ return "UInt8"
+ if isinstance(dtype, pd.Float64Dtype):
+ return "Float64"
+ if isinstance(dtype, pd.StringDtype):
+ return "String"
+ if "datetime64" in str(dtype):
+ return "DateTime"
+
+ return self.DTYPE_MAP.get(str(dtype), "String")
+
+ def _should_mark_nullable(self, dtype: Any) -> bool:
+ s = str(dtype)
+ if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
+ return True
+ if "datetime64" in s:
+ return True
+ # object/category almost always nullable
+ if s in ("object", "category", "string"):
+ return True
+ return False
+
+ def _default_engine_sql(self) -> str:
+ # minimal MergeTree clause; quote order_by safely
+ ob = self.order_by if self.order_by.startswith("(") else f"(`{self.order_by}`)"
+ return f"ENGINE = MergeTree ORDER BY {ob}"
+
+ # ------------- partition write -------------
+
+ def _write_one_partition(self, part, index: int) -> None:
+ # Compute partition → pandas
+ pdf: pd.DataFrame = part.compute()
+ if pdf.empty:
+ self.logger.debug(f"Partition {index} empty; skipping")
  return

- def create_client():
- client = Client(
- host=self.clickhouse_host,
- port=self.clickhouse_port,
- database=self.clickhouse_dbname,
- user=self.clickhouse_user,
- password=self.clickhouse_password
- )
- """
- Create a new instance of the ClickHouse client for each thread.
- This avoids session conflicts during concurrent writes.
- """
- return client
-
- def write_partition(partition, index):
- """
- Write a single partition to ClickHouse using a separate client instance.
- """
- try:
- self.logger.debug(f"Starting to process partition {index}")
- client = create_client() # Create a new client for the thread
-
- # Compute the Dask partition into a Pandas DataFrame
- df = partition.compute()
- if df.empty:
- self.logger.debug(f"Partition {index} is empty. Skipping...")
- return
-
- # Convert DataFrame to list of tuples
- data = [tuple(row) for row in df.to_numpy()]
- columns = df.columns.tolist()
-
- # Perform the insert
- self.logger.debug(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
- client.execute(f"INSERT INTO {self.clickhouse_table} ({', '.join(columns)}) VALUES", data)
-
- except Exception as e:
- self.logger.error(f"Error writing partition {index}: {e}")
- finally:
- if 'client' in locals() and hasattr(client, 'close'):
- client.close()
- self.logger.debug(f"Closed client for partition {index}")
+ # Ensure column ordering is stable
+ cols = list(pdf.columns)

+ # Split into batches (to avoid giant single insert)
+ for start in range(0, len(pdf), self.insert_chunksize):
+ batch = pdf.iloc[start:start + self.insert_chunksize]
+ if batch.empty:
+ continue
+ self._insert_df(cols, batch)
+
+ self.logger.debug(f"Partition {index} inserted ({len(pdf)} rows)")
+
+ def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
+ client = self._get_client()
+ # clickhouse-connect supports insert_df
+ client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
+
+ # ------------- missing values (lazy) -------------
+
+ def _fill_missing_partition(self, pdf: pd.DataFrame) -> pd.DataFrame:
+ # Fill by dtype family; leave real NaT for datetimes so Nullable(DateTime) accepts NULL
+ for col in pdf.columns:
+ s = pdf[col]
+ if pd.api.types.is_integer_dtype(s.dtype):
+ # pandas nullable IntX supports NA → fill where needed
+ if pd.api.types.is_extension_array_dtype(s.dtype):
+ pdf[col] = s.fillna(pd.NA)
+ else:
+ pdf[col] = s.fillna(0)
+ elif pd.api.types.is_bool_dtype(s.dtype):
+ # boolean pandas extension supports NA, ClickHouse uses UInt8; keep NA → Nullable
+ pdf[col] = s.fillna(pd.NA)
+ elif pd.api.types.is_float_dtype(s.dtype):
+ pdf[col] = s.fillna(0.0)
+ elif pd.api.types.is_datetime64_any_dtype(s.dtype):
+ # keep NaT; ClickHouse Nullable(DateTime) will take NULL
+ pass
+ else:
+ pdf[col] = s.fillna("")
+ return pdf
+
+ # ------------- low-level helpers -------------
+
+ def _get_client(self):
+ cli = getattr(self._tlocal, "client", None)
+ if cli is not None:
+ return cli
+ cli = clickhouse_connect.get_client(
+ host=self.host,
+ port=self.port,
+ database=self.database,
+ username=self.user, # clickhouse-connect uses 'username'
+ password=self.password,
+ )
+ self._tlocal.client = cli
+ return cli
+
+ def _command(self, sql: str) -> None:
+ client = self._get_client()
+ client.command(sql)
+
+ @staticmethod
+ def _ident(name: str) -> str:
+ # minimal identifier quoting
+ if name.startswith("`") and name.endswith("`"):
+ return name
+ return f"`{name}`"
+
+ # ------------- context cleanup -------------
+
+ def _cleanup(self):
+ # close client in this thread (the manager calls _cleanup in the owning thread)
+ cli = getattr(self._tlocal, "client", None)
  try:
- # Get delayed partitions and enumerate them
- partitions = self.df.to_delayed()
- with ThreadPoolExecutor() as executor:
- executor.map(write_partition, partitions, range(len(partitions)))
- except Exception as e:
- self.logger.error(f"Error during multi-partition write: {e}")
+ if cli is not None:
+ cli.close()
+ except Exception:
+ pass
+ finally:
+ if hasattr(self._tlocal, "client"):
+ delattr(self._tlocal, "client")
+
+ # from concurrent.futures import ThreadPoolExecutor
+ # from typing import ClassVar, Dict
+ #
+ # import clickhouse_connect
+ # import pandas as pd
+ # from clickhouse_driver import Client
+ # import dask.dataframe as dd
+ #
+ # from . import ManagedResource
+ #
+ #
+ # class ClickHouseWriter(ManagedResource):
+ # """
+ # Provides functionality to write a Dask DataFrame to a ClickHouse database using
+ # a specified schema. This class handles the creation of tables, schema generation,
+ # data transformation, and data insertion. It ensures compatibility between Dask
+ # data types and ClickHouse types.
+ #
+ # :ivar clickhouse_host: Host address of the ClickHouse database.
+ # :type clickhouse_host: str
+ # :ivar clickhouse_port: Port of the ClickHouse database.
+ # :type clickhouse_port: int
+ # :ivar clickhouse_dbname: Name of the database to connect to in ClickHouse.
+ # :type clickhouse_dbname: str
+ # :ivar clickhouse_user: Username for database authentication.
+ # :type clickhouse_user: str
+ # :ivar clickhouse_password: Password for database authentication.
+ # :type clickhouse_password: str
+ # :ivar clickhouse_table: Name of the table to store the data in.
+ # :type clickhouse_table: str
+ # :ivar logger: Logger instance for logging messages.
+ # :type logger: logging.Logger
+ # :ivar client: Instance of the ClickHouse database client.
+ # :type client: clickhouse_connect.Client or None
+ # :ivar df: Dask DataFrame to be written into ClickHouse.
+ # :type df: dask.dataframe.DataFrame
+ # :ivar order_by: Field or column name to use for table ordering.
+ # :type order_by: str
+ # """
+ # dtype_to_clickhouse: ClassVar[Dict[str, str]] = {
+ # 'int64': 'Int64',
+ # 'int32': 'Int32',
+ # 'float64': 'Float64',
+ # 'float32': 'Float32',
+ # 'bool': 'UInt8',
+ # 'datetime64[ns]': 'DateTime',
+ # 'object': 'String',
+ # 'category': 'String',
+ # }
+ # df: dd.DataFrame
+ #
+ # def __init__(self, **kwargs):
+ # super().__init__(**kwargs)
+ # self.clickhouse_host = kwargs.setdefault('host', "localhost")
+ # self.clickhouse_port = kwargs.setdefault('port', 8123)
+ # self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
+ # self.clickhouse_user = kwargs.setdefault('user', 'default')
+ # self.clickhouse_password = kwargs.setdefault('password', '')
+ # self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
+ #
+ # #self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+ # self.client = None
+ # self.order_by = kwargs.setdefault('order_by', 'id')
+ #
+ # def save_to_clickhouse(self, df, **kwargs):
+ # self.df = df.copy()
+ # self.order_by = kwargs.setdefault('order_by', self.order_by)
+ # if len(self.df.head().index) == 0:
+ # self.logger.debug("Dataframe is empty")
+ # return
+ # self._handle_missing_values()
+ # self._connect()
+ # self._drop_table()
+ # self._create_table_from_dask()
+ # self._write_data()
+ #
+ # def _connect(self):
+ # try:
+ # self.client = clickhouse_connect.get_client(
+ # host=self.clickhouse_host,
+ # port=self.clickhouse_port,
+ # database=self.clickhouse_dbname,
+ # user=self.clickhouse_user,
+ # password=self.clickhouse_password
+ # )
+ # self.logger.debug("Connected to ClickHouse")
+ # except Exception as e:
+ # self.logger.error(e)
+ # raise
+ #
+ # @staticmethod
+ # def _generate_clickhouse_schema(dask_dtypes, dtype_map):
+ # schema = []
+ # for col, dtype in dask_dtypes.items():
+ # # Handle pandas nullable types explicitly
+ # if isinstance(dtype, pd.Int64Dtype): # pandas nullable Int64
+ # clickhouse_type = 'Int64'
+ # elif isinstance(dtype, pd.Float64Dtype): # pandas nullable Float64
+ # clickhouse_type = 'Float64'
+ # elif isinstance(dtype, pd.BooleanDtype): # pandas nullable Boolean
+ # clickhouse_type = 'UInt8'
+ # elif isinstance(dtype, pd.DatetimeTZDtype) or 'datetime' in str(dtype): # Nullable datetime
+ # clickhouse_type = 'Nullable(DateTime)'
+ # elif isinstance(dtype, pd.StringDtype): # pandas nullable String
+ # clickhouse_type = 'String'
+ # else:
+ # # Default mapping using the provided dtype_map
+ # clickhouse_type = dtype_map.get(str(dtype), 'String')
+ # schema.append(f"`{col}` {clickhouse_type}")
+ # return ', '.join(schema)
+ #
+ # def _drop_table(self):
+ # if self.client:
+ # self.client.command('DROP TABLE IF EXISTS {}'.format(self.clickhouse_table))
+ # self.logger.debug(f"Dropped table {self.clickhouse_table}")
+ #
+ # def _create_table_from_dask(self, engine=None):
+ # if engine is None:
+ # engine = f"ENGINE = MergeTree() order by {self.order_by}"
+ # dtypes = self.df.dtypes
+ # clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
+ # create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
+ # self.logger.debug(f"Creating table SQL:{create_table_sql}")
+ # if self.client:
+ # self.client.command(create_table_sql)
+ # self.logger.debug("Created table '{}'".format(self.clickhouse_table))
+ #
+ # def _handle_missing_values(self):
+ # """
+ # Handle missing values in the Dask DataFrame before writing to ClickHouse.
+ # """
+ # self.logger.debug("Checking for missing values...")
+ # missing_counts = self.df.isnull().sum().compute()
+ # self.logger.debug(f"Missing values per column:\n{missing_counts}")
+ #
+ # # Replace missing values based on column types
+ # def replace_missing_values(df):
+ # for col in df.columns:
+ # if pd.api.types.is_integer_dtype(df[col]):
+ # df[col] = df[col].fillna(0) # Replace NA with 0 for integers
+ # elif pd.api.types.is_float_dtype(df[col]):
+ # df[col] = df[col].fillna(0.0) # Replace NA with 0.0 for floats
+ # elif pd.api.types.is_bool_dtype(df[col]):
+ # df[col] = df[col].fillna(False) # Replace NA with False for booleans
+ # else:
+ # df[col] = df[col].fillna('') # Replace NA with empty string for other types
+ # return df
+ #
+ # # Apply replacement
+ # self.df = replace_missing_values(self.df)
+ # self.logger.debug("Missing values replaced.")
+ #
+ # def _write_data(self):
+ # """
+ # Writes the Dask DataFrame to a ClickHouse table partition by partition.
+ # """
+ # if len(self.df.index) == 0:
+ # self.logger.debug("No data found. Nothing written.")
+ # return
+ #
+ # for i, partition in enumerate(self.df.to_delayed()):
+ # try:
+ # # Compute the current partition into a pandas DataFrame
+ # df = partition.compute()
+ #
+ # if df.empty:
+ # self.logger.debug(f"Partition {i} is empty. Skipping...")
+ # continue
+ #
+ # self.logger.debug(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
+ #
+ # # Write the partition to the ClickHouse table
+ # self.client.insert_df(self.clickhouse_table, df)
+ # except Exception as e:
+ # self.logger.error(f"Error writing partition {i}: {e}")
+ #
+ # def _write_data_multi_not_working_yet(self):
+ # """
+ # Writes the Dask DataFrame to a ClickHouse table partition by partition.
+ # Ensures a separate client instance is used per thread to avoid session conflicts.
+ # """
+ # if len(self.df.index) == 0:
+ # self.logger.debug("No data found. Nothing written.")
+ # return
+ #
+ # def create_client():
+ # client = Client(
+ # host=self.clickhouse_host,
+ # port=self.clickhouse_port,
+ # database=self.clickhouse_dbname,
+ # user=self.clickhouse_user,
+ # password=self.clickhouse_password
+ # )
+ # """
+ # Create a new instance of the ClickHouse client for each thread.
+ # This avoids session conflicts during concurrent writes.
+ # """
+ # return client
+ #
+ # def write_partition(partition, index):
+ # """
+ # Write a single partition to ClickHouse using a separate client instance.
+ # """
+ # try:
+ # self.logger.debug(f"Starting to process partition {index}")
+ # client = create_client() # Create a new client for the thread
+ #
+ # # Compute the Dask partition into a Pandas DataFrame
+ # df = partition.compute()
+ # if df.empty:
+ # self.logger.debug(f"Partition {index} is empty. Skipping...")
+ # return
+ #
+ # # Convert DataFrame to list of tuples
+ # data = [tuple(row) for row in df.to_numpy()]
+ # columns = df.columns.tolist()
+ #
+ # # Perform the insert
+ # self.logger.debug(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
+ # client.execute(f"INSERT INTO {self.clickhouse_table} ({', '.join(columns)}) VALUES", data)
+ #
+ # except Exception as e:
+ # self.logger.error(f"Error writing partition {index}: {e}")
+ # finally:
+ # if 'client' in locals() and hasattr(client, 'close'):
+ # client.close()
+ # self.logger.debug(f"Closed client for partition {index}")
+ #
+ # try:
+ # # Get delayed partitions and enumerate them
+ # partitions = self.df.to_delayed()
+ # with ThreadPoolExecutor() as executor:
+ # executor.map(write_partition, partitions, range(len(partitions)))
+ # except Exception as e:
+ # self.logger.error(f"Error during multi-partition write: {e}")