native-dumper 0.3.5.2__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,513 @@
1
+ from collections import OrderedDict
2
+ from gc import collect
3
+ from io import (
4
+ BufferedReader,
5
+ BufferedWriter,
6
+ )
7
+ from logging import Logger
8
+ from types import MethodType
9
+ from typing import (
10
+ Any,
11
+ BinaryIO,
12
+ Iterable,
13
+ Union,
14
+ )
15
+
16
+ from light_compressor import (
17
+ CompressionMethod,
18
+ auto_detector,
19
+ define_reader,
20
+ define_writer,
21
+ )
22
+ from nativelib import (
23
+ NativeReader,
24
+ NativeWriter,
25
+ )
26
+ from pandas import DataFrame as PdFrame
27
+ from polars import DataFrame as PlFrame
28
+ from sqlparse import format as sql_format
29
+
30
+ from .common import (
31
+ CHUNK_SIZE,
32
+ DBMS_DEFAULT_TIMEOUT_SEC,
33
+ CHConnector,
34
+ ClickhouseServerError,
35
+ DBMetadata,
36
+ DumperLogger,
37
+ HTTPCursor,
38
+ NativeDumperError,
39
+ NativeDumperReadError,
40
+ NativeDumperValueError,
41
+ NativeDumperWriteError,
42
+ chunk_query,
43
+ file_writer,
44
+ make_columns,
45
+ transfer_diagram,
46
+ )
47
+
48
+
49
+ class NativeDumper:
50
+ """Class for read and write Native format."""
51
+
52
+ def __init__(
53
+ self,
54
+ connector: CHConnector,
55
+ compression_method: CompressionMethod = CompressionMethod.ZSTD,
56
+ logger: Logger | None = None,
57
+ timeout: int = DBMS_DEFAULT_TIMEOUT_SEC,
58
+ ) -> None:
59
+ """Class initialization."""
60
+
61
+ if not logger:
62
+ logger = DumperLogger()
63
+
64
+ try:
65
+ self.connector = connector
66
+
67
+ if int(self.connector.port) == 9000:
68
+ raise ValueError(
69
+ "NativeDumper don't support port 9000, please, use 8123."
70
+ )
71
+
72
+ self.compression_method = compression_method
73
+ self.logger = logger
74
+ self.cursor = HTTPCursor(
75
+ connector=self.connector,
76
+ compression_method=self.compression_method,
77
+ logger=self.logger,
78
+ timeout=timeout,
79
+ user_agent=self.__class__.__name__,
80
+ )
81
+ self.version = self.cursor.send_hello()
82
+ self._dbmeta: DBMetadata | None = None
83
+ except ClickhouseServerError as error:
84
+ raise error
85
+ except Exception as error:
86
+ logger.error(f"NativeDumperError: {error}")
87
+ raise NativeDumperError(error)
88
+
89
+ self.dbname = "clickhouse"
90
+ self.logger.info(
91
+ f"NativeDumper initialized for host {self.connector.host}"
92
+ f"[{self.dbname} {self.version}]"
93
+ )
94
+
95
    @staticmethod
    def multiquery(dump_method: MethodType):
        """Multiquery decorator.

        Wraps a dump method so that a multi-statement SQL script can be
        passed as a single string: statements before the streaming query
        are executed first, then the stream itself, then any trailing
        statements. Splitting is delegated to ``chunk_query`` — its exact
        split semantics are defined in .common (not visible here).
        """

        def wrapper(*args, **kwargs):

            first_part: list[str]
            second_part: list[str]

            # The decorated callable is always a bound-style method, so
            # args[0] is the NativeDumper instance.
            self: NativeDumper = args[0]
            # Prefer the source dumper's cursor when transferring between
            # servers; fall back to our own.
            cursor: HTTPCursor = (kwargs.get("dumper_src") or self).cursor
            query: str = kwargs.get("query_src") or kwargs.get("query")
            part: int = 1
            first_part, second_part = chunk_query(self.query_formatter(query))
            # Total step count for progress logging; +1 when the stream is
            # addressed by table name rather than by query.
            total_parts = len(sum((first_part, second_part), [])) + int(
                bool(kwargs.get("table_name") or kwargs.get("table_src"))
            )

            # NOTE(review): statements in first_part are only executed when
            # there is more than one — presumably a single entry is the
            # streaming query itself; confirm against chunk_query.
            if len(first_part) > 1:
                for query in first_part:
                    self.logger.info(f"Execute query {part}/{total_parts}")
                    cursor.execute(query)
                    part += 1

            if second_part:
                # Replace the original query kwarg with the first statement
                # of the tail; the rest run after the stream completes.
                for key in ("query", "query_src"):
                    if key in kwargs:
                        kwargs[key] = second_part.pop(0)
                        break

            self.logger.info(
                f"Execute stream {part}/{total_parts} [native mode]"
            )
            output = dump_method(*args, **kwargs)

            # Run any statements that followed the streaming query.
            if second_part:
                for query in second_part:
                    part += 1
                    self.logger.info(f"Execute query {part}/{total_parts}")
                    cursor.execute(query)

            # A truthy result signals a completed transfer; refresh the
            # HTTP session afterwards.
            if output:
                self.refresh()

            collect()
            return output

        return wrapper
143
+
144
+ def query_formatter(self, query: str) -> str | None:
145
+ """Reformat query."""
146
+
147
+ if not query:
148
+ return
149
+ return sql_format(sql=query, strip_comments=True).strip().strip(";")
150
+
151
+ @multiquery
152
+ def __read_dump(
153
+ self,
154
+ fileobj: BufferedWriter,
155
+ query: str | None,
156
+ table_name: str | None,
157
+ ) -> bool:
158
+ """Internal method read_dump for generate kwargs to decorator."""
159
+
160
+ if not query and not table_name:
161
+ error_message = "Query or table name not defined."
162
+ self.logger.error(f"NativeDumperValueError: {error_message}")
163
+ raise NativeDumperValueError(error_message)
164
+
165
+ if not query:
166
+ query = f"SELECT * FROM {table_name}"
167
+
168
+ self.logger.info(f"Start read from {self.connector.host}.")
169
+
170
+ try:
171
+ self.logger.info(
172
+ "Reading native dump with compression "
173
+ f"{self.compression_method.name}."
174
+ )
175
+ columns = make_columns(self.cursor.metadata(f"({query}\n)"))
176
+ source = DBMetadata(
177
+ name=self.dbname,
178
+ version=self.version,
179
+ columns=columns,
180
+ )
181
+ destination = DBMetadata(
182
+ name="file",
183
+ version=fileobj.name,
184
+ columns=columns,
185
+ )
186
+ self.logger.info(transfer_diagram(source, destination))
187
+ stream = self.cursor.get_response(query)
188
+ size = 0
189
+
190
+ while chunk := stream.read(CHUNK_SIZE):
191
+ size += fileobj.write(chunk)
192
+ del chunk
193
+
194
+ stream.close()
195
+ fileobj.close()
196
+ self.logger.info(f"Successfully read {size} bytes.")
197
+
198
+ if not size:
199
+ self.logger.warning("Empty data read!")
200
+
201
+ self.logger.info(f"Read from {self.connector.host} done.")
202
+ return True
203
+ except ClickhouseServerError as error:
204
+ raise error
205
+ except Exception as error:
206
+ self.logger.error(f"NativeDumperReadError: {error}")
207
+ raise NativeDumperReadError(error)
208
+
209
    @multiquery
    def __write_between(
        self,
        table_dest: str,
        table_src: str | None,
        query_src: str | None,
        dumper_src: Union["NativeDumper", object],
    ) -> bool:
        """Internal method write_between for generate kwargs to decorator.

        Transfers data into *table_dest* from one of three sources:
        the same server (dumper_src is None), another NativeDumper, or
        any foreign dumper object exposing a ``to_reader`` method.

        NOTE(review): only the foreign-dumper branch returns a value
        (``reader.close()``); the native path ends by delegating to
        write_dump and implicitly returns None despite the ``-> bool``
        annotation — the multiquery decorator then skips its refresh,
        but write_dump refreshes on its own.
        """

        if not query_src and not table_src:
            error_message = "Source query or table name not defined."
            self.logger.error(f"NativeDumperValueError: {error_message}")
            raise NativeDumperValueError(error_message)

        if not table_dest:
            error_message = "Destination table name not defined."
            self.logger.error(f"NativeDumperValueError: {error_message}")
            raise NativeDumperValueError(error_message)

        if not dumper_src:
            # Same-server copy: open a second session so the read stream
            # and the write stream do not share one connection.
            cursor = HTTPCursor(
                connector=self.connector,
                compression_method=self.compression_method,
                logger=self.logger,
                timeout=self.cursor.timeout,
            )
            src_dbname = self.dbname
            src_version = self.version
            self.logger.info(
                f"Set new connection for host {self.connector.host}."
            )
        elif dumper_src.__class__ is NativeDumper:
            # Another ClickHouse server: reuse the source dumper session.
            cursor = dumper_src.cursor
            src_dbname = dumper_src.dbname
            src_version = dumper_src.version
        else:
            # Foreign dumper object: pull rows through its reader and
            # feed them to from_rows instead of streaming Native bytes.
            if query_src:
                query_src = query_src.strip().strip(";")

            reader = dumper_src.to_reader(
                query=query_src,
                table_name=table_src,
            )
            dtype_data = reader.to_rows()
            self.from_rows(
                dtype_data=dtype_data,
                table_name=table_dest,
                source=dumper_src._dbmeta,
            )
            size = reader.tell()
            self.logger.info(f"Successfully sending {size} bytes.")

            if not size:
                self.logger.warning("Empty data send!")

            return reader.close()

        if not query_src:
            query_src = f"SELECT * FROM {table_src}"
        else:
            query_src = query_src.strip().strip(";")

        # Describe both ends for the transfer diagram in the log.
        source = DBMetadata(
            name=src_dbname,
            version=src_version,
            columns=make_columns(cursor.metadata(f"({query_src})")),
        )
        destination = DBMetadata(
            name=self.dbname,
            version=self.version,
            columns=make_columns(self.cursor.metadata(table_dest)),
        )
        self.logger.info(transfer_diagram(source, destination))
        # Stream Native bytes directly from the source response into the
        # destination table, recompressing if methods differ.
        stream = cursor.get_response(query_src)
        self.write_dump(stream, table_dest, cursor.compression_method)
285
+
286
+ @multiquery
287
+ def __to_reader(
288
+ self,
289
+ query: str | None,
290
+ table_name: str | None,
291
+ ) -> NativeReader:
292
+ """Internal method to_reader for generate kwargs to decorator."""
293
+
294
+ if not query and not table_name:
295
+ error_message = "Query or table name not defined."
296
+ self.logger.error(f"NativeDumperValueError: {error_message}")
297
+ raise NativeDumperValueError(error_message)
298
+
299
+ if not query:
300
+ query = f"SELECT * FROM {table_name}"
301
+
302
+ self.logger.info(
303
+ f"Get NativeReader object from {self.connector.host}."
304
+ )
305
+ self._dbmeta = DBMetadata(
306
+ name=self.dbname,
307
+ version=self.version,
308
+ columns=make_columns(self.cursor.metadata(f"({query}\n)")),
309
+ )
310
+ return self.cursor.get_stream(query)
311
+
312
+ def read_dump(
313
+ self,
314
+ fileobj: BufferedWriter,
315
+ query: str | None = None,
316
+ table_name: str | None = None,
317
+ ) -> bool:
318
+ """Read Native dump from Clickhouse."""
319
+
320
+ return self.__read_dump(
321
+ fileobj=fileobj,
322
+ query=query,
323
+ table_name=table_name,
324
+ )
325
+
326
+ def write_dump(
327
+ self,
328
+ fileobj: BufferedReader | BinaryIO,
329
+ table_name: str,
330
+ compression_method: CompressionMethod | None = None,
331
+ ) -> None:
332
+ """Write Native dump into Clickhouse."""
333
+
334
+ if not table_name:
335
+ error_message = "Table name not defined."
336
+ self.logger.error(f"NativeDumperValueError: {error_message}")
337
+ raise NativeDumperValueError(error_message)
338
+
339
+ self.logger.info(
340
+ f"Start write into {self.connector.host}.{table_name}."
341
+ )
342
+
343
+ try:
344
+ if not compression_method:
345
+ compression_method = auto_detector(fileobj)
346
+
347
+ if compression_method != self.compression_method:
348
+ reader = define_reader(fileobj, compression_method)
349
+ data = define_writer(
350
+ file_writer(reader),
351
+ self.compression_method,
352
+ )
353
+ else:
354
+ reader = fileobj
355
+ data = file_writer(reader)
356
+
357
+ self.cursor.upload_data(
358
+ table=table_name,
359
+ data=data,
360
+ )
361
+ collect()
362
+ size = reader.tell()
363
+ self.logger.info(f"Successfully sending {size} bytes.")
364
+
365
+ if not size:
366
+ self.logger.warning("Empty data send!")
367
+
368
+ reader.close()
369
+ except ClickhouseServerError as error:
370
+ raise error
371
+ except Exception as error:
372
+ self.logger.error(f"NativeDumperWriteError: {error}")
373
+ raise NativeDumperWriteError(error)
374
+
375
+ self.logger.info(
376
+ f"Write into {self.connector.host}.{table_name} done."
377
+ )
378
+ self.refresh()
379
+
380
+ def write_between(
381
+ self,
382
+ table_dest: str,
383
+ table_src: str | None = None,
384
+ query_src: str | None = None,
385
+ dumper_src: Union["NativeDumper", object] = None,
386
+ ) -> bool:
387
+ """Write between Clickhouse servers."""
388
+
389
+ return self.__write_between(
390
+ table_dest=table_dest,
391
+ table_src=table_src,
392
+ query_src=query_src,
393
+ dumper_src=dumper_src,
394
+ )
395
+
396
+ def to_reader(
397
+ self,
398
+ query: str | None = None,
399
+ table_name: str | None = None,
400
+ ) -> NativeReader:
401
+ """Get stream from Clickhouse as NativeReader object."""
402
+
403
+ return self.__to_reader(
404
+ query=query,
405
+ table_name=table_name,
406
+ )
407
+
408
+ def from_rows(
409
+ self,
410
+ dtype_data: Iterable[Any],
411
+ table_name: str,
412
+ source: DBMetadata | None = None,
413
+ ) -> None:
414
+ """Write from python list into Clickhouse table."""
415
+
416
+ if not table_name:
417
+ error_message = "Table name not defined."
418
+ self.logger.error(f"NativeDumperValueError: {error_message}")
419
+ raise NativeDumperValueError(error_message)
420
+
421
+ if not source:
422
+ source = DBMetadata(
423
+ name="python",
424
+ version="iterable object",
425
+ columns={"Unknown": "Unknown"},
426
+ )
427
+
428
+ column_list = self.cursor.metadata(table_name)
429
+ writer = NativeWriter(column_list)
430
+ data = define_writer(
431
+ writer.from_rows(dtype_data),
432
+ self.compression_method,
433
+ )
434
+
435
+ destination = DBMetadata(
436
+ name=self.dbname,
437
+ version=self.version,
438
+ columns=make_columns(column_list),
439
+ )
440
+
441
+ self.logger.info(transfer_diagram(source, destination))
442
+ collect()
443
+ self.logger.info(
444
+ f"Start write into {self.connector.host}.{table_name}."
445
+ )
446
+
447
+ try:
448
+ self.cursor.upload_data(
449
+ table=table_name,
450
+ data=data,
451
+ )
452
+ except ClickhouseServerError as error:
453
+ raise error
454
+ except Exception as error:
455
+ self.logger.error(f"NativeDumperWriteError: {error}")
456
+ raise NativeDumperWriteError(error)
457
+
458
+ self.logger.info(
459
+ f"Write into {self.connector.host}.{table_name} done."
460
+ )
461
+ self.refresh()
462
+
463
+ def from_pandas(
464
+ self,
465
+ data_frame: PdFrame,
466
+ table_name: str,
467
+ ) -> None:
468
+ """Write from pandas.DataFrame into Clickhouse table."""
469
+
470
+ self.from_rows(
471
+ dtype_data=iter(data_frame.values),
472
+ table_name=table_name,
473
+ source=DBMetadata(
474
+ name="pandas",
475
+ version="DataFrame",
476
+ columns=OrderedDict(zip(
477
+ data_frame.columns,
478
+ [str(dtype) for dtype in data_frame.dtypes],
479
+ )),
480
+ )
481
+ )
482
+
483
+ def from_polars(
484
+ self,
485
+ data_frame: PlFrame,
486
+ table_name: str,
487
+ ) -> None:
488
+ """Write from polars.DataFrame into Clickhouse table."""
489
+
490
+ self.from_rows(
491
+ dtype_data=data_frame.iter_rows(),
492
+ table_name=table_name,
493
+ source=DBMetadata(
494
+ name="polars",
495
+ version="DataFrame",
496
+ columns=OrderedDict(zip(
497
+ data_frame.columns,
498
+ [str(dtype) for dtype in data_frame.dtypes],
499
+ )),
500
+ )
501
+ )
502
+
503
+ def refresh(self) -> None:
504
+ """Refresh session."""
505
+
506
+ self.cursor.refresh()
507
+ self.logger.info(f"Connection to host {self.connector.host} updated.")
508
+
509
+ def close(self) -> None:
510
+ """Close session."""
511
+
512
+ self.cursor.close()
513
+ self.logger.info(f"Connection to host {self.connector.host} closed.")
@@ -0,0 +1 @@
1
+ __version__ = "0.3.5.2"
@@ -0,0 +1,198 @@
1
+ Metadata-Version: 2.4
2
+ Name: native-dumper
3
+ Version: 0.3.5.2
4
+ Summary: Library for read and write Native format between Clickhouse and file.
5
+ Home-page: https://0xmihalich.github.io/dbhose_airflow/classes/native_dumper/index.html
6
+ Author: 0xMihalich
7
+ Author-email: bayanmobile87@gmail.com
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Classifier: Programming Language :: Python :: 3.14
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: light-compressor==0.0.2.2
19
+ Requires-Dist: nativelib==0.2.2.6
20
+ Requires-Dist: sqlparse>=0.5.5
21
+ Dynamic: author
22
+ Dynamic: author-email
23
+ Dynamic: classifier
24
+ Dynamic: description
25
+ Dynamic: description-content-type
26
+ Dynamic: home-page
27
+ Dynamic: license-file
28
+ Dynamic: requires-dist
29
+ Dynamic: requires-python
30
+ Dynamic: summary
31
+
32
+ # NativeDumper
33
+
34
+ Library for read and write Native format between Clickhouse and file
35
+
36
+ ## Examples
37
+
38
+ ### Initialization
39
+
40
+ ```python
41
+ from native_dumper import (
42
+ CompressionMethod,
43
+ CHConnector,
44
+ NativeDumper,
45
+ )
46
+
47
+ connector = CHConnector(
48
+ host = <your host>,
49
+ dbname = <your database>,
50
+ user = <your username>,
51
+ password = <your password>,
52
+ port = 8123,
53
+ )
54
+
55
+ dumper = NativeDumper(
56
+ connector=connector,
57
+ compression_method=CompressionMethod.ZSTD, # or CompressionMethod.LZ4 or CompressionMethod.NONE
58
+ )
59
+ ```
60
+
61
+ ### Read dump from Clickhouse into file
62
+
63
+ ```python
64
+ file_name = "native.zstd"
65
+ # you need to define one of the parameters: query or table_name
66
+ query = "select ..." # some sql query
67
+ table_name = "default.test_table" # or some table
68
+
69
+ with open(file_name, "wb") as fileobj:
70
+ dumper.read_dump(
71
+ fileobj,
72
+ query,
73
+ table_name,
74
+ )
75
+ ```
76
+
77
+ ### Write dump from file into Clickhouse
78
+
79
+ ```python
80
+ file_name = "native.zstd"
81
+ # you need to define the table_name parameter
82
+ table_name = "default.test_table" # some table
83
+
84
+ with open(file_name, "rb") as fileobj:
85
+ dumper.write_dump(
86
+ fileobj,
87
+ table_name,
88
+ )
89
+ ```
90
+
91
+ ### Write from Clickhouse into Clickhouse
92
+
93
+ Same server
94
+
95
+ ```python
96
+
97
+ table_dest = "default.test_table_write" # some table for write
98
+ table_src = "default.test_table_read" # some table for read
99
+ query_src = "select ..." # or some sql query for read
100
+
101
+ dumper.write_between(
102
+ table_dest,
103
+ table_src,
104
+ query_src,
105
+ )
106
+ ```
107
+
108
+ Different servers
109
+
110
+ ```python
111
+
112
+ connector_src = CHConnector(
113
+ host = <host src>,
114
+ dbname = <database src>,
115
+ user = <username src>,
116
+ password = <password src>,
117
+ port = 8123,
118
+ )
119
+
120
+ dumper_src = NativeDumper(connector=connector_src)
121
+
122
+ table_dest = "default.test_table_write" # some table for write
123
+ table_src = "default.test_table_read" # some table for read
124
+ query_src = "select ..." # or some sql query for read
125
+
126
+ dumper.write_between(
127
+ table_dest,
128
+ table_src,
129
+ query_src,
130
+ dumper_src.cursor,
131
+ )
132
+ ```
133
+
134
+ ### Get NativeReader object from stream
135
+
136
+ ```python
137
+
138
+ table_name = "default.test_table_read" # some table for read
139
+ query = "select ..." # or some sql query for read
140
+
141
+ reader = dumper.to_reader(
142
+ query=query,
143
+ table_name=table_name,
144
+ )
145
+ ```
146
+
147
+ NativeReader has three methods available,
148
+ but only one of the methods is available at a time within a single session.
149
+
150
+ ```python
151
+ # read as python generator object
152
+ reader.to_rows()
153
+ # or read as pandas.DataFrame
154
+ reader.to_pandas()
155
+ # or read as polars.DataFrame
156
+ reader.to_polars()
157
+ ```
158
+
159
+ ### Write from python objects into target table
160
+
161
+ ```python
162
+ # some table for write data
163
+ table_name = "default.test_table_write"
164
+ dtype_data: Iterable[Any]
165
+ pandas_frame: pandas.DataFrame
166
+ polars_frame: polars.DataFrame
167
+
168
+ # write from python object
169
+ dumper.from_rows(dtype_data, table_name)
170
+ # write from pandas.DataFrame
171
+ dumper.from_pandas(pandas_frame, table_name)
172
+ # write from polars.DataFrame
173
+ dumper.from_polars(polars_frame, table_name)
174
+ ```
175
+
176
+ ### Open Native file format
177
+
178
+ For details, see my other repository: https://github.com/0xMihalich/nativelib
179
+
180
+ ## Installation
181
+
182
+ ### From pip
183
+
184
+ ```bash
185
+ pip install native-dumper
186
+ ```
187
+
188
+ ### From local directory
189
+
190
+ ```bash
191
+ pip install .
192
+ ```
193
+
194
+ ### From git
195
+
196
+ ```bash
197
+ pip install git+https://github.com/0xMihalich/native_dumper
198
+ ```