dbhose-airflow 0.0.0.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,426 @@
+ from __future__ import annotations
+ from os.path import dirname
+ from typing import (
+     Any,
+     TYPE_CHECKING,
+ )
+
+ from airflow.hooks.base import log
+ from dbhose_utils import DumpType
+ from light_compressor import CompressionMethod
+ from native_dumper import NativeDumper
+ from native_dumper.common import DBMS_DEFAULT_TIMEOUT_SEC
+ from pandas import DataFrame as PDFrame
+ from polars import DataFrame as PLFrame
+
+ from .airflow_connect import dbhose_dumper
+ from .dq_check import DQCheck
+ from .move_method import MoveMethod
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterable
+     from io import BufferedReader
+
+
+ __all__ = (
+     "DBMS_DEFAULT_TIMEOUT_SEC",
+     "CompressionMethod",
+     "DBHose",
+     "DumpType",
+     "MoveMethod",
+     "dbhose_dumper",
+ )
+ __version__ = "0.0.0.1"
+
+
+ root_path = dirname(__file__)
+ logo_path = f"{root_path}/LOGO"
+ ddl_path = f"{root_path}/ddl/{{}}.sql"
+ dq_path = f"{root_path}/dq/{{}}/{{}}.sql"
+ mv_path = f"{root_path}/move/{{}}/{{}}.sql"
+
+
+ def read_text(path: str) -> str:
+     """Read text from a file."""
+
+     with open(path, encoding="utf-8") as file:
+         return file.read()
+
+
+ def wrap_frame(
+     text: str,
+     min_width: int = 79,
+ ) -> str:
+     """Wrap text in a frame with a minimum width.
+
+     Arguments:
+         text (str): Text to wrap
+         min_width (int): Minimum frame width (default 79)
+
+     Returns:
+         str: Text in frame
+
+     """
+
+     lines = [line.strip() for line in str(text).split("\n") if line.strip()]
+     max_line_length = max(len(line) for line in lines) if lines else 0
+     content_width = max(
+         max_line_length, min_width - 4,
+     )
+     frame_width = content_width + 4
+     result = [""]
+     result.append("┌" + "─" * (frame_width - 2) + "┐")
+
+     for line in lines:
+         spaces_needed = content_width - len(line)
+         padded_line = f" {line}{' ' * spaces_needed} "
+         result.append("│" + padded_line + "│")
+
+     result.append("└" + "─" * (frame_width - 2) + "┘")
+     return "\n".join(result)
+
+
+ class DBHose:
+     """DBHose object."""
+
+     def __init__(
+         self,
+         table_dest: str,
+         connection_dest: str,
+         connection_src: str | None = None,
+         dq_skip_check: list[str] | None = None,
+         filter_by: list[str] | None = None,
+         drop_temp_table: bool = True,
+         move_method: MoveMethod = MoveMethod.replace,
+         custom_move: str | None = None,
+         compress_method: CompressionMethod = CompressionMethod.ZSTD,
+         timeout: int = DBMS_DEFAULT_TIMEOUT_SEC,
+     ) -> None:
+         """Class initialization."""
+
+         self.logger = log
+         self.table_dest = table_dest
+         self.connection_dest = connection_dest
+         self.connection_src = connection_src
+         self.dq_skip_check = dq_skip_check or []
+         self.filter_by = filter_by or []
+         self.drop_temp_table = drop_temp_table
+         self.move_method = move_method
+         self.custom_move = custom_move
+         self.dumper_dest = dbhose_dumper(
+             self.connection_dest,
+             compress_method,
+             timeout,
+         )
+         self.dumper_src = None
+         self.ddl = None
+         self.temp_ddl = None
+         self.table_temp = None
+
+         if self.connection_src:
+             self.dumper_src = dbhose_dumper(
+                 self.connection_src,
+                 compress_method,
+                 timeout,
+             )
+
+         self.logger.info(read_text(logo_path))
+
+     def create_temp(self) -> None:
+         """Create temporary table."""
+
+         self.logger.info("Make temp table operation start")
+         query_ddl = read_text(ddl_path.format(self.dumper_dest.dbname))
+         self.logger.info("Getting data from server")
+         reader = self.dumper_dest.to_reader(
+             query_ddl.format(table=self.table_dest)
+         )
+         self.ddl, self.temp_ddl, self.table_temp = tuple(*reader.to_rows())
+
+         if not self.ddl:
+             msg = f"Table {self.table_dest} not found!"
+             self.logger.error(wrap_frame(msg))
+             raise ValueError(msg)
+
+         self.logger.info(f"Make table {self.table_temp}")
+         self.dumper_dest.cursor.execute(self.temp_ddl)
+
+         if self.dumper_dest.__class__ is not NativeDumper:
+             self.dumper_dest.connect.commit()
+
+         self.logger.info(wrap_frame(f"Table {self.table_temp} created"))
+
+     def drop_temp(self) -> None:
+         """Drop temp table."""
+
+         if self.drop_temp_table:
+             self.logger.info("Drop temp table operation start")
+             self.dumper_dest.cursor.execute(
+                 f"drop table if exists {self.table_temp}"
+             )
+
+             if self.dumper_dest.__class__ is not NativeDumper:
+                 self.dumper_dest.connect.commit()
+
+             self.logger.info(wrap_frame(f"Table {self.table_temp} dropped"))
+         else:
+             self.logger.warning(
+                 wrap_frame("Drop temp table operation skipped by user")
+             )
+
+     def dq_check(self, table: str | None = None) -> None:
+         """Data quality checker."""
+
+         self.logger.info(wrap_frame("Start Data Quality tests"))
+
+         for test in DQCheck._member_names_:
+             dq = DQCheck[test]
+
+             if test in self.dq_skip_check:
+                 self.logger.warning(
+                     wrap_frame(f"{dq.description} test skipped by user")
+                 )
+                 continue
+             if dq.need_source_table and not table:
+                 self.logger.warning(
+                     wrap_frame(
+                         f"{dq.description} test skipped [no source object]"
+                     ),
+                 )
+                 continue
+
+             query_dest = read_text(
+                 dq_path.format(self.dumper_dest.dbname, test),
+             )
+
+             if dq.need_source_table:
+                 dumper_src = self.dumper_src or self.dumper_dest
+                 query_src = read_text(
+                     dq_path.format(dumper_src.dbname, test),
+                 )
+
+                 if dq.generate_queries:
+                     reader_src = dumper_src.to_reader(
+                         query_src.format(table=table),
+                     )
+                     tests_src = list(reader_src.to_rows())
+                     have_test = next(iter(tests_src))
+
+                     if not have_test:
+                         self.logger.warning(
+                             wrap_frame(f"{dq.description} test Skip "
+                                        "[no data types for test]"),
+                         )
+                         continue
+
+                     reader_dest = self.dumper_dest.to_reader(
+                         query_dest.format(table=self.table_temp),
+                     )
+                     tests_dest = list(reader_dest.to_rows())
+
+                     for (_, column_src, test_src) in tests_src:
+                         for (_, column_dest, test_dest) in tests_dest:
+                             if column_src == column_dest:
+                                 reader_src = dumper_src.to_reader(test_src)
+                                 reader_dest = self.dumper_dest.to_reader(
+                                     test_dest,
+                                 )
+                                 value_src = next(iter(*reader_src.to_rows()))
+                                 value_dst = next(iter(*reader_dest.to_rows()))
+
+                                 if value_src != value_dst:
+                                     err_msg = (
+                                         f"Check {column_src} test Fail: "
+                                         f"value {value_src} <> {value_dst}"
+                                     )
+                                     self.logger.error(wrap_frame(err_msg))
+                                     raise ValueError(err_msg)
+
+                                 self.logger.info(
+                                     wrap_frame(
+                                         f"Check {column_src} test Pass"
+                                     ),
+                                 )
+                                 break
+                         else:
+                             self.logger.warning(
+                                 wrap_frame(f"Check {column_src} test Skip "
+                                            "[no column for test]"),
+                             )
+                 else:
+                     reader_src = dumper_src.to_reader(
+                         query_src.format(table=table),
+                     )
+                     reader_dest = self.dumper_dest.to_reader(
+                         query_dest.format(table=self.table_temp),
+                     )
+                     value_src = next(iter(reader_src.to_rows()))[0]
+                     value_dst = next(iter(reader_dest.to_rows()))[0]
+
+                     if value_src != value_dst:
+                         err_msg = (
+                             f"{dq.description} test Fail: "
+                             f"value {value_src} <> {value_dst}"
+                         )
+                         self.logger.error(wrap_frame(err_msg))
+                         raise ValueError(err_msg)
+
+             else:
+                 reader_dest = self.dumper_dest.to_reader(
+                     query_dest.format(table=self.table_temp),
+                 )
+
+                 if dq.generate_queries:
+                     tests = list(reader_dest.to_rows())
+
+                     for (have_test, column_name, query) in tests:
+
+                         if not have_test:
+                             self.logger.warning(
+                                 wrap_frame(f"{dq.description} test Skip "
+                                            "[no column for test]"),
+                             )
+                             break
+
+                         reader_dest = self.dumper_dest.to_reader(query)
+                         value, result = next(iter(reader_dest.to_rows()))
+
+                         if result == "Fail":
+                             err_msg = (
+                                 f"Check {column_name} test Fail "
+                                 f"with {value} error rows"
+                             )
+                             self.logger.error(wrap_frame(err_msg))
+                             raise ValueError(err_msg)
+
+                         self.logger.info(
+                             wrap_frame(f"Check {column_name} test Pass"),
+                         )
+                 else:
+                     value, result = next(iter(reader_dest.to_rows()))
+
+                     if result == "Fail":
+                         err_msg = (
+                             f"{dq.description} test Fail "
+                             f"with {value} error rows"
+                         )
+                         self.logger.error(wrap_frame(err_msg))
+                         raise ValueError(err_msg)
+
+             self.logger.info(wrap_frame(f"{dq.description} test Pass"))
+
+         self.logger.info(
+             wrap_frame("All Data Quality tests have been completed")
+         )
+
+     def to_table(self) -> None:
+         """Move data to destination table."""
+
+         self.logger.info(
+             wrap_frame(f"Move data with method {self.move_method.name}")
+         )
+
+         if self.move_method.need_filter and not self.filter_by:
+             error_msg = "You must specify columns in filter_by"
+             self.logger.error(wrap_frame(error_msg))
+             raise ValueError(error_msg)
+
+         if self.move_method.is_custom:
+
+             if not self.custom_move:
+                 error_msg = "You must specify custom query"
+                 self.logger.error(wrap_frame(error_msg))
+                 raise ValueError(error_msg)
+
+             self.dumper_dest.cursor.execute(self.custom_move)
+
+             if self.dumper_dest.__class__ is not NativeDumper:
+                 self.dumper_dest.connect.commit()
+
+         elif self.move_method.have_sql:
+             move_query = read_text(
+                 mv_path.format(self.dumper_dest.dbname, self.move_method.name)
+             )
+             reader = self.dumper_dest.to_reader(move_query.format(
+                 table_dest=self.table_dest,
+                 table_temp=self.table_temp,
+                 filter_by=self.filter_by,
+             ))
+             is_available, move_query = tuple(*reader.to_rows())
+
+             if not is_available:
+                 error_msg = (
+                     f"Method {self.move_method.name} is not available for "
+                     f"{self.table_dest}. Use another method."
+                 )
+                 self.logger.error(wrap_frame(error_msg))
+                 raise ValueError(error_msg)
+
+             self.dumper_dest.cursor.execute(move_query)
+
+             if self.dumper_dest.__class__ is not NativeDumper:
+                 self.dumper_dest.connect.commit()
+
+         else:
+             self.dumper_dest.write_between(self.table_dest, self.table_temp)
+
+         self.logger.info(wrap_frame(f"Data moved into {self.table_dest}"))
+         self.drop_temp()
+
+     def from_file(
+         self,
+         fileobj: BufferedReader,
+     ) -> None:
+         """Upload from dump file object."""
+
+         self.create_temp()
+         self.dumper_dest.write_dump(fileobj, self.table_temp)
+         self.dq_check()
+         self.to_table()
+
+     def from_iterable(
+         self,
+         dtype_data: Iterable[Any],
+     ) -> None:
+         """Upload from python iterable object."""
+
+         self.create_temp()
+         self.dumper_dest.from_rows(dtype_data, self.table_temp)
+         self.dq_check()
+         self.to_table()
+
+     def from_frame(
+         self,
+         data_frame: PDFrame | PLFrame,
+     ) -> None:
+         """Upload from DataFrame."""
+
+         self.create_temp()
+
+         if data_frame.__class__ is PDFrame:
+             self.dumper_dest.from_pandas(data_frame, self.table_temp)
+         elif data_frame.__class__ is PLFrame:
+             self.dumper_dest.from_polars(data_frame, self.table_temp)
+         else:
+             msg = f"Unknown DataFrame type {data_frame.__class__}."
+             raise TypeError(msg)
+
+         self.dq_check()
+         self.to_table()
+
+     def from_dbms(
+         self,
+         query: str | None = None,
+         table: str | None = None,
+     ) -> None:
+         """Upload from DBMS."""
+
+         self.create_temp()
+         self.dumper_dest.write_between(
+             self.table_temp,
+             table,
+             query,
+             self.dumper_src,
+         )
+         self.dq_check(table)
+         self.to_table()
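
The hunk above appears to be the package's top-level module: it exposes the public API (`DBHose`, `dbhose_dumper`, `MoveMethod`) declared in `__all__`. Below is a minimal sketch of how it might be driven from an Airflow task; the importable module name `dbhose_airflow`, the connection IDs, the table names, and the TaskFlow wiring are all assumptions, not taken from this diff.

    # Hypothetical usage sketch; connection IDs and table names are invented.
    from airflow.decorators import task

    from dbhose_airflow import DBHose, MoveMethod  # module name assumed


    @task
    def load_events() -> None:
        hose = DBHose(
            table_dest="dwh.events",         # destination table
            connection_dest="gp_dwh",        # Airflow connection ID (destination)
            connection_src="ch_events",      # Airflow connection ID (source)
            move_method=MoveMethod.replace,  # replaced via move/<dbname>/replace.sql
        )
        # create_temp -> load -> dq_check -> to_table, per the methods above
        hose.from_dbms(table="default.events")

Every `from_*` entry point follows the same pipeline: create the temp table, load into it, run the data-quality checks, then move rows into the destination table.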
@@ -0,0 +1,28 @@
+ from __future__ import annotations
+ from typing import TYPE_CHECKING
+
+ from airflow.hooks.base import BaseHook
+ from light_compressor import CompressionMethod
+ from native_dumper.common import DBMS_DEFAULT_TIMEOUT_SEC
+
+ from .dumper import DBHoseDumpParams
+
+ if TYPE_CHECKING:
+     from airflow.models.connection import Connection
+     from native_dumper import NativeDumper
+     from pgpack_dumper import PGPackDumper
+
+
+ def dbhose_dumper(
+     airflow_connection: str,
+     compress_method: CompressionMethod = CompressionMethod.ZSTD,
+     timeout: int = DBMS_DEFAULT_TIMEOUT_SEC,
+ ) -> NativeDumper | PGPackDumper:
+     """Make a Dumper object from an Airflow connection ID."""
+
+     connection: Connection = BaseHook.get_connection(airflow_connection)
+     return DBHoseDumpParams[connection.conn_type].from_airflow(
+         connection=connection,
+         compress_method=compress_method,
+         timeout=timeout,
+     )
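
This hunk resolves an Airflow connection into a concrete dumper by indexing `DBHoseDumpParams` with the connection's `conn_type`. A short sketch, again with an assumed module name and a hypothetical connection ID:

    from dbhose_airflow import dbhose_dumper  # module name assumed

    # "pg_dwh" is a hypothetical Airflow connection with conn_type "postgres",
    # so the lookup selects PGPackDumper (see the dumper hunk below).
    dumper = dbhose_dumper("pg_dwh")
    print(type(dumper).__name__)  # PGPackDumper

An unknown `conn_type` raises `KeyError` on the enum lookup, so unsupported connection types fail immediately rather than mid-transfer.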
@@ -0,0 +1,22 @@
+ from enum import Enum
+ from typing import NamedTuple
+
+
+ class DQTest(NamedTuple):
+     """Data quality test."""
+
+     description: str
+     generate_queries: int
+     need_source_table: int
+
+
+ class DQCheck(DQTest, Enum):
+     """Enum of available tests."""
+
+     empty = DQTest("Table is not empty", 0, 0)
+     uniq = DQTest("Table doesn't have any duplicate rows", 0, 0)
+     future = DQTest("Table doesn't have dates from the future", 1, 0)
+     infinity = DQTest("Table doesn't have infinity values", 1, 0)
+     nan = DQTest("Table doesn't have NaN values", 1, 0)
+     total = DQTest("Equal total row count between objects", 0, 1)
+     sum = DQTest("Equal sums in numeric columns between objects", 1, 1)
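
These flags drive the branching in `DBHose.dq_check` above: `need_source_table` gates tests that compare against a source object, and `generate_queries` marks tests whose per-column queries come from the packaged `dq/` SQL templates. A sketch of reading the enum the same way `dq_check` does (the import path is assumed from the relative import in the first hunk):

    from dbhose_airflow.dq_check import DQCheck  # import path assumed

    for name in DQCheck._member_names_:
        dq = DQCheck[name]
        print(name, dq.description,
              bool(dq.generate_queries), bool(dq.need_source_table))
    # e.g. "total" needs a source object; "sum" needs one and also
    # generates per-column comparison queries.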
@@ -0,0 +1,68 @@
+ from __future__ import annotations
+ from enum import Enum
+ from typing import (
+     NamedTuple,
+     TYPE_CHECKING,
+ )
+
+ from airflow.hooks.base import log
+ from light_compressor import CompressionMethod
+ from native_dumper import (
+     CHConnector,
+     NativeDumper,
+ )
+ from native_dumper.common import DBMS_DEFAULT_TIMEOUT_SEC
+ from pgpack_dumper import (
+     PGConnector,
+     PGPackDumper,
+ )
+
+ if TYPE_CHECKING:
+     from airflow.models import Connection
+
+
+ class DBHoseObject(NamedTuple):
+     """DBHoseDump init params."""
+
+     name: str
+     connection: type[CHConnector] | type[PGConnector]
+     dumper: type[NativeDumper] | type[PGPackDumper]
+
+     def from_airflow(
+         self,
+         connection: Connection,
+         compress_method: CompressionMethod = CompressionMethod.ZSTD,
+         timeout: int = DBMS_DEFAULT_TIMEOUT_SEC,
+     ) -> NativeDumper | PGPackDumper:
+         """Init dumper from an Airflow connection object."""
+
+         params = {
+             "compression_method": compress_method,
+             "logger": log,
+         }
+
+         if self.connection is CHConnector and connection.port == 9000:
+             port = 8123
+             params["timeout"] = timeout
+         else:
+             port = connection.port
+
+         dbhose_connector = self.connection(
+             connection.host,
+             connection.schema,
+             connection.login,
+             connection.password,
+             port,
+         )
+
+         return self.dumper(dbhose_connector, **params)
+
+
+ class DBHoseDumpParams(DBHoseObject, Enum):
+     """Enum of available DBHose dumpers keyed by Airflow conn_type."""
+
+     clickhouse = DBHoseObject("clickhouse", CHConnector, NativeDumper)
+     ftp = DBHoseObject("ftp", CHConnector, NativeDumper)
+     http = DBHoseObject("http", CHConnector, NativeDumper)
+     postgres = DBHoseObject("postgres", PGConnector, PGPackDumper)
+     greenplum = DBHoseObject("greenplum", PGConnector, PGPackDumper)
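
`DBHoseDumpParams` maps an Airflow `conn_type` string onto a (connector class, dumper class) pair; `from_airflow` then builds the connector from the connection's host, schema, login, and password, and for ClickHouse connections stored with the native port 9000 it silently switches to the HTTP port 8123. A sketch of the lookup (import path assumed):

    from dbhose_airflow.dumper import DBHoseDumpParams  # import path assumed

    entry = DBHoseDumpParams["clickhouse"]
    print(entry.connection.__name__, entry.dumper.__name__)
    # CHConnector NativeDumper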
@@ -0,0 +1,20 @@
+ from enum import Enum
+ from typing import NamedTuple
+
+
+ class MoveType(NamedTuple):
+     """Move method object."""
+
+     name: str
+     have_sql: bool
+     need_filter: bool
+     is_custom: bool
+
+
+ class MoveMethod(MoveType, Enum):
+     """Methods for inserting from the temp table."""
+
+     append = MoveType("append", False, False, False)
+     custom = MoveType("custom", False, False, True)
+     delete = MoveType("delete", True, True, False)
+     replace = MoveType("replace", True, False, False)
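
The three booleans select the branch taken in `DBHose.to_table`: `is_custom` runs the user-supplied `custom_move` SQL, `have_sql` renders `move/<dbname>/<method>.sql` from the package, and a method with neither falls through to a plain `write_between`. A sketch (import path assumed):

    from dbhose_airflow.move_method import MoveMethod  # import path assumed

    method = MoveMethod.delete
    assert method.have_sql and method.need_filter  # delete requires filter_by
    assert not MoveMethod.append.have_sql          # append copies rows directly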