dbhose_airflow-0.0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dbhose_airflow/__init__.py +426 -0
- dbhose_airflow/airflow_connect.py +28 -0
- dbhose_airflow/dq_check.py +22 -0
- dbhose_airflow/dumper.py +68 -0
- dbhose_airflow/move_method.py +20 -0
- dbhose_airflow-0.0.0.1.dist-info/METADATA +432 -0
- dbhose_airflow-0.0.0.1.dist-info/RECORD +11 -0
- dbhose_airflow-0.0.0.1.dist-info/WHEEL +5 -0
- dbhose_airflow-0.0.0.1.dist-info/licenses/CHANGELOG.md +5 -0
- dbhose_airflow-0.0.0.1.dist-info/licenses/README.md +410 -0
- dbhose_airflow-0.0.0.1.dist-info/top_level.txt +1 -0
dbhose_airflow/__init__.py
ADDED

@@ -0,0 +1,426 @@

from __future__ import annotations
from os.path import dirname
from typing import (
    Any,
    TYPE_CHECKING,
)

from airflow.hooks.base import log
from dbhose_utils import DumpType
from light_compressor import CompressionMethod
from native_dumper import NativeDumper
from native_dumper.common import DBMS_DEFAULT_TIMEOUT_SEC
from pandas import DataFrame as PDFrame
from polars import DataFrame as PLFrame

from .airflow_connect import dbhose_dumper
from .dq_check import DQCheck
from .move_method import MoveMethod

if TYPE_CHECKING:
    from collections.abc import Iterable
    from io import BufferedReader


__all__ = (
    "DBMS_DEFAULT_TIMEOUT_SEC",
    "CompressionMethod",
    "DBHose",
    "DumpType",
    "MoveMethod",
    "dbhose_dumper",
)
__version__ = "0.0.0.1"


root_path = dirname(__file__)
logo_path = f"{root_path}/LOGO"
ddl_path = f"{root_path}/ddl/{{}}.sql"
dq_path = f"{root_path}/dq/{{}}/{{}}.sql"
mv_path = f"{root_path}/move/{{}}/{{}}.sql"


def read_text(path: str) -> str:
    """Read text from file."""

    with open(path, encoding="utf-8") as file:
        return file.read()


def wrap_frame(
    text: str,
    min_width: int = 79,
) -> str:
    """Wrap text in a frame with a minimum width.

    Arguments:
        text (str): Text to wrap
        min_width (int): Minimum frame width (default 79)

    Returns:
        str: Text in frame

    """

    lines = [line.strip() for line in str(text).split("\n") if line.strip()]
    max_line_length = max(len(line) for line in lines) if lines else 0
    content_width = max(
        max_line_length, min_width - 4,
    )
    frame_width = content_width + 4
    result = [""]
    result.append("┌" + "─" * (frame_width - 2) + "┐")

    for line in lines:
        spaces_needed = content_width - len(line)
        padded_line = f" {line}{' ' * spaces_needed} "
        result.append("│" + padded_line + "│")

    result.append("└" + "─" * (frame_width - 2) + "┘")
    return "\n".join(result)


class DBHose:
    """DBHose object."""

    def __init__(
        self,
        table_dest: str,
        connection_dest: str,
        connection_src: str | None = None,
        dq_skip_check: list[str] | None = None,
        filter_by: list[str] | None = None,
        drop_temp_table: bool = True,
        move_method: MoveMethod = MoveMethod.replace,
        custom_move: str | None = None,
        compress_method: CompressionMethod = CompressionMethod.ZSTD,
        timeout: int = DBMS_DEFAULT_TIMEOUT_SEC,
    ) -> None:
        """Class initialization."""

        self.logger = log
        self.table_dest = table_dest
        self.connection_dest = connection_dest
        self.connection_src = connection_src
        self.dq_skip_check = dq_skip_check or []
        self.filter_by = filter_by or []
        self.drop_temp_table = drop_temp_table
        self.move_method = move_method
        self.custom_move = custom_move
        self.dumper_dest = dbhose_dumper(
            self.connection_dest,
            compress_method,
            timeout,
        )
        self.dumper_src = None
        self.ddl = None
        self.temp_ddl = None
        self.table_temp = None

        if self.connection_src:
            self.dumper_src = dbhose_dumper(
                self.connection_src,
                compress_method,
                timeout,
            )

        self.logger.info(read_text(logo_path))

    def create_temp(self) -> None:
        """Create temporary table."""

        self.logger.info("Make temp table operation start")
        query_ddl = read_text(ddl_path.format(self.dumper_dest.dbname))
        self.logger.info("Getting data from server")
        reader = self.dumper_dest.to_reader(
            query_ddl.format(table=self.table_dest)
        )
        self.ddl, self.temp_ddl, self.table_temp = tuple(*reader.to_rows())

        if not self.ddl:
            msg = f"Table {self.table_dest} not found!"
            self.logger.error(wrap_frame(msg))
            raise ValueError(msg)

        self.logger.info(f"Make table {self.table_temp}")
        self.dumper_dest.cursor.execute(self.temp_ddl)

        if self.dumper_dest.__class__ is not NativeDumper:
            self.dumper_dest.connect.commit()

        self.logger.info(wrap_frame(f"Table {self.table_temp} created"))

    def drop_temp(self) -> None:
        """Drop temp table."""

        if self.drop_temp_table:
            self.logger.info("Drop temp table operation start")
            self.dumper_dest.cursor.execute(
                f"drop table if exists {self.table_temp}"
            )

            if self.dumper_dest.__class__ is not NativeDumper:
                self.dumper_dest.connect.commit()

            self.logger.info(wrap_frame(f"Table {self.table_temp} dropped"))
        else:
            self.logger.warning(
                wrap_frame("Drop temp table operation skipped by user")
            )

    def dq_check(self, table: str | None = None) -> None:
        """Data quality checker."""

        self.logger.info(wrap_frame("Start Data Quality tests"))

        for test in DQCheck._member_names_:
            dq = DQCheck[test]

            if test in self.dq_skip_check:
                self.logger.warning(
                    wrap_frame(f"{dq.description} test skipped by user")
                )
                continue
            if dq.need_source_table and not table:
                self.logger.warning(
                    wrap_frame(
                        f"{dq.description} test skipped [no source object]"
                    ),
                )
                continue

            query_dest = read_text(
                dq_path.format(self.dumper_dest.dbname, test),
            )

            if dq.need_source_table:
                dumper_src = self.dumper_src or self.dumper_dest
                query_src = read_text(
                    dq_path.format(dumper_src.dbname, test),
                )

                if dq.generate_queries:
                    reader_src = dumper_src.to_reader(
                        query_src.format(table=table),
                    )
                    tests_src = list(reader_src.to_rows())
                    have_test = next(iter(tests_src))[0]

                    if not have_test:
                        self.logger.warning(
                            wrap_frame(f"{dq.description} test Skip "
                                       "[no data types for test]"),
                        )
                        continue

                    reader_dest = self.dumper_dest.to_reader(
                        query_dest.format(table=self.table_temp),
                    )
                    tests_dest = list(reader_dest.to_rows())

                    for (_, column_src, test_src) in tests_src:
                        for (_, column_dest, test_dest) in tests_dest:
                            if column_src == column_dest:
                                reader_src = dumper_src.to_reader(test_src)
                                reader_dest = self.dumper_dest.to_reader(
                                    test_dest,
                                )
                                value_src = next(iter(*reader_src.to_rows()))
                                value_dst = next(iter(*reader_dest.to_rows()))

                                if value_src != value_dst:
                                    err_msg = (
                                        f"Check {column_src} test Fail: "
                                        f"value {value_src} <> {value_dst}"
                                    )
                                    self.logger.error(wrap_frame(err_msg))
                                    raise ValueError(err_msg)

                                self.logger.info(
                                    wrap_frame(
                                        f"Check {column_src} test Pass"
                                    ),
                                )
                                break
                        else:
                            self.logger.warning(
                                wrap_frame(f"Check {column_src} test Skip "
                                           "[no column for test]"),
                            )
                else:
                    reader_src = dumper_src.to_reader(
                        query_src.format(table=table),
                    )
                    reader_dest = self.dumper_dest.to_reader(
                        query_dest.format(table=self.table_temp),
                    )
                    value_src = next(iter(reader_src.to_rows()))[0]
                    value_dst = next(iter(reader_dest.to_rows()))[0]

                    if value_src != value_dst:
                        err_msg = (
                            f"{dq.description} test Fail: "
                            f"value {value_src} <> {value_dst}"
                        )
                        self.logger.error(wrap_frame(err_msg))
                        raise ValueError(err_msg)

            else:
                reader_dest = self.dumper_dest.to_reader(
                    query_dest.format(table=self.table_temp),
                )

                if dq.generate_queries:
                    tests = list(reader_dest.to_rows())

                    for (have_test, column_name, query) in tests:

                        if not have_test:
                            self.logger.warning(
                                wrap_frame(f"{dq.description} test Skip "
                                           "[no column for test]"),
                            )
                            break

                        reader_dest = self.dumper_dest.to_reader(query)
                        value, result = next(iter(reader_dest.to_rows()))

                        if result == "Fail":
                            err_msg = (
                                f"Check {column_name} test Fail "
                                f"with {value} error rows"
                            )
                            self.logger.error(wrap_frame(err_msg))
                            raise ValueError(err_msg)

                        self.logger.info(
                            wrap_frame(f"Check {column_name} test Pass"),
                        )
                else:
                    value, result = next(iter(reader_dest.to_rows()))

                    if result == "Fail":
                        err_msg = (
                            f"{dq.description} test Fail "
                            f"with {value} error rows"
                        )
                        self.logger.error(wrap_frame(err_msg))
                        raise ValueError(err_msg)

                    self.logger.info(wrap_frame(f"{dq.description} test Pass"))

        self.logger.info(
            wrap_frame("All Data Quality tests have been completed")
        )

    def to_table(self) -> None:
        """Move data to destination table."""

        self.logger.info(
            wrap_frame(f"Move data with method {self.move_method.name}")
        )

        if self.move_method.need_filter and not self.filter_by:
            error_msg = "You must specify columns in filter_by"
            self.logger.error(wrap_frame(error_msg))
            raise ValueError(error_msg)

        if self.move_method.is_custom:

            if not self.custom_move:
                error_msg = "You must specify custom query"
                self.logger.error(wrap_frame(error_msg))
                raise ValueError(error_msg)

            self.dumper_dest.cursor.execute(self.custom_move)

            if self.dumper_dest.__class__ is not NativeDumper:
                self.dumper_dest.connect.commit()

        elif self.move_method.have_sql:
            move_query = read_text(
                mv_path.format(self.dumper_dest.dbname, self.move_method.name)
            )
            reader = self.dumper_dest.to_reader(move_query.format(
                table_dest=self.table_dest,
                table_temp=self.table_temp,
                filter_by=self.filter_by,
            ))
            is_available, move_query = tuple(*reader.to_rows())

            if not is_available:
                error_msg = (
                    f"Method {self.move_method.name} is not available for "
                    f"{self.table_dest}. Use another method."
                )
                self.logger.error(wrap_frame(error_msg))
                raise ValueError(error_msg)

            self.dumper_dest.cursor.execute(move_query)

            if self.dumper_dest.__class__ is not NativeDumper:
                self.dumper_dest.connect.commit()

        else:
            self.dumper_dest.write_between(self.table_dest, self.table_temp)

        self.logger.info(wrap_frame(f"Data moved into {self.table_dest}"))
        self.drop_temp()

    def from_file(
        self,
        fileobj: BufferedReader,
    ) -> None:
        """Upload from dump file object."""

        self.create_temp()
        self.dumper_dest.write_dump(fileobj, self.table_temp)
        self.dq_check()
        self.to_table()

    def from_iterable(
        self,
        dtype_data: Iterable[Any],
    ) -> None:
        """Upload from python iterable object."""

        self.create_temp()
        self.dumper_dest.from_rows(dtype_data, self.table_temp)
        self.dq_check()
        self.to_table()

    def from_frame(
        self,
        data_frame: PDFrame | PLFrame,
    ) -> None:
        """Upload from DataFrame."""

        self.create_temp()

        if data_frame.__class__ is PDFrame:
            self.dumper_dest.from_pandas(data_frame, self.table_temp)
        elif data_frame.__class__ is PLFrame:
            self.dumper_dest.from_polars(data_frame, self.table_temp)
        else:
            msg = f"Unknown DataFrame type {data_frame.__class__}."
            raise TypeError(msg)

        self.dq_check()
        self.to_table()

    def from_dbms(
        self,
        query: str | None = None,
        table: str | None = None,
    ) -> None:
        """Upload from DBMS."""

        self.create_temp()
        self.dumper_dest.write_between(
            self.table_temp,
            table,
            query,
            self.dumper_src,
        )
        self.dq_check(table)
        self.to_table()
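
Taken together, __init__.py implements one pipeline: create_temp() builds a temporary table from the destination's DDL template, a from_* method writes data into it, dq_check() runs the data-quality suite, and to_table() moves the rows into the destination and drops the temp table. A minimal usage sketch, not part of the package itself (the connection IDs and table names are hypothetical):

from dbhose_airflow import DBHose, MoveMethod

# Hypothetical Airflow connection IDs; they must exist in the
# Airflow connections store for dbhose_dumper() to resolve them.
hose = DBHose(
    table_dest="dwh.events",
    connection_dest="gp_dwh",
    connection_src="ch_events",
    move_method=MoveMethod.replace,
)

# Copies source rows into a temp table, runs the DQ suite against
# "default.events", then moves the data into dwh.events.
hose.from_dbms(table="default.events")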
dbhose_airflow/airflow_connect.py
ADDED

@@ -0,0 +1,28 @@

from __future__ import annotations
from typing import TYPE_CHECKING

from airflow.hooks.base import BaseHook
from light_compressor import CompressionMethod
from native_dumper.common import DBMS_DEFAULT_TIMEOUT_SEC

from .dumper import DBHoseDumpParams

if TYPE_CHECKING:
    from airflow.models.connection import Connection
    from native_dumper import NativeDumper
    from pgpack_dumper import PGPackDumper


def dbhose_dumper(
    airflow_connection: str,
    compress_method: CompressionMethod = CompressionMethod.ZSTD,
    timeout: int = DBMS_DEFAULT_TIMEOUT_SEC,
) -> NativeDumper | PGPackDumper:
    """Make Dumper object from Airflow connection string."""

    connection: Connection = BaseHook.get_connection(airflow_connection)
    return DBHoseDumpParams[connection.conn_type].from_airflow(
        connection=connection,
        compress_method=compress_method,
        timeout=timeout,
    )
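
dbhose_dumper() is the factory behind both DBHose connections: the conn_type of the named Airflow connection indexes DBHoseDumpParams (defined in dumper.py below), which builds the matching connector and dumper pair. A sketch, assuming a connection with the hypothetical ID "my_postgres" and conn_type "postgres" exists:

from dbhose_airflow.airflow_connect import dbhose_dumper
from light_compressor import CompressionMethod

# "my_postgres" is a hypothetical Airflow connection ID; its
# conn_type selects the connector/dumper via DBHoseDumpParams.
dumper = dbhose_dumper(
    "my_postgres",
    compress_method=CompressionMethod.ZSTD,
    timeout=600,
)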
dbhose_airflow/dq_check.py
ADDED

@@ -0,0 +1,22 @@

from enum import Enum
from typing import NamedTuple


class DQTest(NamedTuple):
    """Data quality test."""

    description: str
    generate_queries: int
    need_source_table: int


class DQCheck(DQTest, Enum):
    """Enum for available tests."""

    empty = DQTest("Table not empty", 0, 0)
    uniq = DQTest("Table doesn't have duplicate rows", 0, 0)
    future = DQTest("Table doesn't have dates from the future", 1, 0)
    infinity = DQTest("Table doesn't have infinity values", 1, 0)
    nan = DQTest("Table doesn't have NaN values", 1, 0)
    total = DQTest("Equal total row count between objects", 0, 1)
    sum = DQTest("Equal sums in numeric columns between objects", 1, 1)
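
Mixing the DQTest NamedTuple into an Enum gives each check a stable member name (which dq_skip_check matches against) plus the description and flag fields the dq_check() loop reads. A short illustration of how DBHose consumes this enum:

from dbhose_airflow.dq_check import DQCheck

for name in DQCheck._member_names_:   # "empty", "uniq", "future", ...
    dq = DQCheck[name]
    print(name, dq.description, dq.generate_queries, dq.need_source_table)

# Skip selected checks by member name, e.g.:
# DBHose(..., dq_skip_check=["uniq", "future"])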
dbhose_airflow/dumper.py
ADDED

@@ -0,0 +1,68 @@

from __future__ import annotations
from enum import Enum
from typing import (
    NamedTuple,
    TYPE_CHECKING,
)

from airflow.hooks.base import log
from light_compressor import CompressionMethod
from native_dumper import (
    CHConnector,
    NativeDumper,
)
from native_dumper.common import DBMS_DEFAULT_TIMEOUT_SEC
from pgpack_dumper import (
    PGConnector,
    PGPackDumper,
)

if TYPE_CHECKING:
    from airflow.models import Connection


class DBHoseObject(NamedTuple):
    """DBHoseDump init params."""

    name: str
    connection: type[CHConnector] | type[PGConnector]
    dumper: type[NativeDumper] | type[PGPackDumper]

    def from_airflow(
        self,
        connection: Connection,
        compress_method: CompressionMethod = CompressionMethod.ZSTD,
        timeout: int = DBMS_DEFAULT_TIMEOUT_SEC,
    ) -> NativeDumper | PGPackDumper:
        """Init dumper from an Airflow connection object."""

        params = {
            "compression_method": compress_method,
            "logger": log,
        }

        if self.connection is CHConnector and connection.port == 9000:
            port = 8123
            params["timeout"] = timeout
        else:
            port = connection.port

        dbhose_connector = self.connection(
            connection.host,
            connection.schema,
            connection.login,
            connection.password,
            port,
        )

        return self.dumper(dbhose_connector, **params)


class DBHoseDumpParams(DBHoseObject, Enum):
    """Enum of available DBHose dumpers."""

    clickhouse = DBHoseObject("clickhouse", CHConnector, NativeDumper)
    ftp = DBHoseObject("ftp", CHConnector, NativeDumper)
    http = DBHoseObject("http", CHConnector, NativeDumper)
    postgres = DBHoseObject("postgres", PGConnector, PGPackDumper)
    greenplum = DBHoseObject("greenplum", PGConnector, PGPackDumper)
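
DBHoseDumpParams is a dispatch table keyed by Airflow conn_type. A sketch of the lookup that dbhose_dumper() performs, with a Connection built by hand purely for illustration (all values hypothetical):

from airflow.models.connection import Connection
from dbhose_airflow.dumper import DBHoseDumpParams

# ClickHouse native port 9000 is remapped to the HTTP port 8123
# inside from_airflow(); other ports pass through unchanged.
conn = Connection(
    conn_type="clickhouse",
    host="ch.example.com",
    schema="default",
    login="reader",
    password="secret",
    port=9000,
)
dumper = DBHoseDumpParams[conn.conn_type].from_airflow(conn)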
dbhose_airflow/move_method.py
ADDED

@@ -0,0 +1,20 @@

from enum import Enum
from typing import NamedTuple


class MoveType(NamedTuple):
    """Move method object."""

    name: str
    have_sql: bool
    need_filter: bool
    is_custom: bool


class MoveMethod(MoveType, Enum):
    """Insert from temp table methods."""

    append = MoveType("append", False, False, False)
    custom = MoveType("custom", False, False, True)
    delete = MoveType("delete", True, True, False)
    replace = MoveType("replace", True, False, False)
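
These flags drive the branching in DBHose.to_table(): is_custom executes custom_move verbatim, have_sql renders the method's SQL template from move/<dbname>/<method>.sql, need_filter makes filter_by mandatory, and append (all flags false) falls through to write_between(). A hedged sketch with hypothetical names:

from dbhose_airflow import DBHose, MoveMethod

# "delete" has need_filter=True, so omitting filter_by would make
# to_table() raise ValueError. ("gp_dwh" is a hypothetical connection.)
hose = DBHose(
    table_dest="dwh.events",
    connection_dest="gp_dwh",
    move_method=MoveMethod.delete,
    filter_by=["event_date"],
)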