xoverrr 1.1.4__tar.gz → 1.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xoverrr-1.1.4/src/xoverrr.egg-info → xoverrr-1.1.6}/PKG-INFO +67 -71
- {xoverrr-1.1.4 → xoverrr-1.1.6}/README.md +64 -68
- {xoverrr-1.1.4 → xoverrr-1.1.6}/pyproject.toml +10 -4
- xoverrr-1.1.6/src/xoverrr/__init__.py +13 -0
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr/adapters/__init__.py +7 -2
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr/adapters/base.py +61 -32
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr/adapters/clickhouse.py +64 -35
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr/adapters/oracle.py +67 -38
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr/adapters/postgres.py +67 -35
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr/constants.py +4 -4
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr/core.py +299 -197
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr/exceptions.py +8 -1
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr/logger.py +4 -2
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr/models.py +11 -5
- xoverrr-1.1.6/src/xoverrr/utils.py +740 -0
- {xoverrr-1.1.4 → xoverrr-1.1.6/src/xoverrr.egg-info}/PKG-INFO +67 -71
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr.egg-info/requires.txt +2 -2
- xoverrr-1.1.4/src/xoverrr/__init__.py +0 -17
- xoverrr-1.1.4/src/xoverrr/utils.py +0 -668
- {xoverrr-1.1.4 → xoverrr-1.1.6}/LICENSE +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.6}/setup.cfg +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr.egg-info/SOURCES.txt +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr.egg-info/dependency_links.txt +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.6}/src/xoverrr.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xoverrr
|
|
3
|
-
Version: 1.1.4
|
|
3
|
+
Version: 1.1.6
|
|
4
4
|
Summary: A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
|
|
5
5
|
Author-email: Dmitry Ischenko <hotmori@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -21,7 +21,7 @@ Requires-Dist: clickhouse-sqlalchemy>=0.2.0
|
|
|
21
21
|
Provides-Extra: dev
|
|
22
22
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
23
23
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
24
|
-
Requires-Dist:
|
|
24
|
+
Requires-Dist: ruff>=0.15.0; extra == "dev"
|
|
25
25
|
Requires-Dist: isort>=5.12.0; extra == "dev"
|
|
26
26
|
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
27
27
|
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
@@ -31,7 +31,7 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
|
|
|
31
31
|
Requires-Dist: pytest-cov>=4.0.0; extra == "test"
|
|
32
32
|
Requires-Dist: tenacity>=8.2.0; extra == "test"
|
|
33
33
|
Provides-Extra: lint
|
|
34
|
-
Requires-Dist:
|
|
34
|
+
Requires-Dist: ruff>=0.15.0; extra == "lint"
|
|
35
35
|
Requires-Dist: isort>=5.12.0; extra == "lint"
|
|
36
36
|
Requires-Dist: flake8>=6.0.0; extra == "lint"
|
|
37
37
|
Dynamic: license-file
|
|
@@ -40,6 +40,70 @@ Dynamic: license-file
|
|
|
40
40
|
|
|
41
41
|
A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
|
|
42
42
|
|
|
43
|
+
## Usage Example
|
|
44
|
+
**Sample comparison** (Greenplum vs Oracle):
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
|
|
48
|
+
import os
|
|
49
|
+
from datetime import date, timedelta
|
|
50
|
+
|
|
51
|
+
USER_ORA = os.getenv('USER_ORA', '')
|
|
52
|
+
PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
|
|
53
|
+
|
|
54
|
+
USER_GP = os.getenv('USER_GP', '')
|
|
55
|
+
PASSWORD_GP = os.getenv('PASSWORD_GP', '')
|
|
56
|
+
|
|
57
|
+
HOST_ORA = os.getenv('HOST_ORA', '')
|
|
58
|
+
HOST_GP = os.getenv('HOST_GP', '')
|
|
59
|
+
|
|
60
|
+
def create_src_engine(user, password, host):
|
|
61
|
+
"""Source engine (Oracle)"""
|
|
62
|
+
os.environ['NLS_LANG'] = '.AL32UTF8'
|
|
63
|
+
return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
|
|
64
|
+
|
|
65
|
+
def create_trg_engine(user, password, host):
|
|
66
|
+
"""Target engine (Postgres/Greenplum)"""
|
|
67
|
+
connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
|
|
68
|
+
engine = create_engine(connection_string)
|
|
69
|
+
return engine
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST_ORA)
|
|
73
|
+
trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST_GP)
|
|
74
|
+
|
|
75
|
+
comparator = DataQualityComparator(
|
|
76
|
+
source_engine=src_engine,
|
|
77
|
+
target_engine=trg_engine,
|
|
78
|
+
timezone='Europe/Athens'
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
source = DataReference("users", "schema1")
|
|
82
|
+
target = DataReference("users", "schema2")
|
|
83
|
+
|
|
84
|
+
FORMAT = '%Y-%m-%d'
|
|
85
|
+
recent_range_end = date.today()
|
|
86
|
+
recent_range_begin = recent_range_end - timedelta(days=1)
|
|
87
|
+
|
|
88
|
+
status, report, stats, details = comparator.compare_sample(
|
|
89
|
+
source,
|
|
90
|
+
target,
|
|
91
|
+
date_column="created_at",
|
|
92
|
+
update_column="modified_date",
|
|
93
|
+
exclude_columns=["audit_timestamp", "internal_id"],
|
|
94
|
+
exclude_recent_hours=3,
|
|
95
|
+
date_range=(
|
|
96
|
+
recent_range_begin.strftime(FORMAT),
|
|
97
|
+
recent_range_end.strftime(FORMAT)
|
|
98
|
+
),
|
|
99
|
+
tolerance_percentage=0
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
print(report)
|
|
103
|
+
if status == COMPARISON_FAILED:
|
|
104
|
+
raise Exception("Sample check failed")
|
|
105
|
+
```
|
|
106
|
+
|
|
43
107
|
## Key Features
|
|
44
108
|
- **Multi‑DBMS support**: Oracle, PostgreSQL (+ Greenplum), ClickHouse (extensible via adapter layer) — tables and views.
|
|
45
109
|
- **Universal connections**: Provide SQLAlchemy Engine objects for source and target databases.
|
|
@@ -273,71 +337,3 @@ Logs include timing information and structured context:
|
|
|
273
337
|
- If `final_diff_score ≤ tolerance`: status = `COMPARISON_SUCCESS`
|
|
274
338
|
- Enables configuration of acceptable discrepancy levels.
|
|
275
339
|
|
|
276
|
-
---
|
|
277
|
-
|
|
278
|
-
## Usage Example
|
|
279
|
-
**Sample comparison** (Greenplum vs Oracle):
|
|
280
|
-
|
|
281
|
-
```python
|
|
282
|
-
from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
|
|
283
|
-
import os
|
|
284
|
-
from datetime import date, timedelta
|
|
285
|
-
|
|
286
|
-
USER_ORA = os.getenv('USER_ORA', '')
|
|
287
|
-
PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
|
|
288
|
-
|
|
289
|
-
USER_GP = os.getenv('USER_GP', '')
|
|
290
|
-
PASSWORD_GP = os.getenv('PASSWORD_GP', '')
|
|
291
|
-
|
|
292
|
-
HOST = os.getenv('HOST', '')
|
|
293
|
-
|
|
294
|
-
def create_src_engine(user, password, host):
|
|
295
|
-
"""Source engine (Oracle)"""
|
|
296
|
-
os.environ['NLS_LANG'] = '.AL32UTF8'
|
|
297
|
-
return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
|
|
298
|
-
|
|
299
|
-
def create_trg_engine(user, password, host):
|
|
300
|
-
"""Target engine (Postgres/Greenplum)"""
|
|
301
|
-
connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
|
|
302
|
-
engine = create_engine(connection_string)
|
|
303
|
-
return engine
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST)
|
|
308
|
-
trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST)
|
|
309
|
-
|
|
310
|
-
comparator = DataQualityComparator(
|
|
311
|
-
source_engine=src_engine,
|
|
312
|
-
target_engine=trg_engine,
|
|
313
|
-
timezone='Asia/Yekaterinburg'
|
|
314
|
-
)
|
|
315
|
-
|
|
316
|
-
source = DataReference("users", "schema1")
|
|
317
|
-
target = DataReference("users", "schema2")
|
|
318
|
-
|
|
319
|
-
FORMAT = '%Y-%m-%d'
|
|
320
|
-
recent_range_end = date.today()
|
|
321
|
-
recent_range_begin = recent_range_end - timedelta(days=1)
|
|
322
|
-
|
|
323
|
-
status, report, stats, details = comparator.compare_sample(
|
|
324
|
-
source,
|
|
325
|
-
target,
|
|
326
|
-
date_column="created_at",
|
|
327
|
-
update_column="modified_date",
|
|
328
|
-
exclude_columns=["audit_timestamp", "internal_id"],
|
|
329
|
-
exclude_recent_hours=24,
|
|
330
|
-
date_range=(
|
|
331
|
-
recent_range_begin.strftime(FORMAT),
|
|
332
|
-
recent_range_end.strftime(FORMAT)
|
|
333
|
-
),
|
|
334
|
-
tolerance_percentage=0
|
|
335
|
-
)
|
|
336
|
-
|
|
337
|
-
print(report)
|
|
338
|
-
if status == COMPARISON_FAILED:
|
|
339
|
-
raise Exception("Sample check failed")
|
|
340
|
-
```
|
|
341
|
-
|
|
342
|
-
---
|
|
343
|
-
|
|
@@ -2,6 +2,70 @@
|
|
|
2
2
|
|
|
3
3
|
A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
|
|
4
4
|
|
|
5
|
+
## Usage Example
|
|
6
|
+
**Sample comparison** (Greenplum vs Oracle):
|
|
7
|
+
|
|
8
|
+
```python
|
|
9
|
+
from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
|
|
10
|
+
import os
|
|
11
|
+
from datetime import date, timedelta
|
|
12
|
+
|
|
13
|
+
USER_ORA = os.getenv('USER_ORA', '')
|
|
14
|
+
PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
|
|
15
|
+
|
|
16
|
+
USER_GP = os.getenv('USER_GP', '')
|
|
17
|
+
PASSWORD_GP = os.getenv('PASSWORD_GP', '')
|
|
18
|
+
|
|
19
|
+
HOST_ORA = os.getenv('HOST_ORA', '')
|
|
20
|
+
HOST_GP = os.getenv('HOST_GP', '')
|
|
21
|
+
|
|
22
|
+
def create_src_engine(user, password, host):
|
|
23
|
+
"""Source engine (Oracle)"""
|
|
24
|
+
os.environ['NLS_LANG'] = '.AL32UTF8'
|
|
25
|
+
return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
|
|
26
|
+
|
|
27
|
+
def create_trg_engine(user, password, host):
|
|
28
|
+
"""Target engine (Postgres/Greenplum)"""
|
|
29
|
+
connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
|
|
30
|
+
engine = create_engine(connection_string)
|
|
31
|
+
return engine
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST_ORA)
|
|
35
|
+
trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST_GP)
|
|
36
|
+
|
|
37
|
+
comparator = DataQualityComparator(
|
|
38
|
+
source_engine=src_engine,
|
|
39
|
+
target_engine=trg_engine,
|
|
40
|
+
timezone='Europe/Athens'
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
source = DataReference("users", "schema1")
|
|
44
|
+
target = DataReference("users", "schema2")
|
|
45
|
+
|
|
46
|
+
FORMAT = '%Y-%m-%d'
|
|
47
|
+
recent_range_end = date.today()
|
|
48
|
+
recent_range_begin = recent_range_end - timedelta(days=1)
|
|
49
|
+
|
|
50
|
+
status, report, stats, details = comparator.compare_sample(
|
|
51
|
+
source,
|
|
52
|
+
target,
|
|
53
|
+
date_column="created_at",
|
|
54
|
+
update_column="modified_date",
|
|
55
|
+
exclude_columns=["audit_timestamp", "internal_id"],
|
|
56
|
+
exclude_recent_hours=3,
|
|
57
|
+
date_range=(
|
|
58
|
+
recent_range_begin.strftime(FORMAT),
|
|
59
|
+
recent_range_end.strftime(FORMAT)
|
|
60
|
+
),
|
|
61
|
+
tolerance_percentage=0
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
print(report)
|
|
65
|
+
if status == COMPARISON_FAILED:
|
|
66
|
+
raise Exception("Sample check failed")
|
|
67
|
+
```
|
|
68
|
+
|
|
5
69
|
## Key Features
|
|
6
70
|
- **Multi‑DBMS support**: Oracle, PostgreSQL (+ Greenplum), ClickHouse (extensible via adapter layer) — tables and views.
|
|
7
71
|
- **Universal connections**: Provide SQLAlchemy Engine objects for source and target databases.
|
|
@@ -235,71 +299,3 @@ Logs include timing information and structured context:
|
|
|
235
299
|
- If `final_diff_score ≤ tolerance`: status = `COMPARISON_SUCCESS`
|
|
236
300
|
- Enables configuration of acceptable discrepancy levels.
|
|
237
301
|
|
|
238
|
-
---
|
|
239
|
-
|
|
240
|
-
## Usage Example
|
|
241
|
-
**Sample comparison** (Greenplum vs Oracle):
|
|
242
|
-
|
|
243
|
-
```python
|
|
244
|
-
from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
|
|
245
|
-
import os
|
|
246
|
-
from datetime import date, timedelta
|
|
247
|
-
|
|
248
|
-
USER_ORA = os.getenv('USER_ORA', '')
|
|
249
|
-
PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
|
|
250
|
-
|
|
251
|
-
USER_GP = os.getenv('USER_GP', '')
|
|
252
|
-
PASSWORD_GP = os.getenv('PASSWORD_GP', '')
|
|
253
|
-
|
|
254
|
-
HOST = os.getenv('HOST', '')
|
|
255
|
-
|
|
256
|
-
def create_src_engine(user, password, host):
|
|
257
|
-
"""Source engine (Oracle)"""
|
|
258
|
-
os.environ['NLS_LANG'] = '.AL32UTF8'
|
|
259
|
-
return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
|
|
260
|
-
|
|
261
|
-
def create_trg_engine(user, password, host):
|
|
262
|
-
"""Target engine (Postgres/Greenplum)"""
|
|
263
|
-
connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
|
|
264
|
-
engine = create_engine(connection_string)
|
|
265
|
-
return engine
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST)
|
|
270
|
-
trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST)
|
|
271
|
-
|
|
272
|
-
comparator = DataQualityComparator(
|
|
273
|
-
source_engine=src_engine,
|
|
274
|
-
target_engine=trg_engine,
|
|
275
|
-
timezone='Asia/Yekaterinburg'
|
|
276
|
-
)
|
|
277
|
-
|
|
278
|
-
source = DataReference("users", "schema1")
|
|
279
|
-
target = DataReference("users", "schema2")
|
|
280
|
-
|
|
281
|
-
FORMAT = '%Y-%m-%d'
|
|
282
|
-
recent_range_end = date.today()
|
|
283
|
-
recent_range_begin = recent_range_end - timedelta(days=1)
|
|
284
|
-
|
|
285
|
-
status, report, stats, details = comparator.compare_sample(
|
|
286
|
-
source,
|
|
287
|
-
target,
|
|
288
|
-
date_column="created_at",
|
|
289
|
-
update_column="modified_date",
|
|
290
|
-
exclude_columns=["audit_timestamp", "internal_id"],
|
|
291
|
-
exclude_recent_hours=24,
|
|
292
|
-
date_range=(
|
|
293
|
-
recent_range_begin.strftime(FORMAT),
|
|
294
|
-
recent_range_end.strftime(FORMAT)
|
|
295
|
-
),
|
|
296
|
-
tolerance_percentage=0
|
|
297
|
-
)
|
|
298
|
-
|
|
299
|
-
print(report)
|
|
300
|
-
if status == COMPARISON_FAILED:
|
|
301
|
-
raise Exception("Sample check failed")
|
|
302
|
-
```
|
|
303
|
-
|
|
304
|
-
---
|
|
305
|
-
|
|
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "xoverrr"
|
|
8
|
-
version = "1.1.4"
|
|
8
|
+
version = "1.1.6"
|
|
9
9
|
description = "A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting."
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
requires-python = ">=3.9"
|
|
@@ -36,7 +36,7 @@ Homepage = "https://github.com/dima-ischenko/xoverrr"
|
|
|
36
36
|
dev = [
|
|
37
37
|
"pytest>=7.0.0",
|
|
38
38
|
"pytest-cov>=4.0.0",
|
|
39
|
-
"
|
|
39
|
+
"ruff>=0.15.0",
|
|
40
40
|
"isort>=5.12.0",
|
|
41
41
|
"mypy>=1.0.0",
|
|
42
42
|
"pre-commit>=3.0.0",
|
|
@@ -48,7 +48,7 @@ test = [
|
|
|
48
48
|
"tenacity>=8.2.0"
|
|
49
49
|
]
|
|
50
50
|
lint = [
|
|
51
|
-
"
|
|
51
|
+
"ruff>=0.15.0",
|
|
52
52
|
"isort>=5.12.0",
|
|
53
53
|
"flake8>=6.0.0",
|
|
54
54
|
]
|
|
@@ -59,4 +59,10 @@ where = ["src"]
|
|
|
59
59
|
[tool.pytest.ini_options]
|
|
60
60
|
pythonpath = ["src"]
|
|
61
61
|
testpaths = ["tests"]
|
|
62
|
-
addopts = "-v"
|
|
62
|
+
addopts = "-v"
|
|
63
|
+
|
|
64
|
+
[tool.ruff]
|
|
65
|
+
target-version = "py39"
|
|
66
|
+
|
|
67
|
+
[tool.ruff.format]
|
|
68
|
+
quote-style = "single"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from .constants import (COMPARISON_FAILED, COMPARISON_SKIPPED,
|
|
2
|
+
COMPARISON_SUCCESS)
|
|
3
|
+
from .core import DataQualityComparator, DataReference
|
|
4
|
+
|
|
5
|
+
__all__ = [
|
|
6
|
+
'DataQualityComparator',
|
|
7
|
+
'DataReference',
|
|
8
|
+
'COMPARISON_SUCCESS',
|
|
9
|
+
'COMPARISON_FAILED',
|
|
10
|
+
'COMPARISON_SKIPPED',
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
__version__ = '1.1.6'
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
from .base import BaseDatabaseAdapter
|
|
2
|
+
from .clickhouse import ClickHouseAdapter
|
|
2
3
|
from .oracle import OracleAdapter
|
|
3
4
|
from .postgres import PostgresAdapter
|
|
4
|
-
from .clickhouse import ClickHouseAdapter
|
|
5
5
|
|
|
6
|
-
__all__ = [
|
|
6
|
+
__all__ = [
|
|
7
|
+
'BaseDatabaseAdapter',
|
|
8
|
+
'OracleAdapter',
|
|
9
|
+
'PostgresAdapter',
|
|
10
|
+
'ClickHouseAdapter',
|
|
11
|
+
]
|
|
@@ -1,18 +1,23 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
import pandas as pd
|
|
3
|
-
from typing import Dict, Callable, List, Tuple, Optional, Union
|
|
4
1
|
import re
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
5
3
|
from datetime import datetime, timedelta
|
|
6
|
-
from
|
|
7
|
-
|
|
4
|
+
from typing import Callable, Dict, List, Optional, Tuple, Union
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
8
7
|
from sqlalchemy.engine import Engine
|
|
8
|
+
|
|
9
|
+
from ..constants import RESERVED_WORDS
|
|
9
10
|
from ..logger import app_logger
|
|
10
|
-
from ..
|
|
11
|
+
from ..models import DataReference, ObjectType
|
|
12
|
+
|
|
11
13
|
|
|
12
14
|
class BaseDatabaseAdapter(ABC):
|
|
13
15
|
"""Abstract base class with updated method signatures for parameterized queries"""
|
|
16
|
+
|
|
14
17
|
@abstractmethod
|
|
15
|
-
def _execute_query(
|
|
18
|
+
def _execute_query(
|
|
19
|
+
self, query: Union[str, Tuple[str, Dict]], engine: Engine, timezone: str
|
|
20
|
+
) -> pd.DataFrame:
|
|
16
21
|
"""Execute query with DBMS-specific optimizations"""
|
|
17
22
|
pass
|
|
18
23
|
|
|
@@ -30,42 +35,66 @@ class BaseDatabaseAdapter(ABC):
|
|
|
30
35
|
pass
|
|
31
36
|
|
|
32
37
|
@abstractmethod
|
|
33
|
-
def build_count_query(
|
|
34
|
-
|
|
35
|
-
|
|
38
|
+
def build_count_query(
|
|
39
|
+
self,
|
|
40
|
+
data_ref: DataReference,
|
|
41
|
+
date_column: str,
|
|
42
|
+
start_date: Optional[str],
|
|
43
|
+
end_date: Optional[str],
|
|
44
|
+
) -> Tuple[str, Dict]:
|
|
36
45
|
"""Returns tuple of (query, params) with recent data exclusion"""
|
|
37
46
|
pass
|
|
38
47
|
|
|
39
|
-
def build_data_query_common(
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
48
|
+
def build_data_query_common(
|
|
49
|
+
self,
|
|
50
|
+
data_ref: DataReference,
|
|
51
|
+
columns: List[str],
|
|
52
|
+
date_column: Optional[str],
|
|
53
|
+
update_column: Optional[str],
|
|
54
|
+
start_date: Optional[str],
|
|
55
|
+
end_date: Optional[str],
|
|
56
|
+
exclude_recent_hours: Optional[int] = None,
|
|
57
|
+
) -> Tuple[str, Dict]:
|
|
43
58
|
"""Build data query for the DBMS with recent data exclusion"""
|
|
44
59
|
# Handle reserved words
|
|
45
60
|
cols_select = [
|
|
46
|
-
f'"{col}"' if col.lower() in RESERVED_WORDS
|
|
47
|
-
else col
|
|
48
|
-
for col in columns
|
|
61
|
+
f'"{col}"' if col.lower() in RESERVED_WORDS else col for col in columns
|
|
49
62
|
]
|
|
50
63
|
|
|
51
|
-
result = self.build_data_query(
|
|
52
|
-
|
|
64
|
+
result = self.build_data_query(
|
|
65
|
+
data_ref,
|
|
66
|
+
cols_select,
|
|
67
|
+
date_column,
|
|
68
|
+
update_column,
|
|
69
|
+
start_date,
|
|
70
|
+
end_date,
|
|
71
|
+
exclude_recent_hours,
|
|
72
|
+
)
|
|
53
73
|
return result
|
|
54
74
|
|
|
55
75
|
@abstractmethod
|
|
56
|
-
def build_data_query(
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
76
|
+
def build_data_query(
|
|
77
|
+
self,
|
|
78
|
+
data_ref: DataReference,
|
|
79
|
+
columns: List[str],
|
|
80
|
+
date_column: Optional[str],
|
|
81
|
+
update_column: Optional[str],
|
|
82
|
+
start_date: Optional[str],
|
|
83
|
+
end_date: Optional[str],
|
|
84
|
+
exclude_recent_hours: Optional[int] = None,
|
|
85
|
+
) -> Tuple[str, Dict]:
|
|
60
86
|
pass
|
|
61
87
|
|
|
62
88
|
@abstractmethod
|
|
63
|
-
def _build_exclusion_condition(
|
|
64
|
-
|
|
89
|
+
def _build_exclusion_condition(
|
|
90
|
+
self, update_column: str, exclude_recent_hours: int
|
|
91
|
+
) -> Tuple[str, Dict]:
|
|
65
92
|
"""DBMS-specific implementation for recent data exclusion"""
|
|
66
93
|
pass
|
|
67
94
|
|
|
68
|
-
def convert_types(
|
|
95
|
+
def convert_types(
|
|
96
|
+
self, df: pd.DataFrame, metadata: pd.DataFrame, timezone: str
|
|
97
|
+
) -> pd.DataFrame:
|
|
69
98
|
"""Convert DBMS-specific types to standardized formats"""
|
|
70
99
|
# there is a need to specify timezone for conversion as
|
|
71
100
|
# pandas implicitly converts to UTC tz aware cols
|
|
@@ -78,8 +107,9 @@ class BaseDatabaseAdapter(ABC):
|
|
|
78
107
|
"""Get type conversion rules for specific DBMS"""
|
|
79
108
|
pass
|
|
80
109
|
|
|
81
|
-
def _apply_type_conversion(
|
|
82
|
-
|
|
110
|
+
def _apply_type_conversion(
|
|
111
|
+
self, df: pd.DataFrame, metadata: pd.DataFrame, type_rules: Dict[str, Callable]
|
|
112
|
+
) -> pd.DataFrame:
|
|
83
113
|
"""Apply type conversion rules to DataFrame"""
|
|
84
114
|
if df.empty:
|
|
85
115
|
return df
|
|
@@ -94,7 +124,6 @@ class BaseDatabaseAdapter(ABC):
|
|
|
94
124
|
if col_name not in df.columns:
|
|
95
125
|
continue
|
|
96
126
|
|
|
97
|
-
|
|
98
127
|
col_type = col_info['data_type'].lower()
|
|
99
128
|
# Find matching conversion rule
|
|
100
129
|
converter = None
|
|
@@ -105,15 +134,15 @@ class BaseDatabaseAdapter(ABC):
|
|
|
105
134
|
break
|
|
106
135
|
|
|
107
136
|
if converter is None:
|
|
108
|
-
continue
|
|
137
|
+
continue # Skip columns without converters
|
|
109
138
|
|
|
110
139
|
try:
|
|
111
140
|
df[col_name] = converter(df[col_name])
|
|
112
141
|
except Exception as e:
|
|
113
|
-
app_logger.warning(f
|
|
142
|
+
app_logger.warning(f'Type conversion failed for {col_name}: {str(e)}')
|
|
114
143
|
df[col_name] = df[col_name].astype(str)
|
|
115
144
|
|
|
116
145
|
new_type = df[col_name].dtype
|
|
117
146
|
app_logger.debug(f'old: {col_type}, new: {new_type}')
|
|
118
147
|
|
|
119
|
-
return df
|
|
148
|
+
return df
|