xoverrr 1.1.5.tar.gz → 1.1.6.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xoverrr-1.1.5/src/xoverrr.egg-info → xoverrr-1.1.6}/PKG-INFO +3 -3
- {xoverrr-1.1.5 → xoverrr-1.1.6}/pyproject.toml +10 -4
- xoverrr-1.1.6/src/xoverrr/__init__.py +13 -0
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/adapters/__init__.py +7 -2
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/adapters/base.py +61 -32
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/adapters/clickhouse.py +62 -37
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/adapters/oracle.py +65 -36
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/adapters/postgres.py +67 -35
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/constants.py +4 -4
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/core.py +296 -198
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/exceptions.py +8 -1
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/logger.py +4 -2
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/models.py +11 -5
- xoverrr-1.1.6/src/xoverrr/utils.py +740 -0
- {xoverrr-1.1.5 → xoverrr-1.1.6/src/xoverrr.egg-info}/PKG-INFO +3 -3
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr.egg-info/requires.txt +2 -2
- xoverrr-1.1.5/src/xoverrr/__init__.py +0 -17
- xoverrr-1.1.5/src/xoverrr/utils.py +0 -664
- {xoverrr-1.1.5 → xoverrr-1.1.6}/LICENSE +0 -0
- {xoverrr-1.1.5 → xoverrr-1.1.6}/README.md +0 -0
- {xoverrr-1.1.5 → xoverrr-1.1.6}/setup.cfg +0 -0
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr.egg-info/SOURCES.txt +0 -0
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr.egg-info/dependency_links.txt +0 -0
- {xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr.egg-info/top_level.txt +0 -0
{xoverrr-1.1.5/src/xoverrr.egg-info → xoverrr-1.1.6}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xoverrr
-Version: 1.1.5
+Version: 1.1.6
 Summary: A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
 Author-email: Dmitry Ischenko <hotmori@gmail.com>
 License: MIT
@@ -21,7 +21,7 @@ Requires-Dist: clickhouse-sqlalchemy>=0.2.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
-Requires-Dist:
+Requires-Dist: ruff>=0.15.0; extra == "dev"
 Requires-Dist: isort>=5.12.0; extra == "dev"
 Requires-Dist: mypy>=1.0.0; extra == "dev"
 Requires-Dist: pre-commit>=3.0.0; extra == "dev"
@@ -31,7 +31,7 @@ Requires-Dist: pytest>=7.0.0; extra == "test"
 Requires-Dist: pytest-cov>=4.0.0; extra == "test"
 Requires-Dist: tenacity>=8.2.0; extra == "test"
 Provides-Extra: lint
-Requires-Dist:
+Requires-Dist: ruff>=0.15.0; extra == "lint"
 Requires-Dist: isort>=5.12.0; extra == "lint"
 Requires-Dist: flake8>=6.0.0; extra == "lint"
 Dynamic: license-file
```
{xoverrr-1.1.5 → xoverrr-1.1.6}/pyproject.toml

```diff
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "xoverrr"
-version = "1.1.5"
+version = "1.1.6"
 description = "A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting."
 readme = "README.md"
 requires-python = ">=3.9"
@@ -36,7 +36,7 @@ Homepage = "https://github.com/dima-ischenko/xoverrr"
 dev = [
     "pytest>=7.0.0",
     "pytest-cov>=4.0.0",
-    "
+    "ruff>=0.15.0",
     "isort>=5.12.0",
     "mypy>=1.0.0",
     "pre-commit>=3.0.0",
@@ -48,7 +48,7 @@ test = [
     "tenacity>=8.2.0"
 ]
 lint = [
-    "
+    "ruff>=0.15.0",
     "isort>=5.12.0",
     "flake8>=6.0.0",
 ]
@@ -59,4 +59,10 @@ where = ["src"]
 [tool.pytest.ini_options]
 pythonpath = ["src"]
 testpaths = ["tests"]
-addopts = "-v"
+addopts = "-v"
+
+[tool.ruff]
+target-version = "py39"
+
+[tool.ruff.format]
+quote-style = "single"
```
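The new `[tool.ruff]` tables pin the formatter to the package's minimum Python and to single quotes, which is why the reformatted adapter code below consistently uses single-quoted strings. A small sketch of reading that config back out; `tomllib` is stdlib only on Python 3.11+, an assumption beyond the package's own 3.9 floor (the `tomli` backport offers the same API):

```python
# Sketch: inspect the new ruff settings in pyproject.toml (Python 3.11+).
import tomllib

with open('pyproject.toml', 'rb') as f:
    config = tomllib.load(f)

print(config['tool']['ruff']['target-version'])         # 'py39'
print(config['tool']['ruff']['format']['quote-style'])  # 'single'
```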
xoverrr-1.1.6/src/xoverrr/__init__.py

```diff
@@ -0,0 +1,13 @@
+from .constants import (COMPARISON_FAILED, COMPARISON_SKIPPED,
+                        COMPARISON_SUCCESS)
+from .core import DataQualityComparator, DataReference
+
+__all__ = [
+    'DataQualityComparator',
+    'DataReference',
+    'COMPARISON_SUCCESS',
+    'COMPARISON_FAILED',
+    'COMPARISON_SKIPPED',
+]
+
+__version__ = '1.1.6'
```
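The new top-level `__init__.py` re-exports the comparator, the reference model, and the status constants, so callers no longer need to reach into submodules. A minimal sketch using only the names visible in the hunk above (everything else about the API sits in core.py, outside this section of the diff):

```python
# Sketch only: uses just the names re-exported by the new __init__.py.
import xoverrr
from xoverrr import (COMPARISON_FAILED, COMPARISON_SKIPPED, COMPARISON_SUCCESS,
                     DataQualityComparator, DataReference)

print(xoverrr.__version__)  # '1.1.6'

# A comparison outcome would presumably be matched against the constants,
# e.g. `status == COMPARISON_SUCCESS`; the DataQualityComparator constructor
# is defined in core.py and is not shown in this diff.
```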
{xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/adapters/__init__.py

```diff
@@ -1,6 +1,11 @@
 from .base import BaseDatabaseAdapter
+from .clickhouse import ClickHouseAdapter
 from .oracle import OracleAdapter
 from .postgres import PostgresAdapter
-from .clickhouse import ClickHouseAdapter
 
-__all__ = [
+__all__ = [
+    'BaseDatabaseAdapter',
+    'OracleAdapter',
+    'PostgresAdapter',
+    'ClickHouseAdapter',
+]
```
{xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/adapters/base.py

```diff
@@ -1,18 +1,23 @@
-from abc import ABC, abstractmethod
-import pandas as pd
-from typing import Dict, Callable, List, Tuple, Optional, Union
 import re
+from abc import ABC, abstractmethod
 from datetime import datetime, timedelta
-from
-
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import pandas as pd
 from sqlalchemy.engine import Engine
+
+from ..constants import RESERVED_WORDS
 from ..logger import app_logger
-from ..
+from ..models import DataReference, ObjectType
+
 
 class BaseDatabaseAdapter(ABC):
     """Abstract base class with updated method signatures for parameterized queries"""
+
     @abstractmethod
-    def _execute_query(
+    def _execute_query(
+        self, query: Union[str, Tuple[str, Dict]], engine: Engine, timezone: str
+    ) -> pd.DataFrame:
         """Execute query with DBMS-specific optimizations"""
         pass
 
@@ -30,42 +35,66 @@ class BaseDatabaseAdapter(ABC):
         pass
 
     @abstractmethod
-    def build_count_query(
-
-
+    def build_count_query(
+        self,
+        data_ref: DataReference,
+        date_column: str,
+        start_date: Optional[str],
+        end_date: Optional[str],
+    ) -> Tuple[str, Dict]:
         """Returns tuple of (query, params) with recent data exclusion"""
         pass
 
-    def build_data_query_common(
-
-
-
+    def build_data_query_common(
+        self,
+        data_ref: DataReference,
+        columns: List[str],
+        date_column: Optional[str],
+        update_column: Optional[str],
+        start_date: Optional[str],
+        end_date: Optional[str],
+        exclude_recent_hours: Optional[int] = None,
+    ) -> Tuple[str, Dict]:
         """Build data query for the DBMS with recent data exclusion"""
         # Handle reserved words
         cols_select = [
-            f'"{col}"' if col.lower() in RESERVED_WORDS
-            else col
-            for col in columns
+            f'"{col}"' if col.lower() in RESERVED_WORDS else col for col in columns
         ]
 
-        result = self.build_data_query(
-
+        result = self.build_data_query(
+            data_ref,
+            cols_select,
+            date_column,
+            update_column,
+            start_date,
+            end_date,
+            exclude_recent_hours,
+        )
         return result
 
     @abstractmethod
-    def build_data_query(
-
-
-
+    def build_data_query(
+        self,
+        data_ref: DataReference,
+        columns: List[str],
+        date_column: Optional[str],
+        update_column: Optional[str],
+        start_date: Optional[str],
+        end_date: Optional[str],
+        exclude_recent_hours: Optional[int] = None,
+    ) -> Tuple[str, Dict]:
         pass
 
     @abstractmethod
-    def _build_exclusion_condition(
-
+    def _build_exclusion_condition(
+        self, update_column: str, exclude_recent_hours: int
+    ) -> Tuple[str, Dict]:
         """DBMS-specific implementation for recent data exclusion"""
         pass
 
-    def convert_types(
+    def convert_types(
+        self, df: pd.DataFrame, metadata: pd.DataFrame, timezone: str
+    ) -> pd.DataFrame:
         """Convert DBMS-specific types to standardized formats"""
         # there is need to specify timezone for covnersion as
         # pandas implicitly converts to UTC tz aware cols
@@ -78,8 +107,9 @@ class BaseDatabaseAdapter(ABC):
         """Get type conversion rules for specific DBMS"""
         pass
 
-    def _apply_type_conversion(
-
+    def _apply_type_conversion(
+        self, df: pd.DataFrame, metadata: pd.DataFrame, type_rules: Dict[str, Callable]
+    ) -> pd.DataFrame:
         """Apply type conversion rules to DataFrame"""
         if df.empty:
             return df
@@ -94,7 +124,6 @@ class BaseDatabaseAdapter(ABC):
             if col_name not in df.columns:
                 continue
 
-
             col_type = col_info['data_type'].lower()
             # Find matching conversion rule
             converter = None
@@ -105,15 +134,15 @@ class BaseDatabaseAdapter(ABC):
                     break
 
             if converter is None:
-                continue
+                continue  # Skip columns without converters
 
             try:
                 df[col_name] = converter(df[col_name])
             except Exception as e:
-                app_logger.warning(f
+                app_logger.warning(f'Type conversion failed for {col_name}: {str(e)}')
                 df[col_name] = df[col_name].astype(str)
 
             new_type = df[col_name].dtype
             app_logger.debug(f'old: {col_type}, new: {new_type}')
 
-        return df
+        return df
```
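`_apply_type_conversion` dispatches on regex-keyed rules: each column's database type string is matched against the rule patterns, the first hit supplies the converter, and a failed conversion falls back to `str`. A standalone illustration of that pattern (the sample frame, metadata, and rule set are invented for the demo):

```python
# Standalone illustration of regex-keyed type conversion (invented sample data).
import re
import pandas as pd

type_rules = {
    r'date|timestamp': lambda s: pd.to_datetime(s, errors='coerce').dt.strftime('%Y-%m-%d'),
    r'number|float': lambda s: s.astype(str).str.replace(r'\.0+$', '', regex=True),
}

df = pd.DataFrame({'created': ['2024-01-02', '2024-03-04'], 'amount': [1.0, 2.5]})
metadata = {'created': 'DATE', 'amount': 'NUMBER'}  # column -> DB type

for col, db_type in metadata.items():
    converter = next(
        (fn for pattern, fn in type_rules.items() if re.search(pattern, db_type.lower())),
        None,
    )
    if converter is None:
        continue  # no rule for this type, leave the column untouched
    try:
        df[col] = converter(df[col])
    except Exception:
        df[col] = df[col].astype(str)  # fallback mirrors the adapter's behaviour

print(df)  # created -> '2024-01-02', amount -> '1' / '2.5'
```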
{xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/adapters/clickhouse.py

```diff
@@ -1,15 +1,21 @@
+import time
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
 import pandas as pd
-
+
 from ..constants import DATE_FORMAT, DATETIME_FORMAT
-from .base import BaseDatabaseAdapter, Engine
-from ..models import DataReference, ObjectType
 from ..exceptions import QueryExecutionError
-import time
 from ..logger import app_logger
+from ..models import DataReference, ObjectType
+from .base import BaseDatabaseAdapter, Engine
+
 
 class ClickHouseAdapter(BaseDatabaseAdapter):
     """ClickHouse adapter with parameterized queries"""
-
+
+    def _execute_query(
+        self, query: Union[str, Tuple[str, Dict]], engine: Engine, timezone: str
+    ) -> pd.DataFrame:
         df = None
         tz_set = None
         start_time = time.time()
@@ -32,14 +38,16 @@ class ClickHouseAdapter(BaseDatabaseAdapter):
             df = pd.read_sql(query, engine)
 
             execution_time = time.time() - start_time
-            app_logger.info(f
+            app_logger.info(f'Query executed in {execution_time:.2f}s')
             return df
 
         except Exception as e:
             execution_time = time.time() - start_time
-            app_logger.error(
+            app_logger.error(
+                f'Query execution failed after {execution_time:.2f}s: {str(e)}'
+            )
 
-            raise QueryExecutionError(f
+            raise QueryExecutionError(f'Query failed: {str(e)}')
 
     def get_object_type(self, data_ref: DataReference, engine: Engine) -> ObjectType:
         """Determine if object is table or view in ClickHouse"""
@@ -67,7 +75,9 @@ class ClickHouseAdapter(BaseDatabaseAdapter):
             else:
                 return ObjectType.TABLE
         except Exception as e:
-            app_logger.warning(
+            app_logger.warning(
+                f'Could not determine object type for {data_ref.full_name}: {str(e)}'
+            )
 
             return ObjectType.UNKNOWN
 
@@ -97,8 +107,13 @@ class ClickHouseAdapter(BaseDatabaseAdapter):
         params = {'schema': data_ref.schema, 'table': data_ref.name}
         return query, params
 
-    def build_count_query(
-
+    def build_count_query(
+        self,
+        data_ref: DataReference,
+        date_column: str,
+        start_date: Optional[str],
+        end_date: Optional[str],
+    ) -> Tuple[str, Dict]:
         query = f"""
             SELECT
                 formatDateTime(toDate({date_column}), '%%Y-%%m-%%d') as dt,
@@ -108,24 +123,29 @@ class ClickHouseAdapter(BaseDatabaseAdapter):
         """
         params = {}
 
-
         if start_date:
-            query += f
+            query += f' AND {date_column} >= toDate(%(start_date)s)'
            params['start_date'] = start_date
         if end_date:
-            query += f
+            query += f' AND {date_column} < toDate(%(end_date)s) + INTERVAL 1 day'
             params['end_date'] = end_date
 
-        query +=
+        query += ' GROUP BY dt ORDER BY dt DESC'
         return query, params
 
-    def build_data_query(
-
-
-
+    def build_data_query(
+        self,
+        data_ref: DataReference,
+        columns: List[str],
+        date_column: Optional[str],
+        update_column: str,
+        start_date: Optional[str],
+        end_date: Optional[str],
+        exclude_recent_hours: Optional[int] = None,
+    ) -> Tuple[str, Dict]:
         params = {}
         # Add recent data exclusion flag
-        exclusion_condition,
+        exclusion_condition, exclusion_params = self._build_exclusion_condition(
             update_column, exclude_recent_hours
         )
 
@@ -139,36 +159,41 @@ class ClickHouseAdapter(BaseDatabaseAdapter):
             WHERE 1=1\n"""
 
         if start_date and date_column:
-            query += f
+            query += f' AND {date_column} >= toDate(%(start_date)s)\n'
             params['start_date'] = start_date
         if end_date and date_column:
-            query += f
+            query += f' AND {date_column} < toDate(%(end_date)s) + INTERVAL 1 day\n'
             params['end_date'] = end_date
 
         return query, params
 
-    def _build_exclusion_condition(
-
+    def _build_exclusion_condition(
+        self, update_column: str, exclude_recent_hours: int
+    ) -> Tuple[str, Dict]:
         """ClickHouse-specific implementation for recent data exclusion"""
-        if
-
-
+        if update_column and exclude_recent_hours:
             exclude_recent_hours = exclude_recent_hours
 
             condition = f"""case when {update_column} > (now() - INTERVAL %(exclude_recent_hours)s HOUR) then 'y' end as xrecently_changed"""
-            params = {'exclude_recent_hours':
+            params = {'exclude_recent_hours': exclude_recent_hours}
             return condition, params
 
         return None, None
 
     def _get_type_conversion_rules(self, timezone: str) -> Dict[str, Callable]:
         return {
-
-
-
-
-            r'
-
-
-
-
+            r'datetime64|datetime': lambda x: (
+                pd.to_datetime(x, utc=True, errors='coerce')
+                .dt.tz_convert(timezone)
+                .dt.strftime(DATETIME_FORMAT)
+                .str.replace(r'\s00:00:00$', '', regex=True)
+            ),
+            r'date': lambda x: (
+                pd.to_datetime(x, errors='coerce')
+                .dt.strftime(DATE_FORMAT)
+                .str.replace(r'\s00:00:00$', '', regex=True)
+            ),
+            r'uint64|uint8|float|decimal|int32': lambda x: x.astype(str).str.replace(
+                r'\.0+$', '', regex=True
+            ),
+        }
```
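Both ClickHouse query builders return a `(query, params)` pair: identifiers (schema, table, and column names) are interpolated into the SQL text, while values travel separately through pyformat `%(name)s` placeholders. A standalone sketch of the same incremental pattern, with invented table and column names:

```python
# Standalone sketch of the (query, params) pattern used above; identifiers are
# interpolated (they come from trusted metadata), values are bound as params.
from typing import Dict, Optional, Tuple

def build_count_query(
    table: str,
    date_column: str,
    start_date: Optional[str],
    end_date: Optional[str],
) -> Tuple[str, Dict]:
    query = f'SELECT count(*) AS cnt FROM {table} WHERE 1=1'
    params: Dict[str, str] = {}
    if start_date:
        query += f' AND {date_column} >= toDate(%(start_date)s)'
        params['start_date'] = start_date
    if end_date:
        query += f' AND {date_column} < toDate(%(end_date)s) + INTERVAL 1 day'
        params['end_date'] = end_date
    return query, params

query, params = build_count_query('events', 'event_date', '2024-01-01', None)
print(query)   # ... AND event_date >= toDate(%(start_date)s)
print(params)  # {'start_date': '2024-01-01'}
```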
{xoverrr-1.1.5 → xoverrr-1.1.6}/src/xoverrr/adapters/oracle.py

```diff
@@ -1,16 +1,19 @@
+import time
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
 import pandas as pd
-from typing import Optional, Dict, Callable, List, Tuple, Union
 
 from ..constants import DATETIME_FORMAT
-from .base import BaseDatabaseAdapter, Engine
-from ..models import DataReference, ObjectType
 from ..exceptions import QueryExecutionError
 from ..logger import app_logger
-import
+from ..models import DataReference, ObjectType
+from .base import BaseDatabaseAdapter, Engine
 
-class OracleAdapter(BaseDatabaseAdapter):
 
-
+class OracleAdapter(BaseDatabaseAdapter):
+    def _execute_query(
+        self, query: Union[str, Tuple[str, Dict]], engine: Engine, timezone: str
+    ) -> pd.DataFrame:
         tz_set = None
         raw_conn = None
         cursor = None
@@ -40,12 +43,11 @@ class OracleAdapter(BaseDatabaseAdapter):
             app_logger.info(f'query\n {query}')
             cursor.execute(query)
 
-
             columns = [col[0].lower() for col in cursor.description]
             data = cursor.fetchall()
 
             execution_time = time.time() - start_time
-            app_logger.info(f
+            app_logger.info(f'Query executed in {execution_time:.2f}s')
 
             app_logger.info('complete')
 
@@ -57,20 +59,22 @@ class OracleAdapter(BaseDatabaseAdapter):
 
         except Exception as e:
             execution_time = time.time() - start_time
-            app_logger.error(
+            app_logger.error(
+                f'Query execution failed after {execution_time:.2f}s: {str(e)}'
+            )
 
             if raw_conn:
                 try:
                     raw_conn.rollback()
                 except Exception as rollback_error:
-                    app_logger.warning(f
+                    app_logger.warning(f'Rollback failed: {rollback_error}')
             try:
                 if cursor:
                     cursor.close()
             except Exception as close_error:
-                app_logger.warning(f
+                app_logger.warning(f'Cursor close failed: {close_error}')
 
-            raise QueryExecutionError(f
+            raise QueryExecutionError(f'Query failed: {str(e)}')
 
     def get_object_type(self, data_ref: DataReference, engine: Engine) -> ObjectType:
         """Determine if object is table or view in Oracle"""
@@ -95,10 +99,12 @@ class OracleAdapter(BaseDatabaseAdapter):
             return {
                 'table': ObjectType.TABLE,
                 'view': ObjectType.VIEW,
-                'materialized_view': ObjectType.MATERIALIZED_VIEW
+                'materialized_view': ObjectType.MATERIALIZED_VIEW,
             }.get(type_str, ObjectType.UNKNOWN)
         except Exception as e:
-            app_logger.warning(
+            app_logger.warning(
+                f'Could not determine object type for {data_ref.full_name}: {str(e)}'
+            )
 
             return ObjectType.UNKNOWN
 
@@ -121,7 +127,7 @@ class OracleAdapter(BaseDatabaseAdapter):
 
     def build_primary_key_query(self, data_ref: DataReference) -> pd.DataFrame:
 
-        #todo add suport of unique indexes when no pk?
+        # todo add suport of unique indexes when no pk?
         query = """
             SELECT lower(cols.column_name) as pk_column_name
             FROM all_constraints cons
@@ -139,9 +145,13 @@ class OracleAdapter(BaseDatabaseAdapter):
         params['table_name'] = data_ref.name
         return query, params
 
-
-
-
+    def build_count_query(
+        self,
+        data_ref: DataReference,
+        date_column: str,
+        start_date: Optional[str],
+        end_date: Optional[str],
+    ) -> Tuple[str, Dict]:
         query = f"""
             SELECT
                 to_char(trunc({date_column}, 'dd'),'YYYY-MM-DD') as dt,
@@ -150,7 +160,6 @@ class OracleAdapter(BaseDatabaseAdapter):
             WHERE 1=1\n"""
         params = {}
 
-
         if start_date:
             query += f" AND {date_column} >= trunc(to_date(:start_date, 'YYYY-MM-DD'), 'dd')\n"
             params['start_date'] = start_date
@@ -161,14 +170,20 @@ class OracleAdapter(BaseDatabaseAdapter):
         query += f" GROUP BY to_char(trunc({date_column}, 'dd'),'YYYY-MM-DD') ORDER BY dt DESC"
         return query, params
 
-    def build_data_query(
-
-
-
+    def build_data_query(
+        self,
+        data_ref: DataReference,
+        columns: List[str],
+        date_column: Optional[str],
+        update_column: str,
+        start_date: Optional[str],
+        end_date: Optional[str],
+        exclude_recent_hours: Optional[int] = None,
+    ) -> Tuple[str, Dict]:
 
         params = {}
         # Add recent data exclusion flag
-        exclusion_condition,
+        exclusion_condition, exclusion_params = self._build_exclusion_condition(
             update_column, exclude_recent_hours
         )
 
@@ -191,25 +206,39 @@ class OracleAdapter(BaseDatabaseAdapter):
 
         return query, params
 
-    def _build_exclusion_condition(
-
+    def _build_exclusion_condition(
+        self, update_column: str, exclude_recent_hours: int
+    ) -> Tuple[str, Dict]:
         """Oracle-specific implementation for recent data exclusion"""
-        if
-
-
-
+        if update_column and exclude_recent_hours:
             condition = f"""case when {update_column} > (sysdate - :exclude_recent_hours/24) then 'y' end as xrecently_changed"""
-            params = {'exclude_recent_hours':
+            params = {'exclude_recent_hours': exclude_recent_hours}
             return condition, params
 
         return None, None
 
     def _get_type_conversion_rules(self, timezone: str) -> Dict[str, Callable]:
         return {
-            #errors='coerce' is needed as workaround for >= 2262 year: Out of bounds nanosecond timestamp (3023-04-04 00:00:00)
+            # errors='coerce' is needed as workaround for >= 2262 year: Out of bounds nanosecond timestamp (3023-04-04 00:00:00)
             # todo need specify explicit dateformat (nls params) in sessions, for the correct string conversion to datetime
-            r'date': lambda x:
-
-
-
+            r'date': lambda x: (
+                pd.to_datetime(x, errors='coerce')
+                .dt.strftime(DATETIME_FORMAT)
+                .str.replace(r'\s00:00:00$', '', regex=True)
+            ),
+            r'timestamp.*\bwith\b.*time\szone': lambda x: (
+                pd.to_datetime(x, utc=True, errors='coerce')
+                .dt.tz_convert(timezone)
+                .dt.tz_localize(None)
+                .dt.strftime(DATETIME_FORMAT)
+                .str.replace(r'\s00:00:00$', '', regex=True)
+            ),
+            r'timestamp': lambda x: (
+                pd.to_datetime(x, errors='coerce')
+                .dt.strftime(DATETIME_FORMAT)
+                .str.replace(r'\s00:00:00$', '', regex=True)
+            ),
+            r'number|float|double': lambda x: (
+                x.astype(str).str.replace(r'\.0+$', '', regex=True).str.lower()
+            ),  # lower case for exponential form compare
         }
```
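The Oracle `timestamp with time zone` rule normalizes values to a single wall-clock form before comparison: parse as UTC, convert to the configured zone, strip the tz info, format, and trim bare midnights. The standalone chain below reproduces it; the `DATETIME_FORMAT` value and the target zone are assumptions for the demo, and the year-3023 value shows why `errors='coerce'` is needed:

```python
# Standalone demo of the tz-normalization chain used in the Oracle rules.
import pandas as pd

DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'  # assumption: stands in for xoverrr's constant

s = pd.Series(['2024-06-01 12:00:00+00:00', '3023-04-04 00:00:00+00:00'])
out = (
    pd.to_datetime(s, utc=True, errors='coerce')  # 'coerce' absorbs the year-3023 overflow
    .dt.tz_convert('Europe/Moscow')               # target zone chosen for the demo
    .dt.tz_localize(None)                         # drop tz info, keep local wall clock
    .dt.strftime(DATETIME_FORMAT)
    .str.replace(r'\s00:00:00$', '', regex=True)  # trim bare midnights
)
print(out.tolist())  # ['2024-06-01 15:00:00', nan]
```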