xoverrr 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xoverrr/__init__.py +8 -12
- xoverrr/adapters/__init__.py +7 -2
- xoverrr/adapters/base.py +61 -32
- xoverrr/adapters/clickhouse.py +62 -37
- xoverrr/adapters/oracle.py +65 -36
- xoverrr/adapters/postgres.py +67 -35
- xoverrr/constants.py +4 -4
- xoverrr/core.py +296 -198
- xoverrr/exceptions.py +8 -1
- xoverrr/logger.py +4 -2
- xoverrr/models.py +11 -5
- xoverrr/utils.py +328 -252
- {xoverrr-1.1.5.dist-info → xoverrr-1.1.6.dist-info}/METADATA +3 -3
- xoverrr-1.1.6.dist-info/RECORD +17 -0
- xoverrr-1.1.5.dist-info/RECORD +0 -17
- {xoverrr-1.1.5.dist-info → xoverrr-1.1.6.dist-info}/WHEEL +0 -0
- {xoverrr-1.1.5.dist-info → xoverrr-1.1.6.dist-info}/licenses/LICENSE +0 -0
- {xoverrr-1.1.5.dist-info → xoverrr-1.1.6.dist-info}/top_level.txt +0 -0
xoverrr/core.py
CHANGED
|
@@ -1,41 +1,21 @@
|
|
|
1
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
1
2
|
|
|
2
|
-
import sys
|
|
3
|
-
from enum import Enum, auto
|
|
4
|
-
from typing import Optional, List, Dict, Callable, Union, Tuple, Any
|
|
5
3
|
import pandas as pd
|
|
6
4
|
from sqlalchemy.engine import Engine
|
|
7
|
-
from .models import (
|
|
8
|
-
DBMSType,
|
|
9
|
-
DataReference,
|
|
10
|
-
ObjectType
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
from .logger import app_logger
|
|
14
5
|
|
|
6
|
+
from . import constants as ct
|
|
7
|
+
from .adapters.base import BaseDatabaseAdapter
|
|
8
|
+
from .adapters.clickhouse import ClickHouseAdapter
|
|
15
9
|
from .adapters.oracle import OracleAdapter
|
|
16
10
|
from .adapters.postgres import PostgresAdapter
|
|
17
|
-
from .
|
|
18
|
-
from .
|
|
19
|
-
|
|
20
|
-
from . import
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
)
|
|
26
|
-
from .utils import (
|
|
27
|
-
prepare_dataframe,
|
|
28
|
-
compare_dataframes,
|
|
29
|
-
clean_recently_changed_data,
|
|
30
|
-
generate_comparison_sample_report,
|
|
31
|
-
generate_comparison_count_report,
|
|
32
|
-
cross_fill_missing_dates,
|
|
33
|
-
validate_dataframe_size,
|
|
34
|
-
ComparisonStats,
|
|
35
|
-
ComparisonDiffDetails
|
|
36
|
-
)
|
|
37
|
-
|
|
38
|
-
|
|
11
|
+
from .exceptions import DQCompareException, MetadataError
|
|
12
|
+
from .logger import app_logger
|
|
13
|
+
from .models import DataReference, DBMSType, ObjectType
|
|
14
|
+
from .utils import (ComparisonDiffDetails, ComparisonStats,
|
|
15
|
+
clean_recently_changed_data, compare_dataframes,
|
|
16
|
+
cross_fill_missing_dates, generate_comparison_count_report,
|
|
17
|
+
generate_comparison_sample_report, normalize_column_names,
|
|
18
|
+
prepare_dataframe, validate_dataframe_size)
|
|
39
19
|
|
|
40
20
|
|
|
41
21
|
class DataQualityComparator:
|
|
@@ -48,7 +28,7 @@ class DataQualityComparator:
|
|
|
48
28
|
source_engine: Engine,
|
|
49
29
|
target_engine: Engine,
|
|
50
30
|
default_exclude_recent_hours: Optional[int] = 24,
|
|
51
|
-
timezone: str = ct.DEFAULT_TZ
|
|
31
|
+
timezone: str = ct.DEFAULT_TZ,
|
|
52
32
|
):
|
|
53
33
|
self.source_engine = source_engine
|
|
54
34
|
self.target_engine = target_engine
|
|
@@ -64,11 +44,11 @@ class DataQualityComparator:
|
|
|
64
44
|
}
|
|
65
45
|
self._reset_stats()
|
|
66
46
|
from . import __version__
|
|
47
|
+
|
|
67
48
|
app_logger.info('start')
|
|
68
49
|
app_logger.info(f'Version: v{__version__}')
|
|
69
50
|
app_logger.info(f'Source DB: {self.source_db_type.name}')
|
|
70
|
-
app_logger.info(f'Target DB: {self.target_db_type.name}')
|
|
71
|
-
|
|
51
|
+
app_logger.info(f'Target DB: {self.target_db_type.name}')
|
|
72
52
|
|
|
73
53
|
def reset_stats(self):
|
|
74
54
|
self._reset_stats()
|
|
@@ -79,17 +59,19 @@ class DataQualityComparator:
|
|
|
79
59
|
ct.COMPARISON_SUCCESS: 0,
|
|
80
60
|
ct.COMPARISON_FAILED: 0,
|
|
81
61
|
ct.COMPARISON_SKIPPED: 0,
|
|
82
|
-
'tables_success'
|
|
83
|
-
'tables_failed'
|
|
62
|
+
'tables_success': set(),
|
|
63
|
+
'tables_failed': set(),
|
|
84
64
|
'tables_skipped': set(),
|
|
85
65
|
'start_time': pd.Timestamp.now().strftime(ct.DATETIME_FORMAT),
|
|
86
|
-
'end_time': None
|
|
66
|
+
'end_time': None,
|
|
87
67
|
}
|
|
88
68
|
|
|
89
|
-
def _update_stats(self, status: str, source_table:DataReference):
|
|
69
|
+
def _update_stats(self, status: str, source_table: DataReference):
|
|
90
70
|
"""Update comparison statistics"""
|
|
91
71
|
self.comparison_stats[status] += 1
|
|
92
|
-
self.comparison_stats['end_time'] = pd.Timestamp.now().strftime(
|
|
72
|
+
self.comparison_stats['end_time'] = pd.Timestamp.now().strftime(
|
|
73
|
+
ct.DATETIME_FORMAT
|
|
74
|
+
)
|
|
93
75
|
if source_table:
|
|
94
76
|
match status:
|
|
95
77
|
case ct.COMPARISON_SUCCESS:
|
|
@@ -106,7 +88,7 @@ class DataQualityComparator:
|
|
|
106
88
|
date_column: Optional[str] = None,
|
|
107
89
|
date_range: Optional[Tuple[str, str]] = None,
|
|
108
90
|
tolerance_percentage: float = 0.0,
|
|
109
|
-
max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES
|
|
91
|
+
max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
|
|
110
92
|
) -> Tuple[str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
|
|
111
93
|
|
|
112
94
|
self._validate_inputs(source_table, target_table)
|
|
@@ -116,17 +98,21 @@ class DataQualityComparator:
|
|
|
116
98
|
try:
|
|
117
99
|
self.comparison_stats['compared'] += 1
|
|
118
100
|
|
|
119
|
-
|
|
120
101
|
status, report, stats, details = self._compare_counts(
|
|
121
|
-
|
|
122
|
-
|
|
102
|
+
source_table,
|
|
103
|
+
target_table,
|
|
104
|
+
date_column,
|
|
105
|
+
start_date,
|
|
106
|
+
end_date,
|
|
107
|
+
tolerance_percentage,
|
|
108
|
+
max_examples,
|
|
123
109
|
)
|
|
124
110
|
|
|
125
111
|
self._update_stats(status, source_table)
|
|
126
112
|
return status, report, stats, details
|
|
127
113
|
|
|
128
114
|
except Exception as e:
|
|
129
|
-
app_logger.exception(f
|
|
115
|
+
app_logger.exception(f'Count comparison failed: {str(e)}')
|
|
130
116
|
status = ct.COMPARISON_FAILED
|
|
131
117
|
self._update_stats(status, source_table)
|
|
132
118
|
return status, None, None, None
|
|
@@ -143,25 +129,25 @@ class DataQualityComparator:
|
|
|
143
129
|
custom_primary_key: Optional[List[str]] = None,
|
|
144
130
|
tolerance_percentage: float = 0.0,
|
|
145
131
|
exclude_recent_hours: Optional[int] = None,
|
|
146
|
-
max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES
|
|
132
|
+
max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
|
|
147
133
|
) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
|
|
148
134
|
"""
|
|
149
135
|
Compare data from custom queries with specified key columns
|
|
150
136
|
|
|
151
137
|
Parameters:
|
|
152
|
-
source_table: `DataReference`
|
|
138
|
+
source_table: `DataReference`
|
|
153
139
|
source table to compare
|
|
154
140
|
target_table: `DataReference`
|
|
155
141
|
target table to compare
|
|
156
142
|
custom_primary_key : `List[str]`
|
|
157
143
|
List of primary key columns for comparison.
|
|
158
|
-
exclude_columns : `Optional[List[str]] = None`
|
|
144
|
+
exclude_columns : `Optional[List[str]] = None`
|
|
159
145
|
Columns to exclude from comparison.
|
|
160
|
-
include_columns : `Optional[List[str]] = None`
|
|
146
|
+
include_columns : `Optional[List[str]] = None`
|
|
161
147
|
Columns to include from comparison (default all cols)
|
|
162
|
-
tolerance_percentage : `float`
|
|
148
|
+
tolerance_percentage : `float`
|
|
163
149
|
Tolerance percentage for discrepancies.
|
|
164
|
-
max_examples
|
|
150
|
+
max_examples
|
|
165
151
|
Maximum number of discrepancy examples per column
|
|
166
152
|
"""
|
|
167
153
|
self._validate_inputs(source_table, target_table)
|
|
@@ -169,35 +155,51 @@ class DataQualityComparator:
|
|
|
169
155
|
exclude_hours = exclude_recent_hours or self.default_exclude_recent_hours
|
|
170
156
|
|
|
171
157
|
start_date, end_date = date_range or (None, None)
|
|
172
|
-
exclude_cols = exclude_columns or []
|
|
173
|
-
custom_keys =
|
|
174
|
-
|
|
158
|
+
exclude_cols = normalize_column_names(exclude_columns or [])
|
|
159
|
+
custom_keys = (
|
|
160
|
+
normalize_column_names(custom_primary_key or [])
|
|
161
|
+
if custom_primary_key
|
|
162
|
+
else None
|
|
163
|
+
)
|
|
164
|
+
include_cols = normalize_column_names(include_columns or [])
|
|
175
165
|
|
|
176
166
|
try:
|
|
177
167
|
self.comparison_stats['compared'] += 1
|
|
178
168
|
|
|
179
169
|
status, report, stats, details = self._compare_samples(
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
170
|
+
source_table,
|
|
171
|
+
target_table,
|
|
172
|
+
date_column,
|
|
173
|
+
update_column,
|
|
174
|
+
start_date,
|
|
175
|
+
end_date,
|
|
176
|
+
exclude_cols,
|
|
177
|
+
include_cols,
|
|
178
|
+
custom_keys,
|
|
179
|
+
tolerance_percentage,
|
|
180
|
+
exclude_hours,
|
|
181
|
+
max_examples,
|
|
183
182
|
)
|
|
184
183
|
|
|
185
184
|
self._update_stats(status, source_table)
|
|
186
185
|
return status, report, stats, details
|
|
187
186
|
|
|
188
187
|
except Exception as e:
|
|
189
|
-
app_logger.exception(f
|
|
188
|
+
app_logger.exception(f'Sample comparison failed: {str(e)}')
|
|
190
189
|
status = ct.COMPARISON_FAILED
|
|
191
190
|
self._update_stats(status, source_table)
|
|
192
191
|
return status, None, None, None
|
|
193
192
|
|
|
194
|
-
def _compare_counts(
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
193
|
+
def _compare_counts(
|
|
194
|
+
self,
|
|
195
|
+
source_table: DataReference,
|
|
196
|
+
target_table: DataReference,
|
|
197
|
+
date_column: str,
|
|
198
|
+
start_date: Optional[str],
|
|
199
|
+
end_date: Optional[str],
|
|
200
|
+
tolerance_percentage: float,
|
|
201
|
+
max_examples: int,
|
|
202
|
+
) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
|
|
201
203
|
|
|
202
204
|
try:
|
|
203
205
|
source_adapter = self._get_adapter(self.source_db_type)
|
|
@@ -206,58 +208,73 @@ class DataQualityComparator:
|
|
|
206
208
|
source_query, source_params = source_adapter.build_count_query(
|
|
207
209
|
source_table, date_column, start_date, end_date
|
|
208
210
|
)
|
|
209
|
-
source_counts = self._execute_query(
|
|
211
|
+
source_counts = self._execute_query(
|
|
212
|
+
(source_query, source_params), self.source_engine, self.timezone
|
|
213
|
+
)
|
|
210
214
|
|
|
211
215
|
target_query, target_params = target_adapter.build_count_query(
|
|
212
216
|
target_table, date_column, start_date, end_date
|
|
213
217
|
)
|
|
214
|
-
target_counts = self._execute_query(
|
|
215
|
-
|
|
218
|
+
target_counts = self._execute_query(
|
|
219
|
+
(target_query, target_params), self.target_engine, self.timezone
|
|
220
|
+
)
|
|
216
221
|
|
|
217
|
-
source_counts_filled, target_counts_filled = cross_fill_missing_dates(
|
|
222
|
+
source_counts_filled, target_counts_filled = cross_fill_missing_dates(
|
|
223
|
+
source_counts, target_counts
|
|
224
|
+
)
|
|
218
225
|
|
|
219
226
|
merged = source_counts_filled.merge(target_counts_filled, on='dt')
|
|
220
227
|
total_count_source = source_counts_filled['cnt'].sum()
|
|
221
|
-
total_count_taget =
|
|
228
|
+
total_count_taget = target_counts_filled['cnt'].sum()
|
|
222
229
|
|
|
223
|
-
if (total_count_source, total_count_taget)
|
|
230
|
+
if (total_count_source, total_count_taget) == (0, 0):
|
|
224
231
|
app_logger.warning('nothing to compare to you')
|
|
225
232
|
status = ct.COMPARISON_SKIPPED
|
|
226
233
|
return status, None, None, None
|
|
227
234
|
|
|
228
235
|
else:
|
|
229
|
-
|
|
230
236
|
result_diff_in_counters = abs(merged['cnt_x'] - merged['cnt_y']).sum()
|
|
231
237
|
result_equal_in_counters = merged[['cnt_x', 'cnt_y']].min(axis=1).sum()
|
|
232
238
|
|
|
233
|
-
discrepancies_counters_percentage =
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
239
|
+
discrepancies_counters_percentage = (
|
|
240
|
+
100
|
|
241
|
+
* result_diff_in_counters
|
|
242
|
+
/ (result_diff_in_counters + result_equal_in_counters)
|
|
243
|
+
)
|
|
244
|
+
stats, details = compare_dataframes(
|
|
245
|
+
source_df=source_counts_filled,
|
|
246
|
+
target_df=target_counts_filled,
|
|
247
|
+
key_columns=['dt'],
|
|
248
|
+
max_examples=max_examples,
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
status = (
|
|
252
|
+
ct.COMPARISON_FAILED
|
|
253
|
+
if discrepancies_counters_percentage > tolerance_percentage
|
|
254
|
+
else ct.COMPARISON_SUCCESS
|
|
255
|
+
)
|
|
256
|
+
|
|
257
|
+
report = generate_comparison_count_report(
|
|
258
|
+
source_table.full_name,
|
|
259
|
+
target_table.full_name,
|
|
260
|
+
stats,
|
|
261
|
+
details,
|
|
262
|
+
total_count_source,
|
|
263
|
+
total_count_taget,
|
|
264
|
+
discrepancies_counters_percentage,
|
|
265
|
+
result_diff_in_counters,
|
|
266
|
+
result_equal_in_counters,
|
|
267
|
+
self.timezone,
|
|
268
|
+
source_query,
|
|
269
|
+
source_params,
|
|
270
|
+
target_query,
|
|
271
|
+
target_params,
|
|
272
|
+
)
|
|
256
273
|
|
|
257
274
|
return status, report, stats, details
|
|
258
275
|
|
|
259
276
|
except Exception as e:
|
|
260
|
-
app_logger.error(f
|
|
277
|
+
app_logger.error(f'Count comparison failed: {str(e)}')
|
|
261
278
|
raise
|
|
262
279
|
|
|
263
280
|
def _compare_samples(
|
|
@@ -271,28 +288,36 @@ class DataQualityComparator:
|
|
|
271
288
|
exclude_columns: List[str],
|
|
272
289
|
include_columns: List[str],
|
|
273
290
|
custom_key_columns: Optional[List[str]],
|
|
274
|
-
tolerance_percentage:float,
|
|
291
|
+
tolerance_percentage: float,
|
|
275
292
|
exclude_recent_hours: Optional[int],
|
|
276
|
-
max_examples:Optional[int]
|
|
293
|
+
max_examples: Optional[int],
|
|
277
294
|
) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
|
|
278
295
|
|
|
279
296
|
try:
|
|
280
297
|
source_object_type = self._get_object_type(source_table, self.source_engine)
|
|
281
298
|
target_object_type = self._get_object_type(target_table, self.target_engine)
|
|
282
|
-
app_logger.info(
|
|
299
|
+
app_logger.info(
|
|
300
|
+
f'object type source: {source_object_type} vs target {target_object_type}'
|
|
301
|
+
)
|
|
283
302
|
|
|
284
|
-
source_columns_meta = self._get_metadata_cols(
|
|
303
|
+
source_columns_meta = self._get_metadata_cols(
|
|
304
|
+
source_table, self.source_engine
|
|
305
|
+
)
|
|
285
306
|
app_logger.info('source_columns meta:\n')
|
|
286
307
|
app_logger.info(source_columns_meta.to_string(index=False))
|
|
287
308
|
|
|
288
|
-
target_columns_meta = self._get_metadata_cols(
|
|
309
|
+
target_columns_meta = self._get_metadata_cols(
|
|
310
|
+
target_table, self.target_engine
|
|
311
|
+
)
|
|
289
312
|
app_logger.info('target_columns meta:\n')
|
|
290
313
|
app_logger.info(target_columns_meta.to_string(index=False))
|
|
291
314
|
|
|
292
|
-
intersect = list(set(include_columns)&set(exclude_columns))
|
|
315
|
+
intersect = list(set(include_columns) & set(exclude_columns))
|
|
293
316
|
if intersect:
|
|
294
|
-
app_logger.warning(
|
|
295
|
-
|
|
317
|
+
app_logger.warning(
|
|
318
|
+
f'Intersection columns between Include and exclude: {",".join(intersect)}'
|
|
319
|
+
)
|
|
320
|
+
|
|
296
321
|
key_columns = None
|
|
297
322
|
|
|
298
323
|
if custom_key_columns:
|
|
@@ -300,30 +325,55 @@ class DataQualityComparator:
|
|
|
300
325
|
source_cols = source_columns_meta['column_name'].tolist()
|
|
301
326
|
target_cols = target_columns_meta['column_name'].tolist()
|
|
302
327
|
|
|
303
|
-
missing_in_source = [
|
|
304
|
-
|
|
328
|
+
missing_in_source = [
|
|
329
|
+
col for col in custom_key_columns if col not in source_cols
|
|
330
|
+
]
|
|
331
|
+
missing_in_target = [
|
|
332
|
+
col for col in custom_key_columns if col not in target_cols
|
|
333
|
+
]
|
|
305
334
|
|
|
306
335
|
if missing_in_source:
|
|
307
|
-
raise MetadataError(
|
|
336
|
+
raise MetadataError(
|
|
337
|
+
f'Custom key columns missing in source: {missing_in_source}'
|
|
338
|
+
)
|
|
308
339
|
if missing_in_target:
|
|
309
|
-
raise MetadataError(
|
|
340
|
+
raise MetadataError(
|
|
341
|
+
f'Custom key columns missing in target: {missing_in_target}'
|
|
342
|
+
)
|
|
310
343
|
else:
|
|
311
|
-
source_pk =
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
344
|
+
source_pk = (
|
|
345
|
+
self._get_metadata_pk(source_table, self.source_engine)
|
|
346
|
+
if source_object_type == ObjectType.TABLE
|
|
347
|
+
else pd.DataFrame({'pk_column_name': []})
|
|
348
|
+
)
|
|
349
|
+
target_pk = (
|
|
350
|
+
self._get_metadata_pk(target_table, self.target_engine)
|
|
351
|
+
if target_object_type == ObjectType.TABLE
|
|
352
|
+
else pd.DataFrame({'pk_column_name': []})
|
|
353
|
+
)
|
|
354
|
+
|
|
355
|
+
if (
|
|
356
|
+
source_pk['pk_column_name'].tolist()
|
|
357
|
+
!= target_pk['pk_column_name'].tolist()
|
|
358
|
+
):
|
|
359
|
+
app_logger.warning(
|
|
360
|
+
f'Primary keys differ: source={source_pk["pk_column_name"].tolist()}, target={target_pk["pk_column_name"].tolist()}'
|
|
361
|
+
)
|
|
362
|
+
key_columns = (
|
|
363
|
+
source_pk['pk_column_name'].tolist()
|
|
364
|
+
or target_pk['pk_column_name'].tolist()
|
|
365
|
+
)
|
|
319
366
|
if not key_columns:
|
|
320
|
-
raise MetadataError(
|
|
367
|
+
raise MetadataError(
|
|
368
|
+
f'Primary key not found in the source neither in the target and not provided'
|
|
369
|
+
)
|
|
321
370
|
|
|
322
371
|
if include_columns:
|
|
323
|
-
|
|
324
372
|
if not set(include_columns) & set(key_columns):
|
|
325
|
-
app_logger.warning(
|
|
326
|
-
|
|
373
|
+
app_logger.warning(
|
|
374
|
+
f'The primary key was not included in the column list.\
|
|
375
|
+
The key column was included in the resulting query automatically. PK:{key_columns}'
|
|
376
|
+
)
|
|
327
377
|
|
|
328
378
|
include_columns = list(set(include_columns + key_columns))
|
|
329
379
|
|
|
@@ -333,12 +383,13 @@ class DataQualityComparator:
|
|
|
333
383
|
target_columns_meta = target_columns_meta[
|
|
334
384
|
target_columns_meta['column_name'].isin(include_columns)
|
|
335
385
|
]
|
|
336
|
-
|
|
337
|
-
if exclude_columns:
|
|
338
386
|
|
|
387
|
+
if exclude_columns:
|
|
339
388
|
if set(exclude_columns) & set(key_columns):
|
|
340
|
-
app_logger.warning(
|
|
341
|
-
|
|
389
|
+
app_logger.warning(
|
|
390
|
+
f'The primary key has been excluded from the column list.\
|
|
391
|
+
However, the key column must be present in the resulting query.s PK:{key_columns}'
|
|
392
|
+
)
|
|
342
393
|
|
|
343
394
|
exclude_columns = list(set(exclude_columns) - set(key_columns))
|
|
344
395
|
|
|
@@ -349,63 +400,87 @@ class DataQualityComparator:
|
|
|
349
400
|
~target_columns_meta['column_name'].isin(exclude_columns)
|
|
350
401
|
]
|
|
351
402
|
|
|
352
|
-
common_cols_df, source_only_cols, target_only_cols =
|
|
403
|
+
common_cols_df, source_only_cols, target_only_cols = (
|
|
404
|
+
self._analyze_columns_meta(source_columns_meta, target_columns_meta)
|
|
405
|
+
)
|
|
353
406
|
common_cols = common_cols_df['column_name'].tolist()
|
|
354
407
|
|
|
355
408
|
if not common_cols:
|
|
356
|
-
raise MetadataError(
|
|
357
|
-
|
|
409
|
+
raise MetadataError(
|
|
410
|
+
f'No one column to compare, need to check tables or reduce the exclude_columns list: {",".join(exclude_columns)}'
|
|
411
|
+
)
|
|
412
|
+
|
|
358
413
|
source_data, source_query, source_params = self._get_table_data(
|
|
359
|
-
self.source_engine,
|
|
360
|
-
|
|
414
|
+
self.source_engine,
|
|
415
|
+
source_table,
|
|
416
|
+
source_columns_meta,
|
|
417
|
+
common_cols,
|
|
418
|
+
date_column,
|
|
419
|
+
update_column,
|
|
420
|
+
start_date,
|
|
421
|
+
end_date,
|
|
422
|
+
exclude_recent_hours,
|
|
361
423
|
)
|
|
362
424
|
|
|
363
425
|
target_data, target_query, target_params = self._get_table_data(
|
|
364
|
-
self.target_engine,
|
|
365
|
-
|
|
426
|
+
self.target_engine,
|
|
427
|
+
target_table,
|
|
428
|
+
target_columns_meta,
|
|
429
|
+
common_cols,
|
|
430
|
+
date_column,
|
|
431
|
+
update_column,
|
|
432
|
+
start_date,
|
|
433
|
+
end_date,
|
|
434
|
+
exclude_recent_hours,
|
|
366
435
|
)
|
|
367
436
|
status = None
|
|
368
|
-
#special case
|
|
437
|
+
# special case
|
|
369
438
|
if target_data.empty and source_data.empty:
|
|
370
439
|
status = ct.COMPARISON_SKIPPED
|
|
371
440
|
return status, None, None, None
|
|
372
441
|
elif source_data.empty or target_data.empty:
|
|
373
|
-
raise DQCompareException(
|
|
374
|
-
|
|
442
|
+
raise DQCompareException(
|
|
443
|
+
f'Nothing to compare, rows returned from source: {len(source_data)}, from target: {len(target_data)}'
|
|
444
|
+
)
|
|
375
445
|
|
|
376
446
|
source_data = prepare_dataframe(source_data)
|
|
377
447
|
target_data = prepare_dataframe(target_data)
|
|
378
448
|
if update_column and exclude_recent_hours:
|
|
379
|
-
source_data, target_data = clean_recently_changed_data(
|
|
380
|
-
|
|
449
|
+
source_data, target_data = clean_recently_changed_data(
|
|
450
|
+
source_data, target_data, key_columns
|
|
451
|
+
)
|
|
381
452
|
|
|
382
453
|
stats, details = compare_dataframes(
|
|
383
|
-
source_data, target_data,
|
|
384
|
-
key_columns, max_examples
|
|
454
|
+
source_data, target_data, key_columns, max_examples
|
|
385
455
|
)
|
|
386
456
|
|
|
387
457
|
if stats:
|
|
388
458
|
details.skipped_source_columns = source_only_cols
|
|
389
459
|
details.skipped_target_columns = target_only_cols
|
|
390
460
|
|
|
391
|
-
report = generate_comparison_sample_report(
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
461
|
+
report = generate_comparison_sample_report(
|
|
462
|
+
source_table.full_name,
|
|
463
|
+
target_table.full_name,
|
|
464
|
+
stats,
|
|
465
|
+
details,
|
|
466
|
+
self.timezone,
|
|
467
|
+
source_query,
|
|
468
|
+
source_params,
|
|
469
|
+
target_query,
|
|
470
|
+
target_params,
|
|
471
|
+
)
|
|
472
|
+
status = (
|
|
473
|
+
ct.COMPARISON_FAILED
|
|
474
|
+
if stats.final_diff_score > tolerance_percentage
|
|
475
|
+
else ct.COMPARISON_SUCCESS
|
|
476
|
+
)
|
|
402
477
|
return status, report, stats, details
|
|
403
478
|
else:
|
|
404
479
|
status = ct.COMPARISON_SKIPPED
|
|
405
480
|
return status, None, None, None
|
|
406
481
|
|
|
407
482
|
except Exception as e:
|
|
408
|
-
app_logger.error(f
|
|
483
|
+
app_logger.error(f'Sample comparison failed: {str(e)}')
|
|
409
484
|
raise
|
|
410
485
|
|
|
411
486
|
def compare_custom_query(
|
|
@@ -417,25 +492,25 @@ class DataQualityComparator:
|
|
|
417
492
|
custom_primary_key: List[str],
|
|
418
493
|
exclude_columns: Optional[List[str]] = None,
|
|
419
494
|
tolerance_percentage: float = 0.0,
|
|
420
|
-
max_examples:Optional[int] = ct.DEFAULT_MAX_EXAMPLES
|
|
495
|
+
max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
|
|
421
496
|
) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
|
|
422
497
|
"""
|
|
423
498
|
Compare data from custom queries with specified key columns
|
|
424
499
|
|
|
425
500
|
Parameters:
|
|
426
|
-
source_query : Union[str, Tuple[str, Dict]]
|
|
501
|
+
source_query : Union[str, Tuple[str, Dict]]
|
|
427
502
|
Source query (can be string or tuple with query and params).
|
|
428
503
|
target_query : Union[str, Tuple[str, Dict]]
|
|
429
504
|
Target query (can be string or tuple with query and params).
|
|
430
505
|
custom_primary_key : List[str]
|
|
431
506
|
List of primary key columns for comparison.
|
|
432
|
-
exclude_columns : Optional[List[str]] = None
|
|
507
|
+
exclude_columns : Optional[List[str]] = None
|
|
433
508
|
Columns to exclude from comparison.
|
|
434
|
-
tolerance_percentage : float
|
|
509
|
+
tolerance_percentage : float
|
|
435
510
|
Tolerance percentage for discrepancies.
|
|
436
511
|
max_examples: int
|
|
437
|
-
Maximum number of discrepancy examples per column
|
|
438
|
-
|
|
512
|
+
Maximum number of discrepancy examples per column
|
|
513
|
+
|
|
439
514
|
Returns:
|
|
440
515
|
----------
|
|
441
516
|
Tuple[str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]
|
|
@@ -448,8 +523,12 @@ class DataQualityComparator:
|
|
|
448
523
|
self.comparison_stats['compared'] += 1
|
|
449
524
|
|
|
450
525
|
# Execute queries
|
|
451
|
-
source_data = self._execute_query(
|
|
452
|
-
|
|
526
|
+
source_data = self._execute_query(
|
|
527
|
+
(source_query, source_params), source_engine, timezone
|
|
528
|
+
)
|
|
529
|
+
target_data = self._execute_query(
|
|
530
|
+
(target_query, target_params), target_engine, timezone
|
|
531
|
+
)
|
|
453
532
|
app_logger.info('preparing source dataframe')
|
|
454
533
|
source_data_prepared = prepare_dataframe(source_data)
|
|
455
534
|
app_logger.info('preparing target dataframe')
|
|
@@ -457,43 +536,60 @@ class DataQualityComparator:
|
|
|
457
536
|
|
|
458
537
|
# Exclude columns if specified
|
|
459
538
|
exclude_cols = exclude_columns or []
|
|
460
|
-
common_cols = [
|
|
461
|
-
|
|
539
|
+
common_cols = [
|
|
540
|
+
col
|
|
541
|
+
for col in source_data_prepared.columns
|
|
542
|
+
if col in target_data_prepared.columns and col not in exclude_cols
|
|
543
|
+
]
|
|
462
544
|
|
|
463
545
|
source_data_filtered = source_data_prepared[common_cols]
|
|
464
546
|
target_data_filtered = target_data_prepared[common_cols]
|
|
465
547
|
if 'xrecently_changed' in common_cols:
|
|
466
|
-
source_data_filtered, target_data_filtered =
|
|
548
|
+
source_data_filtered, target_data_filtered = (
|
|
549
|
+
clean_recently_changed_data(
|
|
550
|
+
source_data_filtered, target_data_filtered, custom_primary_key
|
|
551
|
+
)
|
|
552
|
+
)
|
|
467
553
|
# Compare dataframes
|
|
468
554
|
stats, details = compare_dataframes(
|
|
469
|
-
source_data_filtered,
|
|
555
|
+
source_data_filtered,
|
|
556
|
+
target_data_filtered,
|
|
557
|
+
custom_primary_key,
|
|
558
|
+
max_examples,
|
|
470
559
|
)
|
|
471
560
|
|
|
472
561
|
if stats:
|
|
473
|
-
report = generate_comparison_sample_report(
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
562
|
+
report = generate_comparison_sample_report(
|
|
563
|
+
None,
|
|
564
|
+
None,
|
|
565
|
+
stats,
|
|
566
|
+
details,
|
|
567
|
+
self.timezone,
|
|
568
|
+
source_query,
|
|
569
|
+
source_params,
|
|
570
|
+
target_query,
|
|
571
|
+
target_params,
|
|
572
|
+
)
|
|
573
|
+
status = (
|
|
574
|
+
ct.COMPARISON_FAILED
|
|
575
|
+
if stats.final_diff_score > tolerance_percentage
|
|
576
|
+
else ct.COMPARISON_SUCCESS
|
|
577
|
+
)
|
|
484
578
|
else:
|
|
485
579
|
status = ct.COMPARISON_SKIPPED
|
|
486
580
|
|
|
487
|
-
|
|
488
581
|
self._update_stats(status, None)
|
|
489
582
|
return status, report, stats, details
|
|
490
583
|
|
|
491
584
|
except Exception as e:
|
|
492
|
-
app_logger.exception(
|
|
585
|
+
app_logger.exception('Custom query comparison failed')
|
|
493
586
|
status = ct.COMPARISON_FAILED
|
|
494
587
|
self._update_stats(status, None)
|
|
495
588
|
return status, None, None, None
|
|
496
|
-
|
|
589
|
+
|
|
590
|
+
def _get_metadata_cols(
|
|
591
|
+
self, data_ref: DataReference, engine: Engine
|
|
592
|
+
) -> pd.DataFrame:
|
|
497
593
|
"""Get metadata with proper source handling"""
|
|
498
594
|
adapter = self._get_adapter(DBMSType.from_engine(engine))
|
|
499
595
|
|
|
@@ -501,13 +597,12 @@ class DataQualityComparator:
|
|
|
501
597
|
columns_meta = self._execute_query((query, params), engine)
|
|
502
598
|
|
|
503
599
|
if columns_meta.empty:
|
|
504
|
-
raise ValueError(f
|
|
600
|
+
raise ValueError(f'Failed to get metadata for: {data_ref.full_name}')
|
|
505
601
|
|
|
506
602
|
return columns_meta
|
|
507
603
|
|
|
508
604
|
def _get_metadata_pk(self, data_ref: DataReference, engine: Engine) -> pd.DataFrame:
|
|
509
|
-
"""Get metadata with proper source handling
|
|
510
|
-
"""
|
|
605
|
+
"""Get metadata with proper source handling"""
|
|
511
606
|
adapter = self._get_adapter(DBMSType.from_engine(engine))
|
|
512
607
|
|
|
513
608
|
query, params = adapter.build_primary_key_query(data_ref)
|
|
@@ -531,19 +626,24 @@ class DataQualityComparator:
|
|
|
531
626
|
update_column: str,
|
|
532
627
|
start_date: Optional[str],
|
|
533
628
|
end_date: Optional[str],
|
|
534
|
-
exclude_recent_hours: Optional[int]
|
|
535
|
-
) -> Tuple[pd.DataFrame, str, Dict]
|
|
629
|
+
exclude_recent_hours: Optional[int],
|
|
630
|
+
) -> Tuple[pd.DataFrame, str, Dict]:
|
|
536
631
|
"""Retrieve and prepare table data"""
|
|
537
632
|
db_type = DBMSType.from_engine(engine)
|
|
538
633
|
adapter = self._get_adapter(db_type)
|
|
539
634
|
app_logger.info(db_type)
|
|
540
635
|
|
|
541
636
|
query, params = adapter.build_data_query_common(
|
|
542
|
-
data_ref,
|
|
543
|
-
|
|
637
|
+
data_ref,
|
|
638
|
+
columns,
|
|
639
|
+
date_column,
|
|
640
|
+
update_column,
|
|
641
|
+
start_date,
|
|
642
|
+
end_date,
|
|
643
|
+
exclude_recent_hours,
|
|
544
644
|
)
|
|
545
645
|
|
|
546
|
-
df = self._execute_query((query,params), engine, self.timezone)
|
|
646
|
+
df = self._execute_query((query, params), engine, self.timezone)
|
|
547
647
|
|
|
548
648
|
# Apply type conversions
|
|
549
649
|
df = adapter.convert_types(df, metadata, self.timezone)
|
|
@@ -555,9 +655,11 @@ class DataQualityComparator:
|
|
|
555
655
|
try:
|
|
556
656
|
return self.adapters[db_type]
|
|
557
657
|
except KeyError:
|
|
558
|
-
raise ValueError(f
|
|
658
|
+
raise ValueError(f'No adapter available for {db_type}')
|
|
559
659
|
|
|
560
|
-
def _execute_query(
|
|
660
|
+
def _execute_query(
|
|
661
|
+
self, query: Union[str, Tuple[str, Dict]], engine: Engine, timezone: str = None
|
|
662
|
+
) -> pd.DataFrame:
|
|
561
663
|
"""Execute SQL query using appropriate adapter"""
|
|
562
664
|
db_type = DBMSType.from_engine(engine)
|
|
563
665
|
adapter = self._get_adapter(db_type)
|
|
@@ -566,9 +668,7 @@ class DataQualityComparator:
|
|
|
566
668
|
return df
|
|
567
669
|
|
|
568
670
|
def _analyze_columns_meta(
|
|
569
|
-
self,
|
|
570
|
-
source_columns_meta: pd.DataFrame,
|
|
571
|
-
target_columns_meta: pd.DataFrame
|
|
671
|
+
self, source_columns_meta: pd.DataFrame, target_columns_meta: pd.DataFrame
|
|
572
672
|
) -> tuple[pd.DataFrame, list, list]:
|
|
573
673
|
"""Find common columns between source and target and return unique columns for each"""
|
|
574
674
|
|
|
@@ -576,8 +676,10 @@ class DataQualityComparator:
|
|
|
576
676
|
target_columns = target_columns_meta['column_name'].tolist()
|
|
577
677
|
|
|
578
678
|
common_columns = pd.merge(
|
|
579
|
-
source_columns_meta,
|
|
580
|
-
|
|
679
|
+
source_columns_meta,
|
|
680
|
+
target_columns_meta,
|
|
681
|
+
on='column_name',
|
|
682
|
+
suffixes=('_source', '_target'),
|
|
581
683
|
)
|
|
582
684
|
|
|
583
685
|
source_set = set(source_columns)
|
|
@@ -588,13 +690,9 @@ class DataQualityComparator:
|
|
|
588
690
|
|
|
589
691
|
return common_columns, source_unique, target_unique
|
|
590
692
|
|
|
591
|
-
def _validate_inputs(
|
|
592
|
-
self,
|
|
593
|
-
source: DataReference,
|
|
594
|
-
target: DataReference
|
|
595
|
-
):
|
|
693
|
+
def _validate_inputs(self, source: DataReference, target: DataReference):
|
|
596
694
|
"""Validate input parameters"""
|
|
597
695
|
if not isinstance(source, DataReference):
|
|
598
|
-
raise TypeError(
|
|
696
|
+
raise TypeError('source must be a DataReference')
|
|
599
697
|
if not isinstance(target, DataReference):
|
|
600
|
-
raise TypeError(
|
|
698
|
+
raise TypeError('target must be a DataReference')
|