xoverrr 1.1.4-py3-none-any.whl → 1.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xoverrr/__init__.py +8 -12
- xoverrr/adapters/__init__.py +7 -2
- xoverrr/adapters/base.py +61 -32
- xoverrr/adapters/clickhouse.py +64 -35
- xoverrr/adapters/oracle.py +67 -38
- xoverrr/adapters/postgres.py +67 -35
- xoverrr/constants.py +4 -4
- xoverrr/core.py +299 -197
- xoverrr/exceptions.py +8 -1
- xoverrr/logger.py +4 -2
- xoverrr/models.py +11 -5
- xoverrr/utils.py +331 -259
- {xoverrr-1.1.4.dist-info → xoverrr-1.1.6.dist-info}/METADATA +67 -71
- xoverrr-1.1.6.dist-info/RECORD +17 -0
- {xoverrr-1.1.4.dist-info → xoverrr-1.1.6.dist-info}/WHEEL +1 -1
- xoverrr-1.1.4.dist-info/RECORD +0 -17
- {xoverrr-1.1.4.dist-info → xoverrr-1.1.6.dist-info}/licenses/LICENSE +0 -0
- {xoverrr-1.1.4.dist-info → xoverrr-1.1.6.dist-info}/top_level.txt +0 -0
xoverrr/core.py
CHANGED
@@ -1,41 +1,21 @@
+from typing import Dict, List, Optional, Tuple, Union
 
-import sys
-from enum import Enum, auto
-from typing import Optional, List, Dict, Callable, Union, Tuple, Any
 import pandas as pd
 from sqlalchemy.engine import Engine
-from .models import (
-    DBMSType,
-    DataReference,
-    ObjectType
-)
-
-from .logger import app_logger
 
+from . import constants as ct
+from .adapters.base import BaseDatabaseAdapter
+from .adapters.clickhouse import ClickHouseAdapter
 from .adapters.oracle import OracleAdapter
 from .adapters.postgres import PostgresAdapter
-from .
-from .
-
-from . import
-
-
-
-
-)
-from .utils import (
-    prepare_dataframe,
-    compare_dataframes,
-    clean_recently_changed_data,
-    generate_comparison_sample_report,
-    generate_comparison_count_report,
-    cross_fill_missing_dates,
-    validate_dataframe_size,
-    ComparisonStats,
-    ComparisonDiffDetails
-)
-
-
+from .exceptions import DQCompareException, MetadataError
+from .logger import app_logger
+from .models import DataReference, DBMSType, ObjectType
+from .utils import (ComparisonDiffDetails, ComparisonStats,
+                    clean_recently_changed_data, compare_dataframes,
+                    cross_fill_missing_dates, generate_comparison_count_report,
+                    generate_comparison_sample_report, normalize_column_names,
+                    prepare_dataframe, validate_dataframe_size)
 
 
 class DataQualityComparator:
@@ -48,7 +28,7 @@ class DataQualityComparator:
         source_engine: Engine,
         target_engine: Engine,
         default_exclude_recent_hours: Optional[int] = 24,
-        timezone: str = ct.DEFAULT_TZ
+        timezone: str = ct.DEFAULT_TZ,
     ):
         self.source_engine = source_engine
         self.target_engine = target_engine
@@ -63,7 +43,12 @@ class DataQualityComparator:
             DBMSType.CLICKHOUSE: ClickHouseAdapter(),
         }
         self._reset_stats()
+        from . import __version__
+
         app_logger.info('start')
+        app_logger.info(f'Version: v{__version__}')
+        app_logger.info(f'Source DB: {self.source_db_type.name}')
+        app_logger.info(f'Target DB: {self.target_db_type.name}')
 
     def reset_stats(self):
         self._reset_stats()
@@ -74,17 +59,19 @@ class DataQualityComparator:
             ct.COMPARISON_SUCCESS: 0,
             ct.COMPARISON_FAILED: 0,
             ct.COMPARISON_SKIPPED: 0,
-            'tables_success'
-            'tables_failed'
+            'tables_success': set(),
+            'tables_failed': set(),
             'tables_skipped': set(),
             'start_time': pd.Timestamp.now().strftime(ct.DATETIME_FORMAT),
-            'end_time': None
+            'end_time': None,
         }
 
-    def _update_stats(self, status: str, source_table:DataReference):
+    def _update_stats(self, status: str, source_table: DataReference):
         """Update comparison statistics"""
         self.comparison_stats[status] += 1
-        self.comparison_stats['end_time'] = pd.Timestamp.now().strftime(
+        self.comparison_stats['end_time'] = pd.Timestamp.now().strftime(
+            ct.DATETIME_FORMAT
+        )
         if source_table:
             match status:
                 case ct.COMPARISON_SUCCESS:
@@ -101,7 +88,7 @@ class DataQualityComparator:
         date_column: Optional[str] = None,
         date_range: Optional[Tuple[str, str]] = None,
         tolerance_percentage: float = 0.0,
-        max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES
+        max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
     ) -> Tuple[str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
 
         self._validate_inputs(source_table, target_table)
@@ -111,17 +98,21 @@ class DataQualityComparator:
         try:
             self.comparison_stats['compared'] += 1
 
-
             status, report, stats, details = self._compare_counts(
-
-
+                source_table,
+                target_table,
+                date_column,
+                start_date,
+                end_date,
+                tolerance_percentage,
+                max_examples,
             )
 
             self._update_stats(status, source_table)
             return status, report, stats, details
 
         except Exception as e:
-            app_logger.exception(f
+            app_logger.exception(f'Count comparison failed: {str(e)}')
             status = ct.COMPARISON_FAILED
             self._update_stats(status, source_table)
             return status, None, None, None
@@ -138,25 +129,25 @@ class DataQualityComparator:
         custom_primary_key: Optional[List[str]] = None,
         tolerance_percentage: float = 0.0,
         exclude_recent_hours: Optional[int] = None,
-        max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES
+        max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
     ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
         """
         Compare data from custom queries with specified key columns
 
         Parameters:
-        source_table: `DataReference`
+        source_table: `DataReference`
            source table to compare
         target_table: `DataReference`
            target table to compare
         custom_primary_key : `List[str]`
            List of primary key columns for comparison.
-        exclude_columns : `Optional[List[str]] = None`
+        exclude_columns : `Optional[List[str]] = None`
            Columns to exclude from comparison.
-        include_columns : `Optional[List[str]] = None`
+        include_columns : `Optional[List[str]] = None`
            Columns to include from comparison (default all cols)
-        tolerance_percentage : `float`
+        tolerance_percentage : `float`
            Tolerance percentage for discrepancies.
-        max_examples
+        max_examples
            Maximum number of discrepancy examples per column
        """
        self._validate_inputs(source_table, target_table)
@@ -164,35 +155,51 @@ class DataQualityComparator:
        exclude_hours = exclude_recent_hours or self.default_exclude_recent_hours
 
        start_date, end_date = date_range or (None, None)
-        exclude_cols = exclude_columns or []
-        custom_keys =
-
+        exclude_cols = normalize_column_names(exclude_columns or [])
+        custom_keys = (
+            normalize_column_names(custom_primary_key or [])
+            if custom_primary_key
+            else None
+        )
+        include_cols = normalize_column_names(include_columns or [])
 
        try:
            self.comparison_stats['compared'] += 1
 
            status, report, stats, details = self._compare_samples(
-
-
-
+                source_table,
+                target_table,
+                date_column,
+                update_column,
+                start_date,
+                end_date,
+                exclude_cols,
+                include_cols,
+                custom_keys,
+                tolerance_percentage,
+                exclude_hours,
+                max_examples,
            )
 
            self._update_stats(status, source_table)
            return status, report, stats, details
 
        except Exception as e:
-            app_logger.exception(f
+            app_logger.exception(f'Sample comparison failed: {str(e)}')
            status = ct.COMPARISON_FAILED
            self._update_stats(status, source_table)
            return status, None, None, None
 
-    def _compare_counts(
-
-
-
-
-
-
+    def _compare_counts(
+        self,
+        source_table: DataReference,
+        target_table: DataReference,
+        date_column: str,
+        start_date: Optional[str],
+        end_date: Optional[str],
+        tolerance_percentage: float,
+        max_examples: int,
+    ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
 
        try:
            source_adapter = self._get_adapter(self.source_db_type)
@@ -201,59 +208,73 @@ class DataQualityComparator:
            source_query, source_params = source_adapter.build_count_query(
                source_table, date_column, start_date, end_date
            )
-            source_counts = self._execute_query(
+            source_counts = self._execute_query(
+                (source_query, source_params), self.source_engine, self.timezone
+            )
 
            target_query, target_params = target_adapter.build_count_query(
                target_table, date_column, start_date, end_date
            )
-            target_counts = self._execute_query(
+            target_counts = self._execute_query(
+                (target_query, target_params), self.target_engine, self.timezone
+            )
 
-            source_counts_filled, target_counts_filled = cross_fill_missing_dates(
-
-
+            source_counts_filled, target_counts_filled = cross_fill_missing_dates(
+                source_counts, target_counts
+            )
 
            merged = source_counts_filled.merge(target_counts_filled, on='dt')
            total_count_source = source_counts_filled['cnt'].sum()
-            total_count_taget =
+            total_count_taget = target_counts_filled['cnt'].sum()
 
-            if (total_count_source, total_count_taget)
+            if (total_count_source, total_count_taget) == (0, 0):
                app_logger.warning('nothing to compare to you')
                status = ct.COMPARISON_SKIPPED
                return status, None, None, None
 
            else:
-
                result_diff_in_counters = abs(merged['cnt_x'] - merged['cnt_y']).sum()
                result_equal_in_counters = merged[['cnt_x', 'cnt_y']].min(axis=1).sum()
 
-                discrepancies_counters_percentage =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                discrepancies_counters_percentage = (
+                    100
+                    * result_diff_in_counters
+                    / (result_diff_in_counters + result_equal_in_counters)
+                )
+                stats, details = compare_dataframes(
+                    source_df=source_counts_filled,
+                    target_df=target_counts_filled,
+                    key_columns=['dt'],
+                    max_examples=max_examples,
+                )
+
+                status = (
+                    ct.COMPARISON_FAILED
+                    if discrepancies_counters_percentage > tolerance_percentage
+                    else ct.COMPARISON_SUCCESS
+                )
+
+                report = generate_comparison_count_report(
+                    source_table.full_name,
+                    target_table.full_name,
+                    stats,
+                    details,
+                    total_count_source,
+                    total_count_taget,
+                    discrepancies_counters_percentage,
+                    result_diff_in_counters,
+                    result_equal_in_counters,
+                    self.timezone,
+                    source_query,
+                    source_params,
+                    target_query,
+                    target_params,
+                )
 
            return status, report, stats, details
 
        except Exception as e:
-            app_logger.error(f
+            app_logger.error(f'Count comparison failed: {str(e)}')
            raise
 
    def _compare_samples(
@@ -267,28 +288,36 @@ class DataQualityComparator:
        exclude_columns: List[str],
        include_columns: List[str],
        custom_key_columns: Optional[List[str]],
-        tolerance_percentage:float,
+        tolerance_percentage: float,
        exclude_recent_hours: Optional[int],
-        max_examples:Optional[int]
+        max_examples: Optional[int],
    ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
 
        try:
            source_object_type = self._get_object_type(source_table, self.source_engine)
            target_object_type = self._get_object_type(target_table, self.target_engine)
-            app_logger.info(
+            app_logger.info(
+                f'object type source: {source_object_type} vs target {target_object_type}'
+            )
 
-            source_columns_meta = self._get_metadata_cols(
+            source_columns_meta = self._get_metadata_cols(
+                source_table, self.source_engine
+            )
            app_logger.info('source_columns meta:\n')
            app_logger.info(source_columns_meta.to_string(index=False))
 
-            target_columns_meta = self._get_metadata_cols(
+            target_columns_meta = self._get_metadata_cols(
+                target_table, self.target_engine
+            )
            app_logger.info('target_columns meta:\n')
            app_logger.info(target_columns_meta.to_string(index=False))
 
-            intersect = list(set(include_columns)&set(exclude_columns))
+            intersect = list(set(include_columns) & set(exclude_columns))
            if intersect:
-                app_logger.warning(
-
+                app_logger.warning(
+                    f'Intersection columns between Include and exclude: {",".join(intersect)}'
+                )
+
            key_columns = None
 
            if custom_key_columns:
@@ -296,30 +325,55 @@ class DataQualityComparator:
                source_cols = source_columns_meta['column_name'].tolist()
                target_cols = target_columns_meta['column_name'].tolist()
 
-                missing_in_source = [
-
+                missing_in_source = [
+                    col for col in custom_key_columns if col not in source_cols
+                ]
+                missing_in_target = [
+                    col for col in custom_key_columns if col not in target_cols
+                ]
 
                if missing_in_source:
-                    raise MetadataError(
+                    raise MetadataError(
+                        f'Custom key columns missing in source: {missing_in_source}'
+                    )
                if missing_in_target:
-                    raise MetadataError(
+                    raise MetadataError(
+                        f'Custom key columns missing in target: {missing_in_target}'
+                    )
            else:
-                source_pk =
-
-
-
-
-
-
-
+                source_pk = (
+                    self._get_metadata_pk(source_table, self.source_engine)
+                    if source_object_type == ObjectType.TABLE
+                    else pd.DataFrame({'pk_column_name': []})
+                )
+                target_pk = (
+                    self._get_metadata_pk(target_table, self.target_engine)
+                    if target_object_type == ObjectType.TABLE
+                    else pd.DataFrame({'pk_column_name': []})
+                )
+
+                if (
+                    source_pk['pk_column_name'].tolist()
+                    != target_pk['pk_column_name'].tolist()
+                ):
+                    app_logger.warning(
+                        f'Primary keys differ: source={source_pk["pk_column_name"].tolist()}, target={target_pk["pk_column_name"].tolist()}'
+                    )
+                key_columns = (
+                    source_pk['pk_column_name'].tolist()
+                    or target_pk['pk_column_name'].tolist()
+                )
                if not key_columns:
-                    raise MetadataError(
+                    raise MetadataError(
+                        f'Primary key not found in the source neither in the target and not provided'
+                    )
 
            if include_columns:
-
                if not set(include_columns) & set(key_columns):
-                    app_logger.warning(
-
+                    app_logger.warning(
+                        f'The primary key was not included in the column list.\
+                        The key column was included in the resulting query automatically. PK:{key_columns}'
+                    )
 
                include_columns = list(set(include_columns + key_columns))
 
@@ -329,12 +383,13 @@ class DataQualityComparator:
                target_columns_meta = target_columns_meta[
                    target_columns_meta['column_name'].isin(include_columns)
                ]
-
-            if exclude_columns:
 
+            if exclude_columns:
                if set(exclude_columns) & set(key_columns):
-                    app_logger.warning(
-
+                    app_logger.warning(
+                        f'The primary key has been excluded from the column list.\
+                        However, the key column must be present in the resulting query.s PK:{key_columns}'
+                    )
 
                exclude_columns = list(set(exclude_columns) - set(key_columns))
 
@@ -345,63 +400,87 @@ class DataQualityComparator:
                    ~target_columns_meta['column_name'].isin(exclude_columns)
                ]
 
-            common_cols_df, source_only_cols, target_only_cols =
+            common_cols_df, source_only_cols, target_only_cols = (
+                self._analyze_columns_meta(source_columns_meta, target_columns_meta)
+            )
            common_cols = common_cols_df['column_name'].tolist()
 
            if not common_cols:
-                raise MetadataError(
-
+                raise MetadataError(
+                    f'No one column to compare, need to check tables or reduce the exclude_columns list: {",".join(exclude_columns)}'
+                )
+
            source_data, source_query, source_params = self._get_table_data(
-                self.source_engine,
-
+                self.source_engine,
+                source_table,
+                source_columns_meta,
+                common_cols,
+                date_column,
+                update_column,
+                start_date,
+                end_date,
+                exclude_recent_hours,
            )
 
            target_data, target_query, target_params = self._get_table_data(
-                self.target_engine,
-
+                self.target_engine,
+                target_table,
+                target_columns_meta,
+                common_cols,
+                date_column,
+                update_column,
+                start_date,
+                end_date,
+                exclude_recent_hours,
            )
            status = None
-            #special case
+            # special case
            if target_data.empty and source_data.empty:
                status = ct.COMPARISON_SKIPPED
                return status, None, None, None
            elif source_data.empty or target_data.empty:
-                raise DQCompareException(
-
+                raise DQCompareException(
+                    f'Nothing to compare, rows returned from source: {len(source_data)}, from target: {len(target_data)}'
+                )
 
            source_data = prepare_dataframe(source_data)
            target_data = prepare_dataframe(target_data)
            if update_column and exclude_recent_hours:
-                source_data, target_data = clean_recently_changed_data(
-
+                source_data, target_data = clean_recently_changed_data(
+                    source_data, target_data, key_columns
+                )
 
            stats, details = compare_dataframes(
-                source_data, target_data,
-                key_columns, max_examples
+                source_data, target_data, key_columns, max_examples
            )
 
            if stats:
                details.skipped_source_columns = source_only_cols
                details.skipped_target_columns = target_only_cols
 
-                report = generate_comparison_sample_report(
-
-
-
-
-
-
-
-
-
-
+                report = generate_comparison_sample_report(
+                    source_table.full_name,
+                    target_table.full_name,
+                    stats,
+                    details,
+                    self.timezone,
+                    source_query,
+                    source_params,
+                    target_query,
+                    target_params,
+                )
+                status = (
+                    ct.COMPARISON_FAILED
+                    if stats.final_diff_score > tolerance_percentage
+                    else ct.COMPARISON_SUCCESS
+                )
                return status, report, stats, details
            else:
                status = ct.COMPARISON_SKIPPED
                return status, None, None, None
 
        except Exception as e:
-            app_logger.error(f
+            app_logger.error(f'Sample comparison failed: {str(e)}')
            raise
 
    def compare_custom_query(
@@ -413,25 +492,25 @@ class DataQualityComparator:
        custom_primary_key: List[str],
        exclude_columns: Optional[List[str]] = None,
        tolerance_percentage: float = 0.0,
-        max_examples:Optional[int] = ct.DEFAULT_MAX_EXAMPLES
+        max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
    ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
        """
        Compare data from custom queries with specified key columns
 
        Parameters:
-        source_query : Union[str, Tuple[str, Dict]]
+        source_query : Union[str, Tuple[str, Dict]]
            Source query (can be string or tuple with query and params).
        target_query : Union[str, Tuple[str, Dict]]
            Target query (can be string or tuple with query and params).
        custom_primary_key : List[str]
            List of primary key columns for comparison.
-        exclude_columns : Optional[List[str]] = None
+        exclude_columns : Optional[List[str]] = None
            Columns to exclude from comparison.
-        tolerance_percentage : float
+        tolerance_percentage : float
            Tolerance percentage for discrepancies.
        max_examples: int
-            Maximum number of discrepancy examples per column
-
+            Maximum number of discrepancy examples per column
+
        Returns:
        ----------
        Tuple[str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]
@@ -444,8 +523,12 @@ class DataQualityComparator:
            self.comparison_stats['compared'] += 1
 
            # Execute queries
-            source_data = self._execute_query(
-
+            source_data = self._execute_query(
+                (source_query, source_params), source_engine, timezone
+            )
+            target_data = self._execute_query(
+                (target_query, target_params), target_engine, timezone
+            )
            app_logger.info('preparing source dataframe')
            source_data_prepared = prepare_dataframe(source_data)
            app_logger.info('preparing target dataframe')
@@ -453,43 +536,60 @@ class DataQualityComparator:
 
            # Exclude columns if specified
            exclude_cols = exclude_columns or []
-            common_cols = [
-
+            common_cols = [
+                col
+                for col in source_data_prepared.columns
+                if col in target_data_prepared.columns and col not in exclude_cols
+            ]
 
            source_data_filtered = source_data_prepared[common_cols]
            target_data_filtered = target_data_prepared[common_cols]
            if 'xrecently_changed' in common_cols:
-                source_data_filtered, target_data_filtered =
+                source_data_filtered, target_data_filtered = (
+                    clean_recently_changed_data(
+                        source_data_filtered, target_data_filtered, custom_primary_key
+                    )
+                )
            # Compare dataframes
            stats, details = compare_dataframes(
-                source_data_filtered,
+                source_data_filtered,
+                target_data_filtered,
+                custom_primary_key,
+                max_examples,
            )
 
            if stats:
-                report = generate_comparison_sample_report(
-
-
-
-
-
-
-
-
-
-
+                report = generate_comparison_sample_report(
+                    None,
+                    None,
+                    stats,
+                    details,
+                    self.timezone,
+                    source_query,
+                    source_params,
+                    target_query,
+                    target_params,
+                )
+                status = (
+                    ct.COMPARISON_FAILED
+                    if stats.final_diff_score > tolerance_percentage
+                    else ct.COMPARISON_SUCCESS
+                )
            else:
                status = ct.COMPARISON_SKIPPED
 
-
            self._update_stats(status, None)
            return status, report, stats, details
 
        except Exception as e:
-            app_logger.exception(
+            app_logger.exception('Custom query comparison failed')
            status = ct.COMPARISON_FAILED
            self._update_stats(status, None)
            return status, None, None, None
-
+
+    def _get_metadata_cols(
+        self, data_ref: DataReference, engine: Engine
+    ) -> pd.DataFrame:
        """Get metadata with proper source handling"""
        adapter = self._get_adapter(DBMSType.from_engine(engine))
 
@@ -497,13 +597,12 @@ class DataQualityComparator:
        columns_meta = self._execute_query((query, params), engine)
 
        if columns_meta.empty:
-            raise ValueError(f
+            raise ValueError(f'Failed to get metadata for: {data_ref.full_name}')
 
        return columns_meta
 
    def _get_metadata_pk(self, data_ref: DataReference, engine: Engine) -> pd.DataFrame:
-        """Get metadata with proper source handling
-        """
+        """Get metadata with proper source handling"""
        adapter = self._get_adapter(DBMSType.from_engine(engine))
 
        query, params = adapter.build_primary_key_query(data_ref)
@@ -527,19 +626,24 @@ class DataQualityComparator:
        update_column: str,
        start_date: Optional[str],
        end_date: Optional[str],
-        exclude_recent_hours: Optional[int]
-    ) -> Tuple[pd.DataFrame, str, Dict]
+        exclude_recent_hours: Optional[int],
+    ) -> Tuple[pd.DataFrame, str, Dict]:
        """Retrieve and prepare table data"""
        db_type = DBMSType.from_engine(engine)
        adapter = self._get_adapter(db_type)
        app_logger.info(db_type)
 
        query, params = adapter.build_data_query_common(
-            data_ref,
-
+            data_ref,
+            columns,
+            date_column,
+            update_column,
+            start_date,
+            end_date,
+            exclude_recent_hours,
        )
 
-        df = self._execute_query((query,params), engine, self.timezone)
+        df = self._execute_query((query, params), engine, self.timezone)
 
        # Apply type conversions
        df = adapter.convert_types(df, metadata, self.timezone)
@@ -551,9 +655,11 @@ class DataQualityComparator:
        try:
            return self.adapters[db_type]
        except KeyError:
-            raise ValueError(f
+            raise ValueError(f'No adapter available for {db_type}')
 
-    def _execute_query(
+    def _execute_query(
+        self, query: Union[str, Tuple[str, Dict]], engine: Engine, timezone: str = None
+    ) -> pd.DataFrame:
        """Execute SQL query using appropriate adapter"""
        db_type = DBMSType.from_engine(engine)
        adapter = self._get_adapter(db_type)
@@ -562,9 +668,7 @@ class DataQualityComparator:
        return df
 
    def _analyze_columns_meta(
-        self,
-        source_columns_meta: pd.DataFrame,
-        target_columns_meta: pd.DataFrame
+        self, source_columns_meta: pd.DataFrame, target_columns_meta: pd.DataFrame
    ) -> tuple[pd.DataFrame, list, list]:
        """Find common columns between source and target and return unique columns for each"""
 
@@ -572,8 +676,10 @@ class DataQualityComparator:
        target_columns = target_columns_meta['column_name'].tolist()
 
        common_columns = pd.merge(
-            source_columns_meta,
-
+            source_columns_meta,
+            target_columns_meta,
+            on='column_name',
+            suffixes=('_source', '_target'),
        )
 
        source_set = set(source_columns)
@@ -584,13 +690,9 @@ class DataQualityComparator:
 
        return common_columns, source_unique, target_unique
 
-    def _validate_inputs(
-        self,
-        source: DataReference,
-        target: DataReference
-    ):
+    def _validate_inputs(self, source: DataReference, target: DataReference):
        """Validate input parameters"""
        if not isinstance(source, DataReference):
-            raise TypeError(
+            raise TypeError('source must be a DataReference')
        if not isinstance(target, DataReference):
-            raise TypeError(
+            raise TypeError('target must be a DataReference')