xoverrr-1.1.5-py3-none-any.whl → xoverrr-1.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xoverrr/__init__.py +8 -12
- xoverrr/adapters/__init__.py +7 -2
- xoverrr/adapters/base.py +61 -32
- xoverrr/adapters/clickhouse.py +62 -37
- xoverrr/adapters/oracle.py +65 -36
- xoverrr/adapters/postgres.py +67 -35
- xoverrr/constants.py +4 -4
- xoverrr/core.py +296 -198
- xoverrr/exceptions.py +8 -1
- xoverrr/logger.py +4 -2
- xoverrr/models.py +11 -5
- xoverrr/utils.py +328 -252
- {xoverrr-1.1.5.dist-info → xoverrr-1.1.6.dist-info}/METADATA +3 -3
- xoverrr-1.1.6.dist-info/RECORD +17 -0
- xoverrr-1.1.5.dist-info/RECORD +0 -17
- {xoverrr-1.1.5.dist-info → xoverrr-1.1.6.dist-info}/WHEEL +0 -0
- {xoverrr-1.1.5.dist-info → xoverrr-1.1.6.dist-info}/licenses/LICENSE +0 -0
- {xoverrr-1.1.5.dist-info → xoverrr-1.1.6.dist-info}/top_level.txt +0 -0
xoverrr/utils.py
CHANGED
@@ -1,17 +1,31 @@
-import
-import numpy as np
-from typing import Dict, Any, List, Optional, Tuple, defaultdict
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple, defaultdict
 
+import numpy as np
+import pandas as pd
 
-from .constants import
+from .constants import DATETIME_FORMAT, DEFAULT_MAX_EXAMPLES, NULL_REPLACEMENT
 from .logger import app_logger
 
-
+
+def normalize_column_names(columns: List[str]) -> List[str]:
+    """
+    Normalize column names to lowercase for consistent comparison.
+
+    Parameters:
+        columns: List of column names to normalize
+
+    Returns:
+        List of lowercased column names
+    """
+    return [col.lower() for col in columns] if columns else []
+
 
 @dataclass
 class ComparisonStats:
     """Class for storing comparison statistics"""
+
     total_source_rows: int
     total_target_rows: int
 
@@ -28,13 +42,14 @@ class ComparisonStats:
 
     source_only_percentage_rows: float
     target_only_percentage_rows: float
-    total_diff_percentage_rows
+    total_diff_percentage_rows: float
     #
-    max_diff_percentage_cols
+    max_diff_percentage_cols: float
     median_diff_percentage_cols: float
     #
     final_diff_score: float
-    final_score
+    final_score: float
+
 
 @dataclass
 class ComparisonDiffDetails:
@@ -47,16 +62,14 @@ class ComparisonDiffDetails:
     source_only_keys_examples: tuple
     target_only_keys_examples: tuple
 
-    discrepant_data_examples:
+    discrepant_data_examples: pd.DataFrame
     common_attribute_columns: List[str]
-    skipped_source_columns: List[str]= field(default_factory=list)
-    skipped_target_columns: List[str]= field(default_factory=list)
+    skipped_source_columns: List[str] = field(default_factory=list)
+    skipped_target_columns: List[str] = field(default_factory=list)
 
 
 def compare_dataframes_meta(
-    df1: pd.DataFrame,
-    df2: pd.DataFrame,
-    primary_keys: List[str] = None
+    df1: pd.DataFrame, df2: pd.DataFrame, primary_keys: List[str] = None
 ) -> List[str]:
     """
     Compare two pandas DataFrames and find common and different columns.
@@ -88,9 +101,12 @@ def compare_dataframes_meta(
 
     return common_columns
 
-def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_keys_cnt, examples_count=3):
 
-
+def analyze_column_discrepancies(
+    df, primary_key_columns, value_columns, common_keys_cnt, examples_count=3
+):
+
+    metrics = {'max_pct': 0.0, 'median_pct': 0.0}
     diff_counters = defaultdict(int)
     diff_examples = {col: [] for col in value_columns}
 
@@ -114,10 +130,11 @@ def analyze_column_discrepancies(
             src_val = getattr(src_row, col)
             trg_val = getattr(trg_row, col)
             if src_val != trg_val:
-
                 diff_counters[col] += 1
                 if len(diff_examples[col]) < examples_count:
-                    diff_examples[col].append(
+                    diff_examples[col].append(
+                        {'pk': pk_value, 'src_val': src_val, 'trg_val': trg_val}
+                    )
 
     # filter out cols without examples
     diff_examples = {k: v for k, v in diff_examples.items() if v}
@@ -127,7 +144,6 @@ def analyze_column_discrepancies(
     metrics['max_pct'] = max_pct
     metrics['median_pct'] = median_pct
 
-
     # transform to dataframes
     # 1
     diff_records = []
@@ -145,8 +161,8 @@ def analyze_column_discrepancies(
     # 2
     df_diff_counters = pd.DataFrame(
         list(diff_counters.items()),  # convert to a list of tuples
-        columns=['column_name', 'mismatch_count'] # rename the columns
-
+        columns=['column_name', 'mismatch_count'],  # rename the columns
+    )
 
     return metrics, df_diff_examples, df_diff_counters
 
@@ -155,7 +171,7 @@ def compare_dataframes(
     source_df: pd.DataFrame,
     target_df: pd.DataFrame,
     key_columns: List[str],
-    max_examples: int = DEFAULT_MAX_EXAMPLES
+    max_examples: int = DEFAULT_MAX_EXAMPLES,
 ) -> tuple[ComparisonStats, ComparisonDiffDetails]:
     """
     Efficient comparison of two dataframes by primary key when discrepancies ratio quite small,
@@ -192,8 +208,12 @@ def compare_dataframes(
     source_dup = source_df[source_df.duplicated(subset=key_columns, keep=False)]
     target_dup = target_df[target_df.duplicated(subset=key_columns, keep=False)]
 
-    source_dup_keys =
-
+    source_dup_keys = (
+        _create_keys_set(source_dup, key_columns) if not source_dup.empty else set()
+    )
+    target_dup_keys = (
+        _create_keys_set(target_dup, key_columns) if not target_dup.empty else set()
+    )
 
     source_dup_keys_examples = format_keys(source_dup_keys, max_examples)
     target_dup_keys_examples = format_keys(target_dup_keys, max_examples)
@@ -214,13 +234,16 @@ def compare_dataframes(
     xor_combined_df = (
         pd.concat([source_clean, target_clean], ignore_index=True)
         .drop_duplicates(subset=key_columns + non_key_columns, keep=False)
-        .assign(
+        .assign(
+            xcount_pairs=lambda df: df.groupby(key_columns)[key_columns[0]].transform(
+                'size'
+            )
+        )
     )
 
     # symmetrical difference between two datasets, sorted
     xor_combined_sorted = xor_combined_df.sort_values(
-        by=key_columns + ['xflg'],
-        ascending=[False] * len(key_columns) + [True]
+        by=key_columns + ['xflg'], ascending=[False] * len(key_columns) + [True]
     )
 
     mask = xor_combined_sorted['xcount_pairs'] > 1
@@ -234,53 +257,66 @@ def compare_dataframes(
     xor_source_only_keys = _create_keys_set(xor_df_source_only, key_columns)
     xor_target_only_keys = _create_keys_set(xor_df_target_only, key_columns)
 
-    xor_common_keys_cnt = int(len(xor_df_multi)/2) if not xor_df_multi.empty else 0
+    xor_common_keys_cnt = int(len(xor_df_multi) / 2) if not xor_df_multi.empty else 0
     xor_source_only_keys_cnt = len(xor_source_only_keys)
     xor_target_only_keys_cnt = len(xor_target_only_keys)
 
     # take n pairs that is why examples x2
-    xor_df_multi_example =
+    xor_df_multi_example = (
+        xor_df_multi.head(max_examples * 2).drop(columns=['xcount_pairs'])
+        if not xor_df_multi.empty
+        else pd.DataFrame()
+    )
 
     xor_source_only_keys_examples = format_keys(xor_source_only_keys, max_examples)
     xor_target_only_keys_examples = format_keys(xor_target_only_keys, max_examples)
 
     # get number of records that present in two datasets based on primary key
-    common_keys_cnt = int(
+    common_keys_cnt = int(
+        (
+            len(source_clean)
+            - xor_source_only_keys_cnt
+            + len(target_clean)
+            - xor_target_only_keys_cnt
+        )
+        / 2
+    )
 
     if not common_keys_cnt:
-        #Special case when there is no matched primary keys at all
+        # Special case when there is no matched primary keys at all
         comparison_stats = ComparisonStats(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            total_source_rows=len(source_df),
+            total_target_rows=len(target_df),
+            dup_source_rows=source_dup_cnt,
+            dup_target_rows=target_dup_cnt,
+            only_source_rows=xor_source_only_keys_cnt,
+            only_target_rows=xor_target_only_keys_cnt,
+            common_pk_rows=0,
+            total_matched_rows=0,
+            #
+            dup_source_percentage_rows=100,
+            dup_target_percentage_rows=100,
+            source_only_percentage_rows=100,
+            target_only_percentage_rows=100,
+            total_diff_percentage_rows=100,
+            #
+            max_diff_percentage_cols=100,
+            median_diff_percentage_cols=100,
+            #
+            final_diff_score=100,
+            final_score=0,
         )
 
         comparison_diff_detais = ComparisonDiffDetails(
-
-
-
-
-
-
-
-
+            mismatches_per_column=pd.DataFrame(),
+            discrepancies_per_col_examples=pd.DataFrame(),
+            dup_source_keys_examples=source_dup_keys_examples,
+            dup_target_keys_examples=target_dup_keys_examples,
+            common_attribute_columns=non_key_columns,
+            source_only_keys_examples=xor_source_only_keys_examples,
+            target_only_keys_examples=xor_target_only_keys_examples,
+            discrepant_data_examples=pd.DataFrame(),
+        )
         app_logger.info('end')
 
         return comparison_stats, comparison_diff_detais
@@ -288,73 +324,79 @@ def compare_dataframes(
     # get number of that totally equal in two datasets
     total_matched_records_cnt = common_keys_cnt - xor_common_keys_cnt
 
-    source_only_percentage = (xor_source_only_keys_cnt/common_keys_cnt)*100
-    target_only_percentage = (xor_target_only_keys_cnt/common_keys_cnt)*100
+    source_only_percentage = (xor_source_only_keys_cnt / common_keys_cnt) * 100
+    target_only_percentage = (xor_target_only_keys_cnt / common_keys_cnt) * 100
 
-    source_dup_percentage = (source_dup_cnt/len(source_df))*100
-    target_dup_percentage = (target_dup_cnt/len(target_df))*100
-
-    diff_col_metrics, \
-    diff_col_examples,\
-    diff_col_counters = analyze_column_discrepancies(xor_df_multi, key_columns, non_key_columns, common_keys_cnt, max_examples)
+    source_dup_percentage = (source_dup_cnt / len(source_df)) * 100
+    target_dup_percentage = (target_dup_cnt / len(target_df)) * 100
 
+    diff_col_metrics, diff_col_examples, diff_col_counters = (
+        analyze_column_discrepancies(
+            xor_df_multi, key_columns, non_key_columns, common_keys_cnt, max_examples
+        )
+    )
 
-    source_and_target_total_diff_percentage = (
+    source_and_target_total_diff_percentage = (
+        1 - total_matched_records_cnt / common_keys_cnt
+    ) * 100
 
-    final_diff_score =
-
-
+    final_diff_score = (
+        source_dup_percentage * 0.1
+        + target_dup_percentage * 0.1
+        + source_only_percentage * 0.15
+        + target_only_percentage * 0.15
+        + source_and_target_total_diff_percentage * 0.5
+    )
 
     comparison_stats = ComparisonStats(
-        total_source_rows
-        total_target_rows
-        dup_source_rows
-        dup_target_rows
-        only_source_rows
-        only_target_rows
-        common_pk_rows
-        total_matched_rows=
+        total_source_rows=len(source_df),
+        total_target_rows=len(target_df),
+        dup_source_rows=source_dup_cnt,
+        dup_target_rows=target_dup_cnt,
+        only_source_rows=xor_source_only_keys_cnt,
+        only_target_rows=xor_target_only_keys_cnt,
+        common_pk_rows=common_keys_cnt,
+        total_matched_rows=total_matched_records_cnt,
        #
-        dup_source_percentage_rows
-        dup_target_percentage_rows
-        source_only_percentage_rows
-        target_only_percentage_rows
-        total_diff_percentage_rows
+        dup_source_percentage_rows=source_dup_percentage,
+        dup_target_percentage_rows=target_dup_percentage,
+        source_only_percentage_rows=source_only_percentage,
+        target_only_percentage_rows=target_only_percentage,
+        total_diff_percentage_rows=source_and_target_total_diff_percentage,
        #
-        max_diff_percentage_cols
-        median_diff_percentage_cols
+        max_diff_percentage_cols=diff_col_metrics['max_pct'],
+        median_diff_percentage_cols=diff_col_metrics['median_pct'],
        #
-        final_diff_score
-        final_score
-
+        final_diff_score=final_diff_score,
+        final_score=100 - final_diff_score,
+    )
 
     comparison_diff_detais = ComparisonDiffDetails(
-        mismatches_per_column
-        discrepancies_per_col_examples
-        dup_source_keys_examples
-        dup_target_keys_examples
-        source_only_keys_examples
-        target_only_keys_examples
-        discrepant_data_examples
-        common_attribute_columns=non_key_columns
+        mismatches_per_column=diff_col_counters,
+        discrepancies_per_col_examples=diff_col_examples,
+        dup_source_keys_examples=source_dup_keys_examples,
+        dup_target_keys_examples=target_dup_keys_examples,
+        source_only_keys_examples=xor_source_only_keys_examples,
+        target_only_keys_examples=xor_target_only_keys_examples,
+        discrepant_data_examples=xor_df_multi_example,
+        common_attribute_columns=non_key_columns,
+    )
 
     app_logger.info('end')
     return comparison_stats, comparison_diff_detais
 
 
 def _validate_input_data(
-    source_df: pd.DataFrame,
-    target_df: pd.DataFrame,
-    key_columns: List[str]
+    source_df: pd.DataFrame, target_df: pd.DataFrame, key_columns: List[str]
 ) -> None:
     """Input data validation"""
     if not all(col in source_df.columns for col in key_columns):
         missing = [col for col in key_columns if col not in source_df.columns]
-        raise ValueError(f
+        raise ValueError(f'Key columns missing in source: {missing}')
 
     if not all(col in target_df.columns for col in key_columns):
         missing = [col for col in key_columns if col not in target_df.columns]
-        raise ValueError(f
+        raise ValueError(f'Key columns missing in target: {missing}')
 
 
 def _create_keys_set(df: pd.DataFrame, key_columns: List[str]) -> set:
@@ -362,154 +404,173 @@ def _create_keys_set(df: pd.DataFrame, key_columns: List[str]) -> set:
     return set(df[key_columns].itertuples(index=False, name=None))
 
 
-def generate_comparison_sample_report(
-
-
-
-
-
-
-
-
+def generate_comparison_sample_report(
+    source_table: str,
+    target_table: str,
+    stats: ComparisonStats,
+    details: ComparisonDiffDetails,
+    timezone: str,
+    source_query: str = None,
+    source_params: Dict = None,
+    target_query: str = None,
+    target_params: Dict = None,
+) -> None:
     """Generate comparison report (logger output looks uuugly)"""
     rl = []
-    rl.append(
+    rl.append('=' * 80)
     current_datetime = datetime.now()
     rl.append(current_datetime.strftime(DATETIME_FORMAT))
-    rl.append(f
-    if source_table and target_table:
-        rl.append(f
-        rl.append(f
-        rl.append(f
-    rl.append(
+    rl.append(f'DATA SAMPLE COMPARISON REPORT: ')
+    if source_table and target_table:  # empty for custom query
+        rl.append(f'{source_table}')
+        rl.append(f'VS')
+        rl.append(f'{target_table}')
+    rl.append('=' * 80)
 
     if source_query and target_query:
-        rl.append(f
-        rl.append(f
+        rl.append(f'timezone: {timezone}')
+        rl.append(f'  {source_query}')
         if source_params:
-            rl.append(f
-        rl.append(
-        rl.append(f
+            rl.append(f'  params: {source_params}')
+        rl.append('-' * 40)
+        rl.append(f'  {target_query}')
         if target_params:
-            rl.append(f
-
-    rl.append(
-
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-
-
-    rl.append(f
-
-
-    rl.append(f
-
-
-
-
-    rl.append(f
+            rl.append(f'  params: {target_params}')
+
+    rl.append('-' * 40)
+
+    rl.append(f'\nSUMMARY:')
+    rl.append(f'  Source rows: {stats.total_source_rows}')
+    rl.append(f'  Target rows: {stats.total_target_rows}')
+    rl.append(f'  Duplicated source rows: {stats.dup_source_rows}')
+    rl.append(f'  Duplicated target rows: {stats.dup_target_rows}')
+    rl.append(f'  Only source rows: {stats.only_source_rows}')
+    rl.append(f'  Only target rows: {stats.only_target_rows}')
+    rl.append(f'  Common rows (by primary key): {stats.common_pk_rows}')
+    rl.append(f'  Totally matched rows: {stats.total_matched_rows}')
+    rl.append('-' * 40)
+    rl.append(f'  Source only rows %: {stats.source_only_percentage_rows:.5f}')
+    rl.append(f'  Target only rows %: {stats.target_only_percentage_rows:.5f}')
+    rl.append(f'  Duplicated source rows %: {stats.dup_source_percentage_rows:.5f}')
+    rl.append(f'  Duplicated target rows %: {stats.dup_target_percentage_rows:.5f}')
+    rl.append(f'  Mismatched rows %: {stats.total_diff_percentage_rows:.5f}')
+    rl.append(f'  Final discrepancies score: {stats.final_diff_score:.5f}')
+    rl.append(f'  Final data quality score: {stats.final_score:.5f}')
+
+    rl.append(f'  Source-only key examples: {details.source_only_keys_examples}')
+    rl.append(f'  Target-only key examples: {details.target_only_keys_examples}')
+
+    rl.append(f'  Duplicated source key examples: {details.dup_source_keys_examples}')
+    rl.append(f'  Duplicated target key examples: {details.dup_target_keys_examples}')
+
+    rl.append(
+        f'  Common attribute columns: {", ".join(details.common_attribute_columns)}'
+    )
+    rl.append(f'  Skipped source columns: {", ".join(details.skipped_source_columns)}')
+    rl.append(f'  Skipped target columns: {", ".join(details.skipped_target_columns)}')
 
     if stats.max_diff_percentage_cols > 0 and not details.mismatches_per_column.empty:
-        rl.append(f
+        rl.append(f'\nCOLUMN DIFFERENCES:')
 
-        rl.append(
-
+        rl.append(
+            f'  Discrepancies per column (max %): {stats.max_diff_percentage_cols:.5f}'
+        )
+        rl.append(f'  Count of mismatches per column:\n')
         rl.append(details.mismatches_per_column.to_string(index=False))
 
-        rl.append(f
-        rl.append
-
+        rl.append(f'  Some examples:\n')
+        rl.append(
+            details.discrepancies_per_col_examples.to_string(
+                index=False, max_colwidth=64, justify='left'
+            )
+        )
 
     # Display sample data if available
-    if
-
-
-
-        rl.append(
-        rl.append(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if (
+        details.discrepant_data_examples is not None
+        and not details.discrepant_data_examples.empty
+    ):
+        rl.append(f'\nDISCREPANT DATA (first pairs):')
+        rl.append('Sorted by primary key and dataset:')
+        rl.append(f'\n')
+        rl.append(
+            details.discrepant_data_examples.to_string(
+                index=False, max_colwidth=64, justify='left'
+            )
+        )
+        rl.append(f'\n')
+
+    rl.append('=' * 80)
+
+    return '\n'.join(rl)
+
+
+def generate_comparison_count_report(
+    source_table: str,
+    target_table: str,
+    stats: ComparisonStats,
+    details: ComparisonDiffDetails,
+    total_source_count: int,
+    total_target_count: int,
+    discrepancies_counters_percentage: int,
+    result_diff_in_counters: int,
+    result_equal_in_counters: int,
+    timezone: str,
+    source_query: str = None,
+    source_params: Dict = None,
+    target_query: str = None,
+    target_params: Dict = None,
+) -> None:
     """Generates comparison report (logger output looks uuugly)"""
     rl = []
-    rl.append(
+    rl.append('=' * 80)
     current_datetime = datetime.now()
     rl.append(current_datetime.strftime(DATETIME_FORMAT))
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(
+    rl.append(f'COUNT COMPARISON REPORT:')
+    rl.append(f'{source_table}')
+    rl.append(f'VS')
+    rl.append(f'{target_table}')
+    rl.append('=' * 80)
 
     if source_query and target_query:
-        rl.append(f
-        rl.append(f
+        rl.append(f'timezone: {timezone}')
+        rl.append(f'  {source_query}')
         if source_params:
-            rl.append(f
-        rl.append(
-        rl.append(f
+            rl.append(f'  params: {source_params}')
+        rl.append('-' * 40)
+        rl.append(f'  {target_query}')
         if target_params:
-            rl.append(f
-        rl.append(
-
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(
-
-
-
-    rl.append(f
-    rl.append
-
-    )
+            rl.append(f'  params: {target_params}')
+        rl.append('-' * 40)
+
+    rl.append(f'\nSUMMARY:')
+    rl.append(f'  Source total count: {total_source_count}')
+    rl.append(f'  Target total count: {total_target_count}')
+    rl.append(f'  Common total count: {result_equal_in_counters}')
+    rl.append(f'  Diff total count: {result_diff_in_counters}')
+    rl.append(f'  Discrepancies percentage: {discrepancies_counters_percentage:.5f}%')
+    rl.append(f'  Final discrepancies score: {discrepancies_counters_percentage:.5f}')
+    rl.append(
+        f'  Final data quality score: {(100 - discrepancies_counters_percentage):.5f}'
+    )
+    if not details.mismatches_per_column.empty:
+        rl.append(f'\nDETAIL DIFFERENCES:')
+        rl.append(details.mismatches_per_column.to_string(index=False))
 
     # Display sample data if available
-    if
-
-
-
+    if (
+        details.discrepant_data_examples is not None
+        and not details.discrepant_data_examples.empty
+    ):
+        rl.append(f'\nDISCREPANT DATA (first pairs):')
+        rl.append('Sorted by primary key and dataset:')
+        rl.append(f'\n')
         rl.append(details.discrepant_data_examples.to_string(index=False))
-        rl.append(f
-        rl.append(
+        rl.append(f'\n')
+    rl.append('=' * 80)
+
+    return '\n'.join(rl)
 
-    return "\n".join(rl)
 
 def safe_remove_zeros(x):
     if pd.isna(x):
@@ -518,11 +579,11 @@ def safe_remove_zeros(x):
         return int(x)
     return x
 
+
 def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Prepare DataFrame for comparison by handling nulls and empty strings"""
     df = df.map(safe_remove_zeros)
 
-
    df = df.fillna(NULL_REPLACEMENT)
    df = df.replace(r'(?i)^(None|nan|NaN|NaT|\s*)$', NULL_REPLACEMENT, regex=True)
 
@@ -530,26 +591,36 @@ def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
 
     return df
 
+
 def exclude_by_keys(df, key_columns, exclude_set):
     if len(key_columns) == 1:
         exclude_values = [x[0] for x in exclude_set]
         return df[~df[key_columns[0]].isin(exclude_values)]
     else:
-        return df[
+        return df[
+            ~df.apply(
+                lambda row: tuple(row[col] for col in key_columns) in exclude_set,
+                axis=1,
+            )
+        ]
 
 
-def clean_recently_changed_data(
+def clean_recently_changed_data(
+    df1: pd.DataFrame, df2: pd.DataFrame, primary_keys: List[str]
+):
     """
     Mutually removes rows with recently changed records
 
     Parameters:
     df1, df2: pandas.DataFrame
-    primary_keys: list
+    primary_keys: list
 
     Returns:
     tuple: (df1_processed, df2_processed)
     """
-    app_logger.info(
+    app_logger.info(
+        f'before exclusion recently changed rows source: {len(df1)}, target {len(df2)}'
+    )
 
     filtered_df1 = df1.copy()
     filtered_df2 = df2.copy()
@@ -557,21 +628,26 @@ def clean_recently_changed_data(
     filtered_df1 = filtered_df1.loc[filtered_df1['xrecently_changed'] == 'y']
     filtered_df2 = filtered_df2.loc[filtered_df2['xrecently_changed'] == 'y']
 
-    excluded_from_df1_keys = _create_keys_set(filtered_df1,primary_keys)
-    excluded_from_df2_keys = _create_keys_set(filtered_df2,primary_keys)
+    excluded_from_df1_keys = _create_keys_set(filtered_df1, primary_keys)
+    excluded_from_df2_keys = _create_keys_set(filtered_df2, primary_keys)
 
     excluded_keys = excluded_from_df1_keys | excluded_from_df2_keys
-    df1_processed = exclude_by_keys(df1, primary_keys, excluded_keys).drop(
-
+    df1_processed = exclude_by_keys(df1, primary_keys, excluded_keys).drop(
+        'xrecently_changed', axis=1
+    )
+    df2_processed = exclude_by_keys(df2, primary_keys, excluded_keys).drop(
+        'xrecently_changed', axis=1
+    )
 
-    app_logger.info(
+    app_logger.info(
+        f'after exclusion recently changed rows source: {len(df1_processed)}, target {len(df2_processed)}'
+    )
 
     return df1_processed, df2_processed
 
 
 def find_count_discrepancies(
-    source_counts: pd.DataFrame,
-    target_counts: pd.DataFrame
+    source_counts: pd.DataFrame, target_counts: pd.DataFrame
 ) -> pd.DataFrame:
     """Find discrepancies in daily row counts between source and target"""
     source_counts['flg'] = 'source'
@@ -580,44 +656,41 @@ def find_count_discrepancies(
     # Find mismatches in counts per date
     all_counts = pd.concat([source_counts, target_counts])
     discrepancies = all_counts.drop_duplicates(
-        subset=['dt', 'cnt'],
-
-    ).sort_values(
-        by=['dt', 'flg'],
-        ascending=[False, True]
-    )
+        subset=['dt', 'cnt'], keep=False
+    ).sort_values(by=['dt', 'flg'], ascending=[False, True])
 
     return discrepancies
 
+
 def create_result_message(
     source_total: int,
     target_total: int,
     discrepancies: pd.DataFrame,
-    comparison_type: str
+    comparison_type: str,
 ) -> str:
     """Create standardized result message"""
     if discrepancies.empty:
-        return f
+        return f'{comparison_type} match: Source={source_total}, Target={target_total}'
 
     mismatch_count = len(discrepancies)
     diff = source_total - target_total
-    diff_msg = f
+    diff_msg = f' (Δ={diff})' if diff != 0 else ''
 
     return (
-        f
-        f
+        f'{comparison_type} mismatch: Source={source_total}, Target={target_total}{diff_msg}, '
+        f'{mismatch_count} discrepancies found'
    )
 
+
 def filter_columns(
-    df: pd.DataFrame,
-    columns: List[str],
-    exclude: Optional[List[str]] = None
+    df: pd.DataFrame, columns: List[str], exclude: Optional[List[str]] = None
 ) -> pd.DataFrame:
     """Filter DataFrame columns with optional exclusions"""
     if exclude:
        columns = [col for col in columns if col not in exclude]
     return df[columns]
 
+
 def cross_fill_missing_dates(df1, df2, date_column='dt', value_column='cnt'):
     """
     Fill missing dates between tow dataframes
@@ -636,6 +709,7 @@ def cross_fill_missing_dates(df1, df2, date_column='dt', value_column='cnt'):
 
     return df1_full, df2_full
 
+
 def format_keys(keys, max_examples):
     if keys:
         keys = {next(iter(x)) if len(x) == 1 else x for x in list(keys)[:max_examples]}
@@ -644,12 +718,14 @@ def format_keys(keys, max_examples):
     else:
         return None
 
+
 def get_dataframe_size_gb(df: pd.DataFrame) -> float:
     """Calculate DataFrame size in GB"""
     if df.empty:
         return 0.0
     return df.memory_usage(deep=True).sum() / 1024 / 1024 / 1024
 
+
 def validate_dataframe_size(df: pd.DataFrame, max_size_gb: float) -> None:
     """Validate DataFrame size and raise exception if exceeds limit"""
     if df is None:
@@ -659,6 +735,6 @@ def validate_dataframe_size(df: pd.DataFrame, max_size_gb: float) -> None:
 
     if size_gb > max_size_gb:
         raise ValueError(
-            f
-            f
-            )
+            f'DataFrame size {size_gb:.2f} GB exceeds limit of {max_size_gb} GB. '
+            f'Shape: {df.shape}'
+        )