xoverrr 1.1.4-py3-none-any.whl → 1.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- xoverrr/__init__.py +8 -12
- xoverrr/adapters/__init__.py +7 -2
- xoverrr/adapters/base.py +61 -32
- xoverrr/adapters/clickhouse.py +64 -35
- xoverrr/adapters/oracle.py +67 -38
- xoverrr/adapters/postgres.py +67 -35
- xoverrr/constants.py +4 -4
- xoverrr/core.py +299 -197
- xoverrr/exceptions.py +8 -1
- xoverrr/logger.py +4 -2
- xoverrr/models.py +11 -5
- xoverrr/utils.py +331 -259
- {xoverrr-1.1.4.dist-info → xoverrr-1.1.6.dist-info}/METADATA +67 -71
- xoverrr-1.1.6.dist-info/RECORD +17 -0
- {xoverrr-1.1.4.dist-info → xoverrr-1.1.6.dist-info}/WHEEL +1 -1
- xoverrr-1.1.4.dist-info/RECORD +0 -17
- {xoverrr-1.1.4.dist-info → xoverrr-1.1.6.dist-info}/licenses/LICENSE +0 -0
- {xoverrr-1.1.4.dist-info → xoverrr-1.1.6.dist-info}/top_level.txt +0 -0
xoverrr/utils.py
CHANGED
```diff
@@ -1,21 +1,31 @@
-import
-import numpy as np
-from typing import Dict, Any, List, Optional, Tuple, defaultdict
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple, defaultdict
+
+import numpy as np
+import pandas as pd
 
-
-
-
-
-
-
+from .constants import DATETIME_FORMAT, DEFAULT_MAX_EXAMPLES, NULL_REPLACEMENT
+from .logger import app_logger
+
+
+def normalize_column_names(columns: List[str]) -> List[str]:
+    """
+    Normalize column names to lowercase for consistent comparison.
+
+    Parameters:
+        columns: List of column names to normalize
+
+    Returns:
+        List of lowercased column names
+    """
+    return [col.lower() for col in columns] if columns else []
 
-from dataclasses import dataclass, field
 
 @dataclass
 class ComparisonStats:
     """Class for storing comparison statistics"""
+
     total_source_rows: int
     total_target_rows: int
 
@@ -32,13 +42,14 @@ class ComparisonStats:
 
     source_only_percentage_rows: float
     target_only_percentage_rows: float
-    total_diff_percentage_rows
+    total_diff_percentage_rows: float
     #
-    max_diff_percentage_cols
+    max_diff_percentage_cols: float
     median_diff_percentage_cols: float
     #
     final_diff_score: float
-    final_score
+    final_score: float
+
 
 @dataclass
 class ComparisonDiffDetails:
@@ -51,16 +62,14 @@ class ComparisonDiffDetails:
     source_only_keys_examples: tuple
     target_only_keys_examples: tuple
 
-    discrepant_data_examples:
+    discrepant_data_examples: pd.DataFrame
     common_attribute_columns: List[str]
-    skipped_source_columns: List[str]= field(default_factory=list)
-    skipped_target_columns: List[str]= field(default_factory=list)
+    skipped_source_columns: List[str] = field(default_factory=list)
+    skipped_target_columns: List[str] = field(default_factory=list)
 
 
 def compare_dataframes_meta(
-    df1: pd.DataFrame,
-    df2: pd.DataFrame,
-    primary_keys: List[str] = None
+    df1: pd.DataFrame, df2: pd.DataFrame, primary_keys: List[str] = None
 ) -> List[str]:
     """
     Compare two pandas DataFrames and find common and different columns.
@@ -92,9 +101,12 @@ def compare_dataframes_meta(
 
     return common_columns
 
-def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_keys_cnt, examples_count=3):
 
-
+def analyze_column_discrepancies(
+    df, primary_key_columns, value_columns, common_keys_cnt, examples_count=3
+):
+
+    metrics = {'max_pct': 0.0, 'median_pct': 0.0}
     diff_counters = defaultdict(int)
     diff_examples = {col: [] for col in value_columns}
 
@@ -118,10 +130,11 @@ def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_
             src_val = getattr(src_row, col)
             trg_val = getattr(trg_row, col)
             if src_val != trg_val:
-
                 diff_counters[col] += 1
                 if len(diff_examples[col]) < examples_count:
-                    diff_examples[col].append(
+                    diff_examples[col].append(
+                        {'pk': pk_value, 'src_val': src_val, 'trg_val': trg_val}
+                    )
 
     # filter out cols without examples
     diff_examples = {k: v for k, v in diff_examples.items() if v}
@@ -131,7 +144,6 @@ def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_
     metrics['max_pct'] = max_pct
     metrics['median_pct'] = median_pct
 
-
     # transform to dataframes
     # 1
     diff_records = []
@@ -149,8 +161,8 @@ def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_
     # 2
     df_diff_counters = pd.DataFrame(
         list(diff_counters.items()),  # convert to a list of tuples
-        columns=['column_name', 'mismatch_count']  # rename the columns
-
+        columns=['column_name', 'mismatch_count'],  # rename the columns
+    )
 
     return metrics, df_diff_examples, df_diff_counters
 
@@ -159,7 +171,7 @@ def compare_dataframes(
     source_df: pd.DataFrame,
     target_df: pd.DataFrame,
     key_columns: List[str],
-    max_examples: int = DEFAULT_MAX_EXAMPLES
+    max_examples: int = DEFAULT_MAX_EXAMPLES,
 ) -> tuple[ComparisonStats, ComparisonDiffDetails]:
     """
     Efficient comparison of two dataframes by primary key when discrepancies ratio quite small,
@@ -196,8 +208,12 @@ def compare_dataframes(
     source_dup = source_df[source_df.duplicated(subset=key_columns, keep=False)]
     target_dup = target_df[target_df.duplicated(subset=key_columns, keep=False)]
 
-    source_dup_keys =
-
+    source_dup_keys = (
+        _create_keys_set(source_dup, key_columns) if not source_dup.empty else set()
+    )
+    target_dup_keys = (
+        _create_keys_set(target_dup, key_columns) if not target_dup.empty else set()
+    )
 
     source_dup_keys_examples = format_keys(source_dup_keys, max_examples)
     target_dup_keys_examples = format_keys(target_dup_keys, max_examples)
@@ -218,13 +234,16 @@ def compare_dataframes(
     xor_combined_df = (
         pd.concat([source_clean, target_clean], ignore_index=True)
         .drop_duplicates(subset=key_columns + non_key_columns, keep=False)
-        .assign(
+        .assign(
+            xcount_pairs=lambda df: df.groupby(key_columns)[key_columns[0]].transform(
+                'size'
+            )
+        )
     )
 
     # symmetrical difference between two datasets, sorted
     xor_combined_sorted = xor_combined_df.sort_values(
-        by=key_columns + ['xflg'],
-        ascending=[False] * len(key_columns) + [True]
+        by=key_columns + ['xflg'], ascending=[False] * len(key_columns) + [True]
     )
 
     mask = xor_combined_sorted['xcount_pairs'] > 1
@@ -238,53 +257,66 @@ def compare_dataframes(
     xor_source_only_keys = _create_keys_set(xor_df_source_only, key_columns)
     xor_target_only_keys = _create_keys_set(xor_df_target_only, key_columns)
 
-    xor_common_keys_cnt = int(len(xor_df_multi)/2) if not xor_df_multi.empty else 0
+    xor_common_keys_cnt = int(len(xor_df_multi) / 2) if not xor_df_multi.empty else 0
    xor_source_only_keys_cnt = len(xor_source_only_keys)
     xor_target_only_keys_cnt = len(xor_target_only_keys)
 
     # take n pairs that is why examples x2
-    xor_df_multi_example =
+    xor_df_multi_example = (
+        xor_df_multi.head(max_examples * 2).drop(columns=['xcount_pairs'])
+        if not xor_df_multi.empty
+        else pd.DataFrame()
+    )
 
     xor_source_only_keys_examples = format_keys(xor_source_only_keys, max_examples)
     xor_target_only_keys_examples = format_keys(xor_target_only_keys, max_examples)
 
     # get number of records that present in two datasets based on primary key
-    common_keys_cnt = int(
+    common_keys_cnt = int(
+        (
+            len(source_clean)
+            - xor_source_only_keys_cnt
+            + len(target_clean)
+            - xor_target_only_keys_cnt
+        )
+        / 2
+    )
 
     if not common_keys_cnt:
-        #Special case when there is no matched primary keys at all
+        # Special case when there is no matched primary keys at all
         comparison_stats = ComparisonStats(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            total_source_rows=len(source_df),
+            total_target_rows=len(target_df),
+            dup_source_rows=source_dup_cnt,
+            dup_target_rows=target_dup_cnt,
+            only_source_rows=xor_source_only_keys_cnt,
+            only_target_rows=xor_target_only_keys_cnt,
+            common_pk_rows=0,
+            total_matched_rows=0,
+            #
+            dup_source_percentage_rows=100,
+            dup_target_percentage_rows=100,
+            source_only_percentage_rows=100,
+            target_only_percentage_rows=100,
+            total_diff_percentage_rows=100,
+            #
+            max_diff_percentage_cols=100,
+            median_diff_percentage_cols=100,
+            #
+            final_diff_score=100,
+            final_score=0,
         )
 
         comparison_diff_detais = ComparisonDiffDetails(
-
-
-
-
-
-
-
-
+            mismatches_per_column=pd.DataFrame(),
+            discrepancies_per_col_examples=pd.DataFrame(),
+            dup_source_keys_examples=source_dup_keys_examples,
+            dup_target_keys_examples=target_dup_keys_examples,
+            common_attribute_columns=non_key_columns,
+            source_only_keys_examples=xor_source_only_keys_examples,
+            target_only_keys_examples=xor_target_only_keys_examples,
+            discrepant_data_examples=pd.DataFrame(),
+        )
         app_logger.info('end')
 
         return comparison_stats, comparison_diff_detais
@@ -292,73 +324,79 @@ def compare_dataframes(
     # get number of that totally equal in two datasets
     total_matched_records_cnt = common_keys_cnt - xor_common_keys_cnt
 
-    source_only_percentage = (xor_source_only_keys_cnt/common_keys_cnt)*100
-    target_only_percentage = (xor_target_only_keys_cnt/common_keys_cnt)*100
+    source_only_percentage = (xor_source_only_keys_cnt / common_keys_cnt) * 100
+    target_only_percentage = (xor_target_only_keys_cnt / common_keys_cnt) * 100
 
-    source_dup_percentage = (source_dup_cnt/len(source_df))*100
-    target_dup_percentage = (target_dup_cnt/len(target_df))*100
-
-    diff_col_metrics, \
-    diff_col_examples,\
-    diff_col_counters = analyze_column_discrepancies(xor_df_multi, key_columns, non_key_columns, common_keys_cnt, max_examples)
+    source_dup_percentage = (source_dup_cnt / len(source_df)) * 100
+    target_dup_percentage = (target_dup_cnt / len(target_df)) * 100
 
+    diff_col_metrics, diff_col_examples, diff_col_counters = (
+        analyze_column_discrepancies(
+            xor_df_multi, key_columns, non_key_columns, common_keys_cnt, max_examples
+        )
+    )
 
-    source_and_target_total_diff_percentage = (
+    source_and_target_total_diff_percentage = (
+        1 - total_matched_records_cnt / common_keys_cnt
+    ) * 100
 
-    final_diff_score =
-
-
+    final_diff_score = (
+        source_dup_percentage * 0.1
+        + target_dup_percentage * 0.1
+        + source_only_percentage * 0.15
+        + target_only_percentage * 0.15
+        + source_and_target_total_diff_percentage * 0.5
+    )
 
     comparison_stats = ComparisonStats(
-        total_source_rows
-        total_target_rows
-        dup_source_rows
-        dup_target_rows
-        only_source_rows
-        only_target_rows
-        common_pk_rows
-        total_matched_rows=
+        total_source_rows=len(source_df),
+        total_target_rows=len(target_df),
+        dup_source_rows=source_dup_cnt,
+        dup_target_rows=target_dup_cnt,
+        only_source_rows=xor_source_only_keys_cnt,
+        only_target_rows=xor_target_only_keys_cnt,
+        common_pk_rows=common_keys_cnt,
+        total_matched_rows=total_matched_records_cnt,
         #
-        dup_source_percentage_rows
-        dup_target_percentage_rows
-        source_only_percentage_rows
-        target_only_percentage_rows
-        total_diff_percentage_rows
+        dup_source_percentage_rows=source_dup_percentage,
+        dup_target_percentage_rows=target_dup_percentage,
+        source_only_percentage_rows=source_only_percentage,
+        target_only_percentage_rows=target_only_percentage,
+        total_diff_percentage_rows=source_and_target_total_diff_percentage,
        #
-        max_diff_percentage_cols
-        median_diff_percentage_cols
+        max_diff_percentage_cols=diff_col_metrics['max_pct'],
+        median_diff_percentage_cols=diff_col_metrics['median_pct'],
        #
-        final_diff_score
-        final_score
-
+        final_diff_score=final_diff_score,
+        final_score=100 - final_diff_score,
+    )
 
     comparison_diff_detais = ComparisonDiffDetails(
-        mismatches_per_column
-        discrepancies_per_col_examples
-        dup_source_keys_examples
-        dup_target_keys_examples
-        source_only_keys_examples
-        target_only_keys_examples
-        discrepant_data_examples
-        common_attribute_columns=non_key_columns
+        mismatches_per_column=diff_col_counters,
+        discrepancies_per_col_examples=diff_col_examples,
+        dup_source_keys_examples=source_dup_keys_examples,
+        dup_target_keys_examples=target_dup_keys_examples,
+        source_only_keys_examples=xor_source_only_keys_examples,
+        target_only_keys_examples=xor_target_only_keys_examples,
+        discrepant_data_examples=xor_df_multi_example,
+        common_attribute_columns=non_key_columns,
+    )
 
     app_logger.info('end')
     return comparison_stats, comparison_diff_detais
 
 
 def _validate_input_data(
-    source_df: pd.DataFrame,
-    target_df: pd.DataFrame,
-    key_columns: List[str]
+    source_df: pd.DataFrame, target_df: pd.DataFrame, key_columns: List[str]
 ) -> None:
     """Input data validation"""
     if not all(col in source_df.columns for col in key_columns):
         missing = [col for col in key_columns if col not in source_df.columns]
-        raise ValueError(f
+        raise ValueError(f'Key columns missing in source: {missing}')
 
     if not all(col in target_df.columns for col in key_columns):
         missing = [col for col in key_columns if col not in target_df.columns]
-        raise ValueError(f
+        raise ValueError(f'Key columns missing in target: {missing}')
 
 
 def _create_keys_set(df: pd.DataFrame, key_columns: List[str]) -> set:
@@ -366,154 +404,173 @@ def _create_keys_set(df: pd.DataFrame, key_columns: List[str]) -> set:
     return set(df[key_columns].itertuples(index=False, name=None))
 
 
-def generate_comparison_sample_report(
-
-
-
-
-
-
-
-
+def generate_comparison_sample_report(
+    source_table: str,
+    target_table: str,
+    stats: ComparisonStats,
+    details: ComparisonDiffDetails,
+    timezone: str,
+    source_query: str = None,
+    source_params: Dict = None,
+    target_query: str = None,
+    target_params: Dict = None,
+) -> None:
     """Generate comparison report (logger output looks uuugly)"""
     rl = []
-    rl.append(
+    rl.append('=' * 80)
     current_datetime = datetime.now()
     rl.append(current_datetime.strftime(DATETIME_FORMAT))
-    rl.append(f
-    if source_table and target_table:
-        rl.append(f
-        rl.append(f
-        rl.append(f
-    rl.append(
+    rl.append(f'DATA SAMPLE COMPARISON REPORT: ')
+    if source_table and target_table:  # empty for custom query
+        rl.append(f'{source_table}')
+        rl.append(f'VS')
+        rl.append(f'{target_table}')
+    rl.append('=' * 80)
 
     if source_query and target_query:
-        rl.append(f
-        rl.append(f
+        rl.append(f'timezone: {timezone}')
+        rl.append(f'  {source_query}')
         if source_params:
-            rl.append(f
-        rl.append(
-        rl.append(f
+            rl.append(f'  params: {source_params}')
+        rl.append('-' * 40)
+        rl.append(f'  {target_query}')
         if target_params:
-            rl.append(f
-
-        rl.append(
-
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-
-
-    rl.append(f
-
-
-    rl.append(f
-
-
-
-
-    rl.append(f
+            rl.append(f'  params: {target_params}')
+
+        rl.append('-' * 40)
+
+    rl.append(f'\nSUMMARY:')
+    rl.append(f'  Source rows: {stats.total_source_rows}')
+    rl.append(f'  Target rows: {stats.total_target_rows}')
+    rl.append(f'  Duplicated source rows: {stats.dup_source_rows}')
+    rl.append(f'  Duplicated target rows: {stats.dup_target_rows}')
+    rl.append(f'  Only source rows: {stats.only_source_rows}')
+    rl.append(f'  Only target rows: {stats.only_target_rows}')
+    rl.append(f'  Common rows (by primary key): {stats.common_pk_rows}')
+    rl.append(f'  Totally matched rows: {stats.total_matched_rows}')
+    rl.append('-' * 40)
+    rl.append(f'  Source only rows %: {stats.source_only_percentage_rows:.5f}')
+    rl.append(f'  Target only rows %: {stats.target_only_percentage_rows:.5f}')
+    rl.append(f'  Duplicated source rows %: {stats.dup_source_percentage_rows:.5f}')
+    rl.append(f'  Duplicated target rows %: {stats.dup_target_percentage_rows:.5f}')
+    rl.append(f'  Mismatched rows %: {stats.total_diff_percentage_rows:.5f}')
+    rl.append(f'  Final discrepancies score: {stats.final_diff_score:.5f}')
+    rl.append(f'  Final data quality score: {stats.final_score:.5f}')
+
+    rl.append(f'  Source-only key examples: {details.source_only_keys_examples}')
+    rl.append(f'  Target-only key examples: {details.target_only_keys_examples}')
+
+    rl.append(f'  Duplicated source key examples: {details.dup_source_keys_examples}')
+    rl.append(f'  Duplicated target key examples: {details.dup_target_keys_examples}')
+
+    rl.append(
+        f'  Common attribute columns: {", ".join(details.common_attribute_columns)}'
+    )
+    rl.append(f'  Skipped source columns: {", ".join(details.skipped_source_columns)}')
+    rl.append(f'  Skipped target columns: {", ".join(details.skipped_target_columns)}')
 
     if stats.max_diff_percentage_cols > 0 and not details.mismatches_per_column.empty:
-        rl.append(f
+        rl.append(f'\nCOLUMN DIFFERENCES:')
 
-        rl.append(
-
+        rl.append(
+            f'  Discrepancies per column (max %): {stats.max_diff_percentage_cols:.5f}'
+        )
+        rl.append(f'  Count of mismatches per column:\n')
         rl.append(details.mismatches_per_column.to_string(index=False))
 
-        rl.append(f
-        rl.append
-
+        rl.append(f'  Some examples:\n')
+        rl.append(
+            details.discrepancies_per_col_examples.to_string(
+                index=False, max_colwidth=64, justify='left'
+            )
+        )
 
     # Display sample data if available
-    if
-
-
-
-        rl.append(
-        rl.append(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if (
+        details.discrepant_data_examples is not None
+        and not details.discrepant_data_examples.empty
+    ):
+        rl.append(f'\nDISCREPANT DATA (first pairs):')
+        rl.append('Sorted by primary key and dataset:')
+        rl.append(f'\n')
+        rl.append(
+            details.discrepant_data_examples.to_string(
+                index=False, max_colwidth=64, justify='left'
+            )
+        )
+        rl.append(f'\n')
+
+    rl.append('=' * 80)
+
+    return '\n'.join(rl)
+
+
+def generate_comparison_count_report(
+    source_table: str,
+    target_table: str,
+    stats: ComparisonStats,
+    details: ComparisonDiffDetails,
+    total_source_count: int,
+    total_target_count: int,
+    discrepancies_counters_percentage: int,
+    result_diff_in_counters: int,
+    result_equal_in_counters: int,
+    timezone: str,
+    source_query: str = None,
+    source_params: Dict = None,
+    target_query: str = None,
+    target_params: Dict = None,
+) -> None:
     """Generates comparison report (logger output looks uuugly)"""
     rl = []
-    rl.append(
+    rl.append('=' * 80)
     current_datetime = datetime.now()
     rl.append(current_datetime.strftime(DATETIME_FORMAT))
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(
+    rl.append(f'COUNT COMPARISON REPORT:')
+    rl.append(f'{source_table}')
+    rl.append(f'VS')
+    rl.append(f'{target_table}')
+    rl.append('=' * 80)
 
     if source_query and target_query:
-        rl.append(f
-        rl.append(f
+        rl.append(f'timezone: {timezone}')
+        rl.append(f'  {source_query}')
         if source_params:
-            rl.append(f
-            rl.append(
-        rl.append(f
+            rl.append(f'  params: {source_params}')
+        rl.append('-' * 40)
+        rl.append(f'  {target_query}')
         if target_params:
-            rl.append(f
-            rl.append(
-
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(f
-    rl.append(
-
-
-
-    rl.append(f
-    rl.append
-
-    )
+            rl.append(f'  params: {target_params}')
+        rl.append('-' * 40)
+
+    rl.append(f'\nSUMMARY:')
+    rl.append(f'  Source total count: {total_source_count}')
+    rl.append(f'  Target total count: {total_target_count}')
+    rl.append(f'  Common total count: {result_equal_in_counters}')
+    rl.append(f'  Diff total count: {result_diff_in_counters}')
+    rl.append(f'  Discrepancies percentage: {discrepancies_counters_percentage:.5f}%')
+    rl.append(f'  Final discrepancies score: {discrepancies_counters_percentage:.5f}')
+    rl.append(
+        f'  Final data quality score: {(100 - discrepancies_counters_percentage):.5f}'
+    )
+    if not details.mismatches_per_column.empty:
+        rl.append(f'\nDETAIL DIFFERENCES:')
+        rl.append(details.mismatches_per_column.to_string(index=False))
 
     # Display sample data if available
-    if
-
-
-
+    if (
+        details.discrepant_data_examples is not None
+        and not details.discrepant_data_examples.empty
+    ):
+        rl.append(f'\nDISCREPANT DATA (first pairs):')
+        rl.append('Sorted by primary key and dataset:')
+        rl.append(f'\n')
         rl.append(details.discrepant_data_examples.to_string(index=False))
-        rl.append(f
-        rl.append(
+        rl.append(f'\n')
+    rl.append('=' * 80)
+
+    return '\n'.join(rl)
 
-    return "\n".join(rl)
 
 def safe_remove_zeros(x):
     if pd.isna(x):
@@ -522,38 +579,48 @@ def safe_remove_zeros(x):
         return int(x)
     return x
 
+
 def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Prepare DataFrame for comparison by handling nulls and empty strings"""
     df = df.map(safe_remove_zeros)
 
-
     df = df.fillna(NULL_REPLACEMENT)
-    df = df.replace(r'(?i)^(None|nan|NaN|\s*)$', NULL_REPLACEMENT, regex=True)
+    df = df.replace(r'(?i)^(None|nan|NaN|NaT|\s*)$', NULL_REPLACEMENT, regex=True)
 
     df = df.astype(str)
 
     return df
 
+
 def exclude_by_keys(df, key_columns, exclude_set):
     if len(key_columns) == 1:
         exclude_values = [x[0] for x in exclude_set]
         return df[~df[key_columns[0]].isin(exclude_values)]
     else:
-        return df[
+        return df[
+            ~df.apply(
+                lambda row: tuple(row[col] for col in key_columns) in exclude_set,
+                axis=1,
+            )
+        ]
 
 
-def clean_recently_changed_data(
+def clean_recently_changed_data(
+    df1: pd.DataFrame, df2: pd.DataFrame, primary_keys: List[str]
+):
     """
     Mutually removes rows with recently changed records
 
     Parameters:
         df1, df2: pandas.DataFrame
-        primary_keys: list
+        primary_keys: list
 
     Returns:
         tuple: (df1_processed, df2_processed)
     """
-    app_logger.info(
+    app_logger.info(
+        f'before exclusion recently changed rows source: {len(df1)}, target {len(df2)}'
+    )
 
     filtered_df1 = df1.copy()
     filtered_df2 = df2.copy()
@@ -561,21 +628,26 @@ def clean_recently_changed_data(df1:pd.DataFrame, df2:pd.DataFrame, primary_keys
     filtered_df1 = filtered_df1.loc[filtered_df1['xrecently_changed'] == 'y']
     filtered_df2 = filtered_df2.loc[filtered_df2['xrecently_changed'] == 'y']
 
-    excluded_from_df1_keys = _create_keys_set(filtered_df1,primary_keys)
-    excluded_from_df2_keys = _create_keys_set(filtered_df2,primary_keys)
+    excluded_from_df1_keys = _create_keys_set(filtered_df1, primary_keys)
+    excluded_from_df2_keys = _create_keys_set(filtered_df2, primary_keys)
 
     excluded_keys = excluded_from_df1_keys | excluded_from_df2_keys
-    df1_processed = exclude_by_keys(df1, primary_keys, excluded_keys).drop(
-
+    df1_processed = exclude_by_keys(df1, primary_keys, excluded_keys).drop(
+        'xrecently_changed', axis=1
+    )
+    df2_processed = exclude_by_keys(df2, primary_keys, excluded_keys).drop(
+        'xrecently_changed', axis=1
+    )
 
-    app_logger.info(
+    app_logger.info(
+        f'after exclusion recently changed rows source: {len(df1_processed)}, target {len(df2_processed)}'
+    )
 
     return df1_processed, df2_processed
 
 
 def find_count_discrepancies(
-    source_counts: pd.DataFrame,
-    target_counts: pd.DataFrame
+    source_counts: pd.DataFrame, target_counts: pd.DataFrame
 ) -> pd.DataFrame:
     """Find discrepancies in daily row counts between source and target"""
     source_counts['flg'] = 'source'
@@ -584,44 +656,41 @@ def find_count_discrepancies(
     # Find mismatches in counts per date
     all_counts = pd.concat([source_counts, target_counts])
     discrepancies = all_counts.drop_duplicates(
-        subset=['dt', 'cnt'],
-
-    ).sort_values(
-        by=['dt', 'flg'],
-        ascending=[False, True]
-    )
+        subset=['dt', 'cnt'], keep=False
+    ).sort_values(by=['dt', 'flg'], ascending=[False, True])
 
     return discrepancies
 
+
 def create_result_message(
     source_total: int,
     target_total: int,
     discrepancies: pd.DataFrame,
-    comparison_type: str
+    comparison_type: str,
 ) -> str:
     """Create standardized result message"""
     if discrepancies.empty:
-        return f
+        return f'{comparison_type} match: Source={source_total}, Target={target_total}'
 
     mismatch_count = len(discrepancies)
     diff = source_total - target_total
-    diff_msg = f
+    diff_msg = f' (Δ={diff})' if diff != 0 else ''
 
     return (
-        f
-        f
+        f'{comparison_type} mismatch: Source={source_total}, Target={target_total}{diff_msg}, '
+        f'{mismatch_count} discrepancies found'
     )
 
+
 def filter_columns(
-    df: pd.DataFrame,
-    columns: List[str],
-    exclude: Optional[List[str]] = None
+    df: pd.DataFrame, columns: List[str], exclude: Optional[List[str]] = None
 ) -> pd.DataFrame:
     """Filter DataFrame columns with optional exclusions"""
     if exclude:
         columns = [col for col in columns if col not in exclude]
     return df[columns]
 
+
 def cross_fill_missing_dates(df1, df2, date_column='dt', value_column='cnt'):
     """
     Fill missing dates between tow dataframes
@@ -640,6 +709,7 @@ def cross_fill_missing_dates(df1, df2, date_column='dt', value_column='cnt'):
 
     return df1_full, df2_full
 
+
 def format_keys(keys, max_examples):
     if keys:
         keys = {next(iter(x)) if len(x) == 1 else x for x in list(keys)[:max_examples]}
@@ -648,12 +718,14 @@ def format_keys(keys, max_examples):
     else:
         return None
 
+
 def get_dataframe_size_gb(df: pd.DataFrame) -> float:
     """Calculate DataFrame size in GB"""
     if df.empty:
         return 0.0
     return df.memory_usage(deep=True).sum() / 1024 / 1024 / 1024
 
+
 def validate_dataframe_size(df: pd.DataFrame, max_size_gb: float) -> None:
     """Validate DataFrame size and raise exception if exceeds limit"""
     if df is None:
@@ -663,6 +735,6 @@ def validate_dataframe_size(df: pd.DataFrame, max_size_gb: float) -> None:
 
     if size_gb > max_size_gb:
         raise ValueError(
-            f
-            f
-            )
+            f'DataFrame size {size_gb:.2f} GB exceeds limit of {max_size_gb} GB. '
+            f'Shape: {df.shape}'
+        )
```