xoverrr 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xoverrr/utils.py CHANGED
@@ -1,17 +1,31 @@
-import pandas as pd
-import numpy as np
-from typing import Dict, Any, List, Optional, Tuple, defaultdict
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple, defaultdict
 
+import numpy as np
+import pandas as pd
 
-from .constants import NULL_REPLACEMENT, DEFAULT_MAX_EXAMPLES, DATETIME_FORMAT
+from .constants import DATETIME_FORMAT, DEFAULT_MAX_EXAMPLES, NULL_REPLACEMENT
 from .logger import app_logger
 
-from dataclasses import dataclass, field
+
+def normalize_column_names(columns: List[str]) -> List[str]:
+    """
+    Normalize column names to lowercase for consistent comparison.
+
+    Parameters:
+        columns: List of column names to normalize
+
+    Returns:
+        List of lowercased column names
+    """
+    return [col.lower() for col in columns] if columns else []
+
 
 @dataclass
 class ComparisonStats:
     """Class for storing comparison statistics"""
+
     total_source_rows: int
     total_target_rows: int
 
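
Note: one functional problem survives the reformat above. Both versions import defaultdict from typing, but defaultdict lives in collections (typing only provides the DefaultDict annotation alias), so this line raises ImportError as soon as the module is loaded. A minimal sketch of what the import block presumably intends:

    # defaultdict is a collections type, not a typing construct
    from collections import defaultdict
    from typing import Any, Dict, List, Optional, Tuple
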
@@ -28,13 +42,14 @@ class ComparisonStats:
 
     source_only_percentage_rows: float
     target_only_percentage_rows: float
-    total_diff_percentage_rows : float
+    total_diff_percentage_rows: float
     #
-    max_diff_percentage_cols : float
+    max_diff_percentage_cols: float
     median_diff_percentage_cols: float
     #
     final_diff_score: float
-    final_score : float
+    final_score: float
+
 
 @dataclass
 class ComparisonDiffDetails:
@@ -47,16 +62,14 @@ class ComparisonDiffDetails:
     source_only_keys_examples: tuple
     target_only_keys_examples: tuple
 
-    discrepant_data_examples:  pd.DataFrame
+    discrepant_data_examples: pd.DataFrame
     common_attribute_columns: List[str]
-    skipped_source_columns: List[str]= field(default_factory=list)
-    skipped_target_columns: List[str]= field(default_factory=list)
+    skipped_source_columns: List[str] = field(default_factory=list)
+    skipped_target_columns: List[str] = field(default_factory=list)
 
 
 def compare_dataframes_meta(
-    df1: pd.DataFrame,
-    df2: pd.DataFrame,
-    primary_keys: List[str] = None
+    df1: pd.DataFrame, df2: pd.DataFrame, primary_keys: List[str] = None
 ) -> List[str]:
     """
     Compare two pandas DataFrames and find common and different columns.
@@ -88,9 +101,12 @@ def compare_dataframes_meta(
 
     return common_columns
 
-def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_keys_cnt, examples_count=3):
 
-    metrics = {'max_pct' : 0.0, 'median_pct' : 0.0}
+def analyze_column_discrepancies(
+    df, primary_key_columns, value_columns, common_keys_cnt, examples_count=3
+):
+
+    metrics = {'max_pct': 0.0, 'median_pct': 0.0}
     diff_counters = defaultdict(int)
     diff_examples = {col: [] for col in value_columns}
 
@@ -114,10 +130,11 @@ def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_
             src_val = getattr(src_row, col)
             trg_val = getattr(trg_row, col)
             if src_val != trg_val:
-
                 diff_counters[col] += 1
                 if len(diff_examples[col]) < examples_count:
-                    diff_examples[col].append({'pk': pk_value, 'src_val': src_val, 'trg_val': trg_val })
+                    diff_examples[col].append(
+                        {'pk': pk_value, 'src_val': src_val, 'trg_val': trg_val}
+                    )
 
     # filter out cols without examples
     diff_examples = {k: v for k, v in diff_examples.items() if v}
@@ -127,7 +144,6 @@ def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_
     metrics['max_pct'] = max_pct
     metrics['median_pct'] = median_pct
 
-
    # transform to dataframes
     # 1
     diff_records = []
@@ -145,8 +161,8 @@ def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_
     # 2
     df_diff_counters = pd.DataFrame(
         list(diff_counters.items()),  # convert to a list of tuples
-        columns=['column_name', 'mismatch_count'] # rename the columns
-        )
+        columns=['column_name', 'mismatch_count'],  # rename the columns
+    )
 
     return metrics, df_diff_examples, df_diff_counters
 
@@ -155,7 +171,7 @@ def compare_dataframes(
     source_df: pd.DataFrame,
     target_df: pd.DataFrame,
     key_columns: List[str],
-    max_examples: int = DEFAULT_MAX_EXAMPLES
+    max_examples: int = DEFAULT_MAX_EXAMPLES,
 ) -> tuple[ComparisonStats, ComparisonDiffDetails]:
     """
     Efficient comparison of two dataframes by primary key when discrepancies ratio quite small,
@@ -192,8 +208,12 @@ def compare_dataframes(
     source_dup = source_df[source_df.duplicated(subset=key_columns, keep=False)]
     target_dup = target_df[target_df.duplicated(subset=key_columns, keep=False)]
 
-    source_dup_keys = _create_keys_set(source_dup, key_columns) if not source_dup.empty else set()
-    target_dup_keys = _create_keys_set(target_dup, key_columns) if not target_dup.empty else set()
+    source_dup_keys = (
+        _create_keys_set(source_dup, key_columns) if not source_dup.empty else set()
+    )
+    target_dup_keys = (
+        _create_keys_set(target_dup, key_columns) if not target_dup.empty else set()
+    )
 
     source_dup_keys_examples = format_keys(source_dup_keys, max_examples)
     target_dup_keys_examples = format_keys(target_dup_keys, max_examples)
@@ -214,13 +234,16 @@ def compare_dataframes(
     xor_combined_df = (
         pd.concat([source_clean, target_clean], ignore_index=True)
         .drop_duplicates(subset=key_columns + non_key_columns, keep=False)
-        .assign(xcount_pairs=lambda df: df.groupby(key_columns)[key_columns[0]].transform('size'))
+        .assign(
+            xcount_pairs=lambda df: df.groupby(key_columns)[key_columns[0]].transform(
+                'size'
+            )
+        )
     )
 
     # symmetrical difference between two datasets, sorted
     xor_combined_sorted = xor_combined_df.sort_values(
-        by=key_columns + ['xflg'],
-        ascending=[False] * len(key_columns) + [True]
+        by=key_columns + ['xflg'], ascending=[False] * len(key_columns) + [True]
     )
 
     mask = xor_combined_sorted['xcount_pairs'] > 1
@@ -234,53 +257,66 @@ def compare_dataframes(
     xor_source_only_keys = _create_keys_set(xor_df_source_only, key_columns)
     xor_target_only_keys = _create_keys_set(xor_df_target_only, key_columns)
 
-    xor_common_keys_cnt = int(len(xor_df_multi)/2) if not xor_df_multi.empty else 0
+    xor_common_keys_cnt = int(len(xor_df_multi) / 2) if not xor_df_multi.empty else 0
     xor_source_only_keys_cnt = len(xor_source_only_keys)
     xor_target_only_keys_cnt = len(xor_target_only_keys)
 
     # take n pairs that is why examples x2
-    xor_df_multi_example = xor_df_multi.head(max_examples*2).drop(columns=['xcount_pairs']) if not xor_df_multi.empty else pd.DataFrame()
+    xor_df_multi_example = (
+        xor_df_multi.head(max_examples * 2).drop(columns=['xcount_pairs'])
+        if not xor_df_multi.empty
+        else pd.DataFrame()
+    )
 
     xor_source_only_keys_examples = format_keys(xor_source_only_keys, max_examples)
     xor_target_only_keys_examples = format_keys(xor_target_only_keys, max_examples)
 
     # get number of records that present in two datasets based on primary key
-    common_keys_cnt = int((len(source_clean) - xor_source_only_keys_cnt + len(target_clean) - xor_target_only_keys_cnt)/2)
+    common_keys_cnt = int(
+        (
+            len(source_clean)
+            - xor_source_only_keys_cnt
+            + len(target_clean)
+            - xor_target_only_keys_cnt
+        )
+        / 2
+    )
 
     if not common_keys_cnt:
-        #Special case when there is no matched primary keys at all
+        # Special case when there is no matched primary keys at all
        comparison_stats = ComparisonStats(
-            total_source_rows = len(source_df),
-            total_target_rows = len(target_df),
-            dup_source_rows = source_dup_cnt,
-            dup_target_rows = target_dup_cnt,
-            only_source_rows = xor_source_only_keys_cnt,
-            only_target_rows = xor_target_only_keys_cnt,
-            common_pk_rows = 0,
-            total_matched_rows= 0,
-            #
-            dup_source_percentage_rows = 100,
-            dup_target_percentage_rows = 100,
-            source_only_percentage_rows = 100,
-            target_only_percentage_rows = 100,
-            total_diff_percentage_rows = 100,
-            #
-            max_diff_percentage_cols = 100,
-            median_diff_percentage_cols = 100,
-            #
-            final_diff_score = 100,
-            final_score = 0
+            total_source_rows=len(source_df),
+            total_target_rows=len(target_df),
+            dup_source_rows=source_dup_cnt,
+            dup_target_rows=target_dup_cnt,
+            only_source_rows=xor_source_only_keys_cnt,
+            only_target_rows=xor_target_only_keys_cnt,
+            common_pk_rows=0,
+            total_matched_rows=0,
+            #
+            dup_source_percentage_rows=100,
+            dup_target_percentage_rows=100,
+            source_only_percentage_rows=100,
+            target_only_percentage_rows=100,
+            total_diff_percentage_rows=100,
+            #
+            max_diff_percentage_cols=100,
+            median_diff_percentage_cols=100,
+            #
+            final_diff_score=100,
+            final_score=0,
         )
 
         comparison_diff_detais = ComparisonDiffDetails(
-            mismatches_per_column = pd.DataFrame(),
-            discrepancies_per_col_examples = pd.DataFrame(),
-            dup_source_keys_examples = source_dup_keys_examples,
-            dup_target_keys_examples = target_dup_keys_examples,
-            common_attribute_columns=non_key_columns,
-            source_only_keys_examples = xor_source_only_keys_examples,
-            target_only_keys_examples = xor_target_only_keys_examples,
-            discrepant_data_examples = pd.DataFrame())
+            mismatches_per_column=pd.DataFrame(),
+            discrepancies_per_col_examples=pd.DataFrame(),
+            dup_source_keys_examples=source_dup_keys_examples,
+            dup_target_keys_examples=target_dup_keys_examples,
+            common_attribute_columns=non_key_columns,
+            source_only_keys_examples=xor_source_only_keys_examples,
+            target_only_keys_examples=xor_target_only_keys_examples,
+            discrepant_data_examples=pd.DataFrame(),
+        )
         app_logger.info('end')
 
         return comparison_stats, comparison_diff_detais
@@ -288,73 +324,79 @@ def compare_dataframes(
     # get number of that totally equal in two datasets
     total_matched_records_cnt = common_keys_cnt - xor_common_keys_cnt
 
-    source_only_percentage = (xor_source_only_keys_cnt/common_keys_cnt)*100
-    target_only_percentage = (xor_target_only_keys_cnt/common_keys_cnt)*100
+    source_only_percentage = (xor_source_only_keys_cnt / common_keys_cnt) * 100
+    target_only_percentage = (xor_target_only_keys_cnt / common_keys_cnt) * 100
 
-    source_dup_percentage = (source_dup_cnt/len(source_df))*100
-    target_dup_percentage = (target_dup_cnt/len(target_df))*100
-
-    diff_col_metrics, \
-    diff_col_examples,\
-    diff_col_counters = analyze_column_discrepancies(xor_df_multi, key_columns, non_key_columns, common_keys_cnt, max_examples)
+    source_dup_percentage = (source_dup_cnt / len(source_df)) * 100
+    target_dup_percentage = (target_dup_cnt / len(target_df)) * 100
 
+    diff_col_metrics, diff_col_examples, diff_col_counters = (
+        analyze_column_discrepancies(
+            xor_df_multi, key_columns, non_key_columns, common_keys_cnt, max_examples
+        )
+    )
 
-    source_and_target_total_diff_percentage = (1-total_matched_records_cnt/common_keys_cnt)*100
+    source_and_target_total_diff_percentage = (
+        1 - total_matched_records_cnt / common_keys_cnt
+    ) * 100
 
-    final_diff_score = source_dup_percentage*0.1 + target_dup_percentage*0.1 + \
-                       source_only_percentage*0.15 + target_only_percentage*0.15 + \
-                       source_and_target_total_diff_percentage*0.5
+    final_diff_score = (
+        source_dup_percentage * 0.1
+        + target_dup_percentage * 0.1
+        + source_only_percentage * 0.15
+        + target_only_percentage * 0.15
+        + source_and_target_total_diff_percentage * 0.5
+    )
 
     comparison_stats = ComparisonStats(
-        total_source_rows = len(source_df),
-        total_target_rows = len(target_df),
-        dup_source_rows = source_dup_cnt,
-        dup_target_rows = target_dup_cnt,
-        only_source_rows = xor_source_only_keys_cnt,
-        only_target_rows = xor_target_only_keys_cnt,
-        common_pk_rows = common_keys_cnt,
-        total_matched_rows= total_matched_records_cnt,
+        total_source_rows=len(source_df),
+        total_target_rows=len(target_df),
+        dup_source_rows=source_dup_cnt,
+        dup_target_rows=target_dup_cnt,
+        only_source_rows=xor_source_only_keys_cnt,
+        only_target_rows=xor_target_only_keys_cnt,
+        common_pk_rows=common_keys_cnt,
+        total_matched_rows=total_matched_records_cnt,
         #
-        dup_source_percentage_rows = source_dup_percentage,
-        dup_target_percentage_rows = target_dup_percentage,
-        source_only_percentage_rows = source_only_percentage,
-        target_only_percentage_rows = target_only_percentage,
-        total_diff_percentage_rows = source_and_target_total_diff_percentage,
+        dup_source_percentage_rows=source_dup_percentage,
+        dup_target_percentage_rows=target_dup_percentage,
+        source_only_percentage_rows=source_only_percentage,
+        target_only_percentage_rows=target_only_percentage,
+        total_diff_percentage_rows=source_and_target_total_diff_percentage,
         #
-        max_diff_percentage_cols = diff_col_metrics['max_pct'],
-        median_diff_percentage_cols = diff_col_metrics['median_pct'],
+        max_diff_percentage_cols=diff_col_metrics['max_pct'],
+        median_diff_percentage_cols=diff_col_metrics['median_pct'],
        #
-        final_diff_score = final_diff_score,
-        final_score = 100 - final_diff_score
-        )
+        final_diff_score=final_diff_score,
+        final_score=100 - final_diff_score,
+    )
 
     comparison_diff_detais = ComparisonDiffDetails(
-        mismatches_per_column = diff_col_counters,
-        discrepancies_per_col_examples = diff_col_examples,
-        dup_source_keys_examples = source_dup_keys_examples,
-        dup_target_keys_examples = target_dup_keys_examples,
-        source_only_keys_examples = xor_source_only_keys_examples,
-        target_only_keys_examples = xor_target_only_keys_examples,
-        discrepant_data_examples = xor_df_multi_example,
-        common_attribute_columns=non_key_columns)
+        mismatches_per_column=diff_col_counters,
+        discrepancies_per_col_examples=diff_col_examples,
+        dup_source_keys_examples=source_dup_keys_examples,
+        dup_target_keys_examples=target_dup_keys_examples,
+        source_only_keys_examples=xor_source_only_keys_examples,
+        target_only_keys_examples=xor_target_only_keys_examples,
+        discrepant_data_examples=xor_df_multi_example,
+        common_attribute_columns=non_key_columns,
+    )
 
     app_logger.info('end')
     return comparison_stats, comparison_diff_detais
 
 
 def _validate_input_data(
-    source_df: pd.DataFrame,
-    target_df: pd.DataFrame,
-    key_columns: List[str]
+    source_df: pd.DataFrame, target_df: pd.DataFrame, key_columns: List[str]
 ) -> None:
     """Input data validation"""
     if not all(col in source_df.columns for col in key_columns):
         missing = [col for col in key_columns if col not in source_df.columns]
-        raise ValueError(f"Key columns missing in source: {missing}")
+        raise ValueError(f'Key columns missing in source: {missing}')
 
     if not all(col in target_df.columns for col in key_columns):
         missing = [col for col in key_columns if col not in target_df.columns]
-        raise ValueError(f"Key columns missing in target: {missing}")
+        raise ValueError(f'Key columns missing in target: {missing}')
 
 
 def _create_keys_set(df: pd.DataFrame, key_columns: List[str]) -> set:
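
For readers skimming the hunk above: final_diff_score is a weighted sum of the five discrepancy percentages (0.1 each for duplicates, 0.15 each for one-side-only keys, 0.5 for mismatched common rows), and final_score is its complement out of 100. A quick worked example with invented percentages (not taken from the package):

    dup_src, dup_trg = 1.0, 0.0    # duplicated rows, % per side
    src_only, trg_only = 2.0, 4.0  # keys present on one side only, %
    total_diff = 10.0              # common-key rows whose attributes differ, %

    final_diff_score = (dup_src * 0.1 + dup_trg * 0.1
                        + src_only * 0.15 + trg_only * 0.15
                        + total_diff * 0.5)   # = 6.0
    final_score = 100 - final_diff_score      # = 94.0
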
@@ -362,154 +404,173 @@ def _create_keys_set(df: pd.DataFrame, key_columns: List[str]) -> set:
     return set(df[key_columns].itertuples(index=False, name=None))
 
 
-def generate_comparison_sample_report(source_table:str,
-                                      target_table:str,
-                                      stats: ComparisonStats,
-                                      details: ComparisonDiffDetails,
-                                      timezone: str,
-                                      source_query: str = None,
-                                      source_params: Dict = None,
-                                      target_query: str = None,
-                                      target_params: Dict = None) -> None:
+def generate_comparison_sample_report(
+    source_table: str,
+    target_table: str,
+    stats: ComparisonStats,
+    details: ComparisonDiffDetails,
+    timezone: str,
+    source_query: str = None,
+    source_params: Dict = None,
+    target_query: str = None,
+    target_params: Dict = None,
+) -> None:
     """Generate comparison report (logger output looks uuugly)"""
     rl = []
-    rl.append("=" * 80)
+    rl.append('=' * 80)
     current_datetime = datetime.now()
     rl.append(current_datetime.strftime(DATETIME_FORMAT))
-    rl.append(f"DATA SAMPLE COMPARISON REPORT: ")
-    if source_table and target_table: #empty for custom query
-        rl.append(f"{source_table}")
-        rl.append(f"VS")
-        rl.append(f"{target_table}")
-    rl.append("=" * 80)
+    rl.append(f'DATA SAMPLE COMPARISON REPORT: ')
+    if source_table and target_table:  # empty for custom query
+        rl.append(f'{source_table}')
+        rl.append(f'VS')
+        rl.append(f'{target_table}')
+    rl.append('=' * 80)
 
     if source_query and target_query:
-        rl.append(f"timezone: {timezone}")
-        rl.append(f" {source_query}")
+        rl.append(f'timezone: {timezone}')
+        rl.append(f' {source_query}')
         if source_params:
-            rl.append(f" params: {source_params}")
-        rl.append("-" * 40)
-        rl.append(f" {target_query}")
+            rl.append(f' params: {source_params}')
+        rl.append('-' * 40)
+        rl.append(f' {target_query}')
         if target_params:
-            rl.append(f" params: {target_params}")
-
-        rl.append("-" * 40)
-
-    rl.append(f"\nSUMMARY:")
-    rl.append(f" Source rows: {stats.total_source_rows}")
-    rl.append(f" Target rows: {stats.total_target_rows}")
-    rl.append(f" Duplicated source rows: {stats.dup_source_rows}")
-    rl.append(f" Duplicated target rows: {stats.dup_target_rows}")
-    rl.append(f" Only source rows: {stats.only_source_rows}")
-    rl.append(f" Only target rows: {stats.only_target_rows}")
-    rl.append(f" Common rows (by primary key): {stats.common_pk_rows}")
-    rl.append(f" Totally matched rows: {stats.total_matched_rows}")
-    rl.append("-"*40)
-    rl.append(f" Source only rows %: {stats.source_only_percentage_rows:.5f}")
-    rl.append(f" Target only rows %: {stats.target_only_percentage_rows:.5f}")
-    rl.append(f" Duplicated source rows %: {stats.dup_source_percentage_rows:.5f}")
-    rl.append(f" Duplicated target rows %: {stats.dup_target_percentage_rows:.5f}")
-    rl.append(f" Mismatched rows %: {stats.total_diff_percentage_rows:.5f}")
-    rl.append(f" Final discrepancies score: {stats.final_diff_score:.5f}")
-    rl.append(f" Final data quality score: {stats.final_score:.5f}")
-
-
-    rl.append(f" Source-only key examples: {details.source_only_keys_examples}")
-    rl.append(f" Target-only key examples: {details.target_only_keys_examples}")
-
-    rl.append(f" Duplicated source key examples: {details.dup_source_keys_examples}")
-    rl.append(f" Duplicated target key examples: {details.dup_target_keys_examples}")
-
-    rl.append(f" Common attribute columns: {', '.join(details.common_attribute_columns)}")
-    rl.append(f" Skipped source columns: {', '.join(details.skipped_source_columns)}")
-    rl.append(f" Skipped target columns: {', '.join(details.skipped_target_columns)}")
+            rl.append(f' params: {target_params}')
+
+        rl.append('-' * 40)
+
+    rl.append(f'\nSUMMARY:')
+    rl.append(f' Source rows: {stats.total_source_rows}')
+    rl.append(f' Target rows: {stats.total_target_rows}')
+    rl.append(f' Duplicated source rows: {stats.dup_source_rows}')
+    rl.append(f' Duplicated target rows: {stats.dup_target_rows}')
+    rl.append(f' Only source rows: {stats.only_source_rows}')
+    rl.append(f' Only target rows: {stats.only_target_rows}')
+    rl.append(f' Common rows (by primary key): {stats.common_pk_rows}')
+    rl.append(f' Totally matched rows: {stats.total_matched_rows}')
+    rl.append('-' * 40)
+    rl.append(f' Source only rows %: {stats.source_only_percentage_rows:.5f}')
+    rl.append(f' Target only rows %: {stats.target_only_percentage_rows:.5f}')
+    rl.append(f' Duplicated source rows %: {stats.dup_source_percentage_rows:.5f}')
+    rl.append(f' Duplicated target rows %: {stats.dup_target_percentage_rows:.5f}')
+    rl.append(f' Mismatched rows %: {stats.total_diff_percentage_rows:.5f}')
+    rl.append(f' Final discrepancies score: {stats.final_diff_score:.5f}')
+    rl.append(f' Final data quality score: {stats.final_score:.5f}')
+
+    rl.append(f' Source-only key examples: {details.source_only_keys_examples}')
+    rl.append(f' Target-only key examples: {details.target_only_keys_examples}')
+
+    rl.append(f' Duplicated source key examples: {details.dup_source_keys_examples}')
+    rl.append(f' Duplicated target key examples: {details.dup_target_keys_examples}')
+
+    rl.append(
+        f' Common attribute columns: {", ".join(details.common_attribute_columns)}'
+    )
+    rl.append(f' Skipped source columns: {", ".join(details.skipped_source_columns)}')
+    rl.append(f' Skipped target columns: {", ".join(details.skipped_target_columns)}')
 
     if stats.max_diff_percentage_cols > 0 and not details.mismatches_per_column.empty:
-        rl.append(f"\nCOLUMN DIFFERENCES:")
+        rl.append(f'\nCOLUMN DIFFERENCES:')
 
-        rl.append(f" Discrepancies per column (max %): {stats.max_diff_percentage_cols:.5f}")
-        rl.append(f" Count of mismatches per column:\n")
+        rl.append(
+            f' Discrepancies per column (max %): {stats.max_diff_percentage_cols:.5f}'
+        )
+        rl.append(f' Count of mismatches per column:\n')
         rl.append(details.mismatches_per_column.to_string(index=False))
 
-        rl.append(f" Some examples:\n")
-        rl.append (details.discrepancies_per_col_examples.to_string(index=False, max_colwidth=64,justify='left'))
-
+        rl.append(f' Some examples:\n')
+        rl.append(
+            details.discrepancies_per_col_examples.to_string(
+                index=False, max_colwidth=64, justify='left'
+            )
+        )
 
     # Display sample data if available
-    if details.discrepant_data_examples is not None and not details.discrepant_data_examples.empty:
-        rl.append(f"\nDISCREPANT DATA (first pairs):")
-        rl.append("Sorted by primary key and dataset:")
-        rl.append(f"\n")
-        rl.append(details.discrepant_data_examples.to_string(index=False, max_colwidth=64,justify='left'))
-        rl.append(f"\n")
-
-    rl.append("=" * 80)
-
-    return "\n".join(rl)
-
-def generate_comparison_count_report(source_table:str,
-                                     target_table:str,
-                                     stats: ComparisonStats,
-                                     details: ComparisonDiffDetails,
-                                     total_source_count:int,
-                                     total_target_count:int,
-                                     discrepancies_counters_percentage:int,
-                                     result_diff_in_counters:int,
-                                     result_equal_in_counters:int,
-                                     timezone: str,
-                                     source_query: str = None,
-                                     source_params: Dict = None,
-                                     target_query: str = None,
-                                     target_params: Dict = None) -> None:
-
+    if (
+        details.discrepant_data_examples is not None
+        and not details.discrepant_data_examples.empty
+    ):
+        rl.append(f'\nDISCREPANT DATA (first pairs):')
+        rl.append('Sorted by primary key and dataset:')
+        rl.append(f'\n')
+        rl.append(
+            details.discrepant_data_examples.to_string(
+                index=False, max_colwidth=64, justify='left'
+            )
+        )
+        rl.append(f'\n')
+
+    rl.append('=' * 80)
+
+    return '\n'.join(rl)
+
+
+def generate_comparison_count_report(
+    source_table: str,
+    target_table: str,
+    stats: ComparisonStats,
+    details: ComparisonDiffDetails,
+    total_source_count: int,
+    total_target_count: int,
+    discrepancies_counters_percentage: int,
+    result_diff_in_counters: int,
+    result_equal_in_counters: int,
+    timezone: str,
+    source_query: str = None,
+    source_params: Dict = None,
+    target_query: str = None,
+    target_params: Dict = None,
+) -> None:
     """Generates comparison report (logger output looks uuugly)"""
     rl = []
-    rl.append("=" * 80)
+    rl.append('=' * 80)
     current_datetime = datetime.now()
     rl.append(current_datetime.strftime(DATETIME_FORMAT))
-    rl.append(f"COUNT COMPARISON REPORT:")
-    rl.append(f"{source_table}")
-    rl.append(f"VS")
-    rl.append(f"{target_table}")
-    rl.append("=" * 80)
+    rl.append(f'COUNT COMPARISON REPORT:')
+    rl.append(f'{source_table}')
+    rl.append(f'VS')
+    rl.append(f'{target_table}')
+    rl.append('=' * 80)
 
     if source_query and target_query:
-        rl.append(f"timezone: {timezone}")
-        rl.append(f" {source_query}")
+        rl.append(f'timezone: {timezone}')
+        rl.append(f' {source_query}')
         if source_params:
-            rl.append(f" params: {source_params}")
-        rl.append("-" * 40)
-        rl.append(f" {target_query}")
+            rl.append(f' params: {source_params}')
+        rl.append('-' * 40)
+        rl.append(f' {target_query}')
         if target_params:
-            rl.append(f" params: {target_params}")
-        rl.append("-" * 40)
-
-    rl.append(f"\nSUMMARY:")
-    rl.append(f" Source total count: {total_source_count}")
-    rl.append(f" Target total count: {total_target_count}")
-    rl.append(f" Common total count: {result_equal_in_counters}")
-    rl.append(f" Diff total count: {result_diff_in_counters}")
-    rl.append(f" Discrepancies percentage: {discrepancies_counters_percentage:.5f}%")
-    rl.append(f" Final discrepancies score: {discrepancies_counters_percentage:.5f}")
-    rl.append(f" Final data quality score: {(100-discrepancies_counters_percentage):.5f}")
-    if not details.mismatches_per_column.empty :
-
-
-        rl.append(f"\nDETAIL DIFFERENCES:")
-        rl.append (details.mismatches_per_column.to_string(index=False)
-
-        )
+            rl.append(f' params: {target_params}')
+        rl.append('-' * 40)
+
+    rl.append(f'\nSUMMARY:')
+    rl.append(f' Source total count: {total_source_count}')
+    rl.append(f' Target total count: {total_target_count}')
+    rl.append(f' Common total count: {result_equal_in_counters}')
+    rl.append(f' Diff total count: {result_diff_in_counters}')
+    rl.append(f' Discrepancies percentage: {discrepancies_counters_percentage:.5f}%')
+    rl.append(f' Final discrepancies score: {discrepancies_counters_percentage:.5f}')
+    rl.append(
+        f' Final data quality score: {(100 - discrepancies_counters_percentage):.5f}'
+    )
+    if not details.mismatches_per_column.empty:
+        rl.append(f'\nDETAIL DIFFERENCES:')
+        rl.append(details.mismatches_per_column.to_string(index=False))
 
     # Display sample data if available
-    if details.discrepant_data_examples is not None and not details.discrepant_data_examples.empty:
-        rl.append(f"\nDISCREPANT DATA (first pairs):")
-        rl.append("Sorted by primary key and dataset:")
-        rl.append(f"\n")
+    if (
+        details.discrepant_data_examples is not None
+        and not details.discrepant_data_examples.empty
+    ):
+        rl.append(f'\nDISCREPANT DATA (first pairs):')
+        rl.append('Sorted by primary key and dataset:')
+        rl.append(f'\n')
         rl.append(details.discrepant_data_examples.to_string(index=False))
-        rl.append(f"\n")
-    rl.append("=" * 80)
+        rl.append(f'\n')
+    rl.append('=' * 80)
+
+    return '\n'.join(rl)
 
-    return "\n".join(rl)
 
 def safe_remove_zeros(x):
     if pd.isna(x):
@@ -518,11 +579,11 @@ def safe_remove_zeros(x):
         return int(x)
     return x
 
+
 def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Prepare DataFrame for comparison by handling nulls and empty strings"""
     df = df.map(safe_remove_zeros)
 
-
     df = df.fillna(NULL_REPLACEMENT)
     df = df.replace(r'(?i)^(None|nan|NaN|NaT|\s*)$', NULL_REPLACEMENT, regex=True)
 
@@ -530,26 +591,36 @@ def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
 
     return df
 
+
 def exclude_by_keys(df, key_columns, exclude_set):
     if len(key_columns) == 1:
         exclude_values = [x[0] for x in exclude_set]
         return df[~df[key_columns[0]].isin(exclude_values)]
     else:
-        return df[~df.apply(lambda row: tuple(row[col] for col in key_columns) in exclude_set, axis=1)]
+        return df[
+            ~df.apply(
+                lambda row: tuple(row[col] for col in key_columns) in exclude_set,
+                axis=1,
+            )
+        ]
 
 
-def clean_recently_changed_data(df1:pd.DataFrame, df2:pd.DataFrame, primary_keys:List[str]):
+def clean_recently_changed_data(
+    df1: pd.DataFrame, df2: pd.DataFrame, primary_keys: List[str]
+):
     """
     Mutually removes rows with recently changed records
 
     Parameters:
     df1, df2: pandas.DataFrame
-    primary_keys: list 
+    primary_keys: list
 
     Returns:
     tuple: (df1_processed, df2_processed)
     """
-    app_logger.info(f'before exclusion recently changed rows source: {len(df1)}, target {len(df2)}')
+    app_logger.info(
+        f'before exclusion recently changed rows source: {len(df1)}, target {len(df2)}'
+    )
 
     filtered_df1 = df1.copy()
     filtered_df2 = df2.copy()
@@ -557,21 +628,26 @@ def clean_recently_changed_data(df1:pd.DataFrame, df2:pd.DataFrame, primary_keys
     filtered_df1 = filtered_df1.loc[filtered_df1['xrecently_changed'] == 'y']
     filtered_df2 = filtered_df2.loc[filtered_df2['xrecently_changed'] == 'y']
 
-    excluded_from_df1_keys = _create_keys_set(filtered_df1,primary_keys)
-    excluded_from_df2_keys = _create_keys_set(filtered_df2,primary_keys)
+    excluded_from_df1_keys = _create_keys_set(filtered_df1, primary_keys)
+    excluded_from_df2_keys = _create_keys_set(filtered_df2, primary_keys)
 
     excluded_keys = excluded_from_df1_keys | excluded_from_df2_keys
-    df1_processed = exclude_by_keys(df1, primary_keys, excluded_keys).drop('xrecently_changed', axis=1)
-    df2_processed = exclude_by_keys(df2, primary_keys, excluded_keys).drop('xrecently_changed', axis=1)
+    df1_processed = exclude_by_keys(df1, primary_keys, excluded_keys).drop(
+        'xrecently_changed', axis=1
+    )
+    df2_processed = exclude_by_keys(df2, primary_keys, excluded_keys).drop(
+        'xrecently_changed', axis=1
+    )
 
-    app_logger.info(f'after exclusion recently changed rows source: {len(df1_processed)}, target {len(df2_processed)}')
+    app_logger.info(
+        f'after exclusion recently changed rows source: {len(df1_processed)}, target {len(df2_processed)}'
+    )
 
     return df1_processed, df2_processed
 
 
 def find_count_discrepancies(
-    source_counts: pd.DataFrame,
-    target_counts: pd.DataFrame
+    source_counts: pd.DataFrame, target_counts: pd.DataFrame
 ) -> pd.DataFrame:
     """Find discrepancies in daily row counts between source and target"""
     source_counts['flg'] = 'source'
@@ -580,44 +656,41 @@ def find_count_discrepancies(
     # Find mismatches in counts per date
     all_counts = pd.concat([source_counts, target_counts])
     discrepancies = all_counts.drop_duplicates(
-        subset=['dt', 'cnt'],
-        keep=False
-    ).sort_values(
-        by=['dt', 'flg'],
-        ascending=[False, True]
-    )
+        subset=['dt', 'cnt'], keep=False
+    ).sort_values(by=['dt', 'flg'], ascending=[False, True])
 
     return discrepancies
 
+
 def create_result_message(
     source_total: int,
     target_total: int,
     discrepancies: pd.DataFrame,
-    comparison_type: str
+    comparison_type: str,
 ) -> str:
     """Create standardized result message"""
     if discrepancies.empty:
-        return f"{comparison_type} match: Source={source_total}, Target={target_total}"
+        return f'{comparison_type} match: Source={source_total}, Target={target_total}'
 
     mismatch_count = len(discrepancies)
     diff = source_total - target_total
-    diff_msg = f" (Δ={diff})" if diff != 0 else ""
+    diff_msg = f' (Δ={diff})' if diff != 0 else ''
 
     return (
-        f"{comparison_type} mismatch: Source={source_total}, Target={target_total}{diff_msg}, "
-        f"{mismatch_count} discrepancies found"
+        f'{comparison_type} mismatch: Source={source_total}, Target={target_total}{diff_msg}, '
+        f'{mismatch_count} discrepancies found'
    )
 
+
 def filter_columns(
-    df: pd.DataFrame,
-    columns: List[str],
-    exclude: Optional[List[str]] = None
+    df: pd.DataFrame, columns: List[str], exclude: Optional[List[str]] = None
 ) -> pd.DataFrame:
     """Filter DataFrame columns with optional exclusions"""
     if exclude:
         columns = [col for col in columns if col not in exclude]
     return df[columns]
 
+
 def cross_fill_missing_dates(df1, df2, date_column='dt', value_column='cnt'):
     """
     Fill missing dates between tow dataframes
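
find_count_discrepancies (above) leans on drop_duplicates(keep=False) to take a symmetric difference of per-date counts. An illustrative rerun of that trick with invented data (the column names dt/cnt/flg come from the code; everything else is made up):

    import pandas as pd

    src = pd.DataFrame({'dt': ['2024-01-01', '2024-01-02'], 'cnt': [10, 20]})
    trg = pd.DataFrame({'dt': ['2024-01-01', '2024-01-02'], 'cnt': [10, 21]})
    src['flg'] = 'source'
    trg['flg'] = 'target'

    # (dt, cnt) pairs present in both frames are dropped entirely (keep=False);
    # only the 2024-01-02 rows survive, one flagged 'source' and one 'target'.
    mismatches = pd.concat([src, trg]).drop_duplicates(subset=['dt', 'cnt'], keep=False)
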
@@ -636,6 +709,7 @@ def cross_fill_missing_dates(df1, df2, date_column='dt', value_column='cnt'):
 
     return df1_full, df2_full
 
+
 def format_keys(keys, max_examples):
     if keys:
         keys = {next(iter(x)) if len(x) == 1 else x for x in list(keys)[:max_examples]}
@@ -644,12 +718,14 @@ def format_keys(keys, max_examples):
     else:
         return None
 
+
 def get_dataframe_size_gb(df: pd.DataFrame) -> float:
     """Calculate DataFrame size in GB"""
     if df.empty:
         return 0.0
     return df.memory_usage(deep=True).sum() / 1024 / 1024 / 1024
 
+
 def validate_dataframe_size(df: pd.DataFrame, max_size_gb: float) -> None:
     """Validate DataFrame size and raise exception if exceeds limit"""
     if df is None:
@@ -659,6 +735,6 @@ def validate_dataframe_size(df: pd.DataFrame, max_size_gb: float) -> None:
 
     if size_gb > max_size_gb:
         raise ValueError(
-            f"DataFrame size {size_gb:.2f} GB exceeds limit of {max_size_gb} GB. "
-            f"Shape: {df.shape}"
-            )
+            f'DataFrame size {size_gb:.2f} GB exceeds limit of {max_size_gb} GB. '
+            f'Shape: {df.shape}'
+        )
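
For orientation, a hypothetical end-to-end call based only on the signatures visible in this diff (and assuming the typing/defaultdict import noted above is fixed); the frames and column names here are invented:

    import pandas as pd
    from xoverrr.utils import compare_dataframes

    source = pd.DataFrame({'id': [1, 2, 3], 'val': ['a', 'b', 'c']})
    target = pd.DataFrame({'id': [1, 2, 4], 'val': ['a', 'x', 'd']})

    # Returns a (ComparisonStats, ComparisonDiffDetails) pair.
    stats, details = compare_dataframes(source, target, key_columns=['id'])
    print(f'final data quality score: {stats.final_score:.2f}')
    print(details.mismatches_per_column)
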