xoverrr 1.1.4-py3-none-any.whl → 1.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xoverrr/utils.py CHANGED
@@ -1,21 +1,31 @@
-import pandas as pd
-import numpy as np
-from typing import Dict, Any, List, Optional, Tuple, defaultdict
+from dataclasses import dataclass, field
 from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple, defaultdict
+
+import numpy as np
+import pandas as pd
 
-try:
-    from .constants import NULL_REPLACEMENT, DEFAULT_MAX_EXAMPLES, DATETIME_FORMAT
-    from .logger import app_logger
-except ImportError:
-    # for cases when used as standalone script
-    from constants import NULL_REPLACEMENT, DEFAULT_MAX_EXAMPLES, DATETIME_FORMAT
-    from logger import app_logger
+from .constants import DATETIME_FORMAT, DEFAULT_MAX_EXAMPLES, NULL_REPLACEMENT
+from .logger import app_logger
+
+
+def normalize_column_names(columns: List[str]) -> List[str]:
+    """
+    Normalize column names to lowercase for consistent comparison.
+
+    Parameters:
+        columns: List of column names to normalize
+
+    Returns:
+        List of lowercased column names
+    """
+    return [col.lower() for col in columns] if columns else []
 
-from dataclasses import dataclass, field
 
 @dataclass
 class ComparisonStats:
     """Class for storing comparison statistics"""
+
     total_source_rows: int
     total_target_rows: int
 
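The only functional addition in this hunk is the new `normalize_column_names` helper; the rest is import reordering (the try/except standalone-import fallback is dropped) and black-style reformatting. A quick sketch of the helper's expected behavior, with made-up column names:

    normalize_column_names(['ID', 'CustomerName', 'dt'])  # -> ['id', 'customername', 'dt']
    normalize_column_names([])                            # -> []
    normalize_column_names(None)                          # -> [] (falsy input short-circuits)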
@@ -32,13 +42,14 @@ class ComparisonStats:
 
     source_only_percentage_rows: float
     target_only_percentage_rows: float
-    total_diff_percentage_rows : float
+    total_diff_percentage_rows: float
     #
-    max_diff_percentage_cols : float
+    max_diff_percentage_cols: float
     median_diff_percentage_cols: float
     #
     final_diff_score: float
-    final_score : float
+    final_score: float
+
 
 @dataclass
 class ComparisonDiffDetails:
@@ -51,16 +62,14 @@ class ComparisonDiffDetails:
     source_only_keys_examples: tuple
     target_only_keys_examples: tuple
 
-    discrepant_data_examples: pd.DataFrame 
+    discrepant_data_examples: pd.DataFrame
     common_attribute_columns: List[str]
-    skipped_source_columns: List[str]= field(default_factory=list)
-    skipped_target_columns: List[str]= field(default_factory=list)
+    skipped_source_columns: List[str] = field(default_factory=list)
+    skipped_target_columns: List[str] = field(default_factory=list)
 
 
 def compare_dataframes_meta(
-    df1: pd.DataFrame,
-    df2: pd.DataFrame,
-    primary_keys: List[str] = None
+    df1: pd.DataFrame, df2: pd.DataFrame, primary_keys: List[str] = None
 ) -> List[str]:
     """
     Compare two pandas DataFrames and find common and different columns.
@@ -92,9 +101,12 @@ def compare_dataframes_meta(
 
     return common_columns
 
-def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_keys_cnt, examples_count=3):
 
-    metrics = {'max_pct' : 0.0, 'median_pct' : 0.0}
+def analyze_column_discrepancies(
+    df, primary_key_columns, value_columns, common_keys_cnt, examples_count=3
+):
+
+    metrics = {'max_pct': 0.0, 'median_pct': 0.0}
     diff_counters = defaultdict(int)
     diff_examples = {col: [] for col in value_columns}
 
@@ -118,10 +130,11 @@ def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_
             src_val = getattr(src_row, col)
             trg_val = getattr(trg_row, col)
             if src_val != trg_val:
-
                 diff_counters[col] += 1
                 if len(diff_examples[col]) < examples_count:
-                    diff_examples[col].append({'pk': pk_value, 'src_val': src_val, 'trg_val': trg_val })
+                    diff_examples[col].append(
+                        {'pk': pk_value, 'src_val': src_val, 'trg_val': trg_val}
+                    )
 
     # filter out cols without examples
     diff_examples = {k: v for k, v in diff_examples.items() if v}
@@ -131,7 +144,6 @@ def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_
     metrics['max_pct'] = max_pct
     metrics['median_pct'] = median_pct
 
-
     # transform to dataframes
     # 1
     diff_records = []
@@ -149,8 +161,8 @@ def analyze_column_discrepancies(df, primary_key_columns, value_columns, common_
     # 2
     df_diff_counters = pd.DataFrame(
         list(diff_counters.items()),  # convert to a list of tuples
-        columns=['column_name', 'mismatch_count']  # name the columns
-        )
+        columns=['column_name', 'mismatch_count'],  # name the columns
+    )
 
     return metrics, df_diff_examples, df_diff_counters
 
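For reference, the `pd.DataFrame(list(counters.items()), columns=[...])` idiom used here turns the per-column mismatch counters into a two-column frame. A minimal sketch with invented counts:

    from collections import defaultdict
    import pandas as pd

    diff_counters = defaultdict(int)
    diff_counters['price'] += 2
    diff_counters['qty'] += 1
    pd.DataFrame(list(diff_counters.items()), columns=['column_name', 'mismatch_count'])
    #   column_name  mismatch_count
    # 0       price               2
    # 1         qty               1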
@@ -159,7 +171,7 @@ def compare_dataframes(
     source_df: pd.DataFrame,
     target_df: pd.DataFrame,
     key_columns: List[str],
-    max_examples: int = DEFAULT_MAX_EXAMPLES
+    max_examples: int = DEFAULT_MAX_EXAMPLES,
 ) -> tuple[ComparisonStats, ComparisonDiffDetails]:
     """
     Efficient comparison of two dataframes by primary key when discrepancies ratio quite small,
@@ -196,8 +208,12 @@ def compare_dataframes(
     source_dup = source_df[source_df.duplicated(subset=key_columns, keep=False)]
     target_dup = target_df[target_df.duplicated(subset=key_columns, keep=False)]
 
-    source_dup_keys = _create_keys_set(source_dup, key_columns) if not source_dup.empty else set()
-    target_dup_keys = _create_keys_set(target_dup, key_columns) if not target_dup.empty else set()
+    source_dup_keys = (
+        _create_keys_set(source_dup, key_columns) if not source_dup.empty else set()
+    )
+    target_dup_keys = (
+        _create_keys_set(target_dup, key_columns) if not target_dup.empty else set()
+    )
 
     source_dup_keys_examples = format_keys(source_dup_keys, max_examples)
     target_dup_keys_examples = format_keys(target_dup_keys, max_examples)
@@ -218,13 +234,16 @@ def compare_dataframes(
     xor_combined_df = (
         pd.concat([source_clean, target_clean], ignore_index=True)
         .drop_duplicates(subset=key_columns + non_key_columns, keep=False)
-        .assign(xcount_pairs=lambda df: df.groupby(key_columns)[key_columns[0]].transform('size'))
+        .assign(
+            xcount_pairs=lambda df: df.groupby(key_columns)[key_columns[0]].transform(
+                'size'
+            )
+        )
     )
 
     # symmetrical difference between two datasets, sorted
     xor_combined_sorted = xor_combined_df.sort_values(
-        by=key_columns + ['xflg'],
-        ascending=[False] * len(key_columns) + [True]
+        by=key_columns + ['xflg'], ascending=[False] * len(key_columns) + [True]
     )
 
     mask = xor_combined_sorted['xcount_pairs'] > 1
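This chain is the core of `compare_dataframes`: concatenating both frames and dropping every row that appears twice removes all exactly-matching record pairs, and `xcount_pairs` then counts surviving rows per key (2 means the key exists on both sides with differing attributes, 1 means it exists on one side only). A standalone sketch under assumed column names, with `id` as the key and `val` as the only attribute:

    import pandas as pd

    source = pd.DataFrame({'id': [1, 2, 3], 'val': ['a', 'b', 'c']})
    target = pd.DataFrame({'id': [1, 2, 4], 'val': ['a', 'B', 'c']})

    xor = (
        pd.concat([source, target], ignore_index=True)
        .drop_duplicates(subset=['id', 'val'], keep=False)  # identical pairs vanish
        .assign(xcount_pairs=lambda df: df.groupby(['id'])['id'].transform('size'))
    )
    # id 1 is dropped (equal in both); id 2 survives twice (same key, 'b' vs 'B');
    # ids 3 and 4 survive once each (present on one side only)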
@@ -238,53 +257,66 @@ def compare_dataframes(
     xor_source_only_keys = _create_keys_set(xor_df_source_only, key_columns)
     xor_target_only_keys = _create_keys_set(xor_df_target_only, key_columns)
 
-    xor_common_keys_cnt = int(len(xor_df_multi)/2) if not xor_df_multi.empty else 0
+    xor_common_keys_cnt = int(len(xor_df_multi) / 2) if not xor_df_multi.empty else 0
     xor_source_only_keys_cnt = len(xor_source_only_keys)
     xor_target_only_keys_cnt = len(xor_target_only_keys)
 
     # take n pairs that is why examples x2
-    xor_df_multi_example = xor_df_multi.head(max_examples*2).drop(columns=['xcount_pairs']) if not xor_df_multi.empty else pd.DataFrame()
+    xor_df_multi_example = (
+        xor_df_multi.head(max_examples * 2).drop(columns=['xcount_pairs'])
+        if not xor_df_multi.empty
+        else pd.DataFrame()
+    )
 
     xor_source_only_keys_examples = format_keys(xor_source_only_keys, max_examples)
     xor_target_only_keys_examples = format_keys(xor_target_only_keys, max_examples)
 
     # get number of records that present in two datasets based on primary key
-    common_keys_cnt = int((len(source_clean) - xor_source_only_keys_cnt + len(target_clean) - xor_target_only_keys_cnt)/2)
+    common_keys_cnt = int(
+        (
+            len(source_clean)
+            - xor_source_only_keys_cnt
+            + len(target_clean)
+            - xor_target_only_keys_cnt
+        )
+        / 2
+    )
 
     if not common_keys_cnt:
-        #Special case when there is no matched primary keys at all
+        # Special case when there is no matched primary keys at all
         comparison_stats = ComparisonStats(
-            total_source_rows = len(source_df),
-            total_target_rows = len(target_df),
-            dup_source_rows = source_dup_cnt,
-            dup_target_rows = target_dup_cnt,
-            only_source_rows = xor_source_only_keys_cnt,
-            only_target_rows = xor_target_only_keys_cnt,
-            common_pk_rows = 0,
-            total_matched_rows= 0,
-            #
-            dup_source_percentage_rows = 100,
-            dup_target_percentage_rows = 100,
-            source_only_percentage_rows = 100,
-            target_only_percentage_rows = 100,
-            total_diff_percentage_rows = 100,
-            #
-            max_diff_percentage_cols = 100,
-            median_diff_percentage_cols = 100,
-            #
-            final_diff_score = 100,
-            final_score = 0
+            total_source_rows=len(source_df),
+            total_target_rows=len(target_df),
+            dup_source_rows=source_dup_cnt,
+            dup_target_rows=target_dup_cnt,
+            only_source_rows=xor_source_only_keys_cnt,
+            only_target_rows=xor_target_only_keys_cnt,
+            common_pk_rows=0,
+            total_matched_rows=0,
+            #
+            dup_source_percentage_rows=100,
+            dup_target_percentage_rows=100,
+            source_only_percentage_rows=100,
+            target_only_percentage_rows=100,
+            total_diff_percentage_rows=100,
+            #
+            max_diff_percentage_cols=100,
+            median_diff_percentage_cols=100,
+            #
+            final_diff_score=100,
+            final_score=0,
        )
 
        comparison_diff_detais = ComparisonDiffDetails(
-            mismatches_per_column = pd.DataFrame(),
-            discrepancies_per_col_examples = pd.DataFrame(),
-            dup_source_keys_examples = source_dup_keys_examples,
-            dup_target_keys_examples = target_dup_keys_examples,
-            common_attribute_columns=non_key_columns,
-            source_only_keys_examples = xor_source_only_keys_examples,
-            target_only_keys_examples = xor_target_only_keys_examples,
-            discrepant_data_examples = pd.DataFrame())
+            mismatches_per_column=pd.DataFrame(),
+            discrepancies_per_col_examples=pd.DataFrame(),
+            dup_source_keys_examples=source_dup_keys_examples,
+            dup_target_keys_examples=target_dup_keys_examples,
+            common_attribute_columns=non_key_columns,
+            source_only_keys_examples=xor_source_only_keys_examples,
+            target_only_keys_examples=xor_target_only_keys_examples,
+            discrepant_data_examples=pd.DataFrame(),
+        )
        app_logger.info('end')
 
        return comparison_stats, comparison_diff_detais
@@ -292,73 +324,79 @@ def compare_dataframes(
     # get number of that totally equal in two datasets
     total_matched_records_cnt = common_keys_cnt - xor_common_keys_cnt
 
-    source_only_percentage = (xor_source_only_keys_cnt/common_keys_cnt)*100
-    target_only_percentage = (xor_target_only_keys_cnt/common_keys_cnt)*100
+    source_only_percentage = (xor_source_only_keys_cnt / common_keys_cnt) * 100
+    target_only_percentage = (xor_target_only_keys_cnt / common_keys_cnt) * 100
 
-    source_dup_percentage = (source_dup_cnt/len(source_df))*100
-    target_dup_percentage = (target_dup_cnt/len(target_df))*100
-
-    diff_col_metrics, \
-    diff_col_examples,\
-    diff_col_counters = analyze_column_discrepancies(xor_df_multi, key_columns, non_key_columns, common_keys_cnt, max_examples)
+    source_dup_percentage = (source_dup_cnt / len(source_df)) * 100
+    target_dup_percentage = (target_dup_cnt / len(target_df)) * 100
 
+    diff_col_metrics, diff_col_examples, diff_col_counters = (
+        analyze_column_discrepancies(
+            xor_df_multi, key_columns, non_key_columns, common_keys_cnt, max_examples
+        )
+    )
 
-    source_and_target_total_diff_percentage = (1-total_matched_records_cnt/common_keys_cnt)*100
+    source_and_target_total_diff_percentage = (
+        1 - total_matched_records_cnt / common_keys_cnt
+    ) * 100
 
-    final_diff_score = source_dup_percentage*0.1 + target_dup_percentage*0.1 + \
-                       source_only_percentage*0.15 + target_only_percentage*0.15 + \
-                       source_and_target_total_diff_percentage*0.5
+    final_diff_score = (
+        source_dup_percentage * 0.1
+        + target_dup_percentage * 0.1
+        + source_only_percentage * 0.15
+        + target_only_percentage * 0.15
+        + source_and_target_total_diff_percentage * 0.5
+    )
 
     comparison_stats = ComparisonStats(
-        total_source_rows = len(source_df),
-        total_target_rows = len(target_df),
-        dup_source_rows = source_dup_cnt,
-        dup_target_rows = target_dup_cnt,
-        only_source_rows = xor_source_only_keys_cnt,
-        only_target_rows = xor_target_only_keys_cnt,
-        common_pk_rows = common_keys_cnt,
-        total_matched_rows= total_matched_records_cnt,
+        total_source_rows=len(source_df),
+        total_target_rows=len(target_df),
+        dup_source_rows=source_dup_cnt,
+        dup_target_rows=target_dup_cnt,
+        only_source_rows=xor_source_only_keys_cnt,
+        only_target_rows=xor_target_only_keys_cnt,
+        common_pk_rows=common_keys_cnt,
+        total_matched_rows=total_matched_records_cnt,
         #
-        dup_source_percentage_rows = source_dup_percentage,
-        dup_target_percentage_rows = target_dup_percentage,
-        source_only_percentage_rows = source_only_percentage,
-        target_only_percentage_rows = target_only_percentage,
-        total_diff_percentage_rows = source_and_target_total_diff_percentage,
+        dup_source_percentage_rows=source_dup_percentage,
+        dup_target_percentage_rows=target_dup_percentage,
+        source_only_percentage_rows=source_only_percentage,
+        target_only_percentage_rows=target_only_percentage,
+        total_diff_percentage_rows=source_and_target_total_diff_percentage,
         #
-        max_diff_percentage_cols = diff_col_metrics['max_pct'],
-        median_diff_percentage_cols = diff_col_metrics['median_pct'],
+        max_diff_percentage_cols=diff_col_metrics['max_pct'],
+        median_diff_percentage_cols=diff_col_metrics['median_pct'],
         #
-        final_diff_score = final_diff_score,
-        final_score = 100 - final_diff_score
-        )
+        final_diff_score=final_diff_score,
+        final_score=100 - final_diff_score,
+    )
 
     comparison_diff_detais = ComparisonDiffDetails(
-        mismatches_per_column = diff_col_counters,
-        discrepancies_per_col_examples = diff_col_examples,
-        dup_source_keys_examples = source_dup_keys_examples,
-        dup_target_keys_examples = target_dup_keys_examples,
-        source_only_keys_examples = xor_source_only_keys_examples,
-        target_only_keys_examples = xor_target_only_keys_examples,
-        discrepant_data_examples = xor_df_multi_example,
-        common_attribute_columns=non_key_columns)
+        mismatches_per_column=diff_col_counters,
+        discrepancies_per_col_examples=diff_col_examples,
+        dup_source_keys_examples=source_dup_keys_examples,
+        dup_target_keys_examples=target_dup_keys_examples,
+        source_only_keys_examples=xor_source_only_keys_examples,
+        target_only_keys_examples=xor_target_only_keys_examples,
+        discrepant_data_examples=xor_df_multi_example,
+        common_attribute_columns=non_key_columns,
+    )
 
     app_logger.info('end')
     return comparison_stats, comparison_diff_detais
 
 
 def _validate_input_data(
-    source_df: pd.DataFrame,
-    target_df: pd.DataFrame,
-    key_columns: List[str]
+    source_df: pd.DataFrame, target_df: pd.DataFrame, key_columns: List[str]
 ) -> None:
     """Input data validation"""
     if not all(col in source_df.columns for col in key_columns):
         missing = [col for col in key_columns if col not in source_df.columns]
-        raise ValueError(f"Key columns missing in source: {missing}")
+        raise ValueError(f'Key columns missing in source: {missing}')
 
     if not all(col in target_df.columns for col in key_columns):
         missing = [col for col in key_columns if col not in target_df.columns]
-        raise ValueError(f"Key columns missing in target: {missing}")
+        raise ValueError(f'Key columns missing in target: {missing}')
 
 
 def _create_keys_set(df: pd.DataFrame, key_columns: List[str]) -> set:
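The weights in the rewrapped `final_diff_score` expression above (0.1 per duplicate percentage, 0.15 per one-side-only percentage, 0.5 for the row-mismatch percentage) sum to 1.0, so the score stays on the same 0-100 scale as its inputs. A worked example with invented percentages:

    final_diff_score = (
        2.0 * 0.1     # 2% duplicated source rows  -> 0.2
        + 0.0 * 0.1   # no duplicated target rows
        + 4.0 * 0.15  # 4% source-only keys        -> 0.6
        + 0.0 * 0.15  # no target-only keys
        + 1.0 * 0.5   # 1% mismatched common rows  -> 0.5
    )                 # 1.3; final_score = 100 - 1.3 = 98.7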
@@ -366,154 +404,173 @@ def _create_keys_set(df: pd.DataFrame, key_columns: List[str]) -> set:
     return set(df[key_columns].itertuples(index=False, name=None))
 
 
-def generate_comparison_sample_report(source_table:str,
-                                      target_table:str,
-                                      stats: ComparisonStats,
-                                      details: ComparisonDiffDetails,
-                                      timezone: str,
-                                      source_query: str = None,
-                                      source_params: Dict = None,
-                                      target_query: str = None,
-                                      target_params: Dict = None) -> None:
+def generate_comparison_sample_report(
+    source_table: str,
+    target_table: str,
+    stats: ComparisonStats,
+    details: ComparisonDiffDetails,
+    timezone: str,
+    source_query: str = None,
+    source_params: Dict = None,
+    target_query: str = None,
+    target_params: Dict = None,
+) -> None:
     """Generate comparison report (logger output looks uuugly)"""
     rl = []
-    rl.append("=" * 80)
+    rl.append('=' * 80)
     current_datetime = datetime.now()
     rl.append(current_datetime.strftime(DATETIME_FORMAT))
-    rl.append(f"DATA SAMPLE COMPARISON REPORT: ")
-    if source_table and target_table: #empty for custom query
-        rl.append(f"{source_table}")
-        rl.append(f"VS")
-        rl.append(f"{target_table}")
-    rl.append("=" * 80)
+    rl.append(f'DATA SAMPLE COMPARISON REPORT: ')
+    if source_table and target_table:  # empty for custom query
+        rl.append(f'{source_table}')
+        rl.append(f'VS')
+        rl.append(f'{target_table}')
+    rl.append('=' * 80)
 
     if source_query and target_query:
-        rl.append(f"timezone: {timezone}")
-        rl.append(f"  {source_query}")
+        rl.append(f'timezone: {timezone}')
+        rl.append(f'  {source_query}')
         if source_params:
-            rl.append(f"    params: {source_params}")
-        rl.append("-" * 40)
-        rl.append(f"  {target_query}")
+            rl.append(f'    params: {source_params}')
+        rl.append('-' * 40)
+        rl.append(f'  {target_query}')
         if target_params:
-            rl.append(f"    params: {target_params}")
-
-    rl.append("-" * 40)
-
-    rl.append(f"\nSUMMARY:")
-    rl.append(f"  Source rows: {stats.total_source_rows}")
-    rl.append(f"  Target rows: {stats.total_target_rows}")
-    rl.append(f"  Duplicated source rows: {stats.dup_source_rows}")
-    rl.append(f"  Duplicated target rows: {stats.dup_target_rows}")
-    rl.append(f"  Only source rows: {stats.only_source_rows}")
-    rl.append(f"  Only target rows: {stats.only_target_rows}")
-    rl.append(f"  Common rows (by primary key): {stats.common_pk_rows}")
-    rl.append(f"  Totally matched rows: {stats.total_matched_rows}")
-    rl.append("-"*40)
-    rl.append(f"  Source only rows %: {stats.source_only_percentage_rows:.5f}")
-    rl.append(f"  Target only rows %: {stats.target_only_percentage_rows:.5f}")
-    rl.append(f"  Duplicated source rows %: {stats.dup_source_percentage_rows:.5f}")
-    rl.append(f"  Duplicated target rows %: {stats.dup_target_percentage_rows:.5f}")
-    rl.append(f"  Mismatched rows %: {stats.total_diff_percentage_rows:.5f}")
-    rl.append(f"  Final discrepancies score: {stats.final_diff_score:.5f}")
-    rl.append(f"  Final data quality score: {stats.final_score:.5f}")
-
-
-    rl.append(f"  Source-only key examples: {details.source_only_keys_examples}")
-    rl.append(f"  Target-only key examples: {details.target_only_keys_examples}")
-
-    rl.append(f"  Duplicated source key examples: {details.dup_source_keys_examples}")
-    rl.append(f"  Duplicated target key examples: {details.dup_target_keys_examples}")
-
-    rl.append(f"  Common attribute columns: {', '.join(details.common_attribute_columns)}")
-    rl.append(f"  Skipped source columns: {', '.join(details.skipped_source_columns)}")
-    rl.append(f"  Skipped target columns: {', '.join(details.skipped_target_columns)}")
+            rl.append(f'    params: {target_params}')
+
+    rl.append('-' * 40)
+
+    rl.append(f'\nSUMMARY:')
+    rl.append(f'  Source rows: {stats.total_source_rows}')
+    rl.append(f'  Target rows: {stats.total_target_rows}')
+    rl.append(f'  Duplicated source rows: {stats.dup_source_rows}')
+    rl.append(f'  Duplicated target rows: {stats.dup_target_rows}')
+    rl.append(f'  Only source rows: {stats.only_source_rows}')
+    rl.append(f'  Only target rows: {stats.only_target_rows}')
+    rl.append(f'  Common rows (by primary key): {stats.common_pk_rows}')
+    rl.append(f'  Totally matched rows: {stats.total_matched_rows}')
+    rl.append('-' * 40)
+    rl.append(f'  Source only rows %: {stats.source_only_percentage_rows:.5f}')
+    rl.append(f'  Target only rows %: {stats.target_only_percentage_rows:.5f}')
+    rl.append(f'  Duplicated source rows %: {stats.dup_source_percentage_rows:.5f}')
+    rl.append(f'  Duplicated target rows %: {stats.dup_target_percentage_rows:.5f}')
+    rl.append(f'  Mismatched rows %: {stats.total_diff_percentage_rows:.5f}')
+    rl.append(f'  Final discrepancies score: {stats.final_diff_score:.5f}')
+    rl.append(f'  Final data quality score: {stats.final_score:.5f}')
+
+    rl.append(f'  Source-only key examples: {details.source_only_keys_examples}')
+    rl.append(f'  Target-only key examples: {details.target_only_keys_examples}')
+
+    rl.append(f'  Duplicated source key examples: {details.dup_source_keys_examples}')
+    rl.append(f'  Duplicated target key examples: {details.dup_target_keys_examples}')
+
+    rl.append(
+        f'  Common attribute columns: {", ".join(details.common_attribute_columns)}'
+    )
+    rl.append(f'  Skipped source columns: {", ".join(details.skipped_source_columns)}')
+    rl.append(f'  Skipped target columns: {", ".join(details.skipped_target_columns)}')
 
     if stats.max_diff_percentage_cols > 0 and not details.mismatches_per_column.empty:
-        rl.append(f"\nCOLUMN DIFFERENCES:")
+        rl.append(f'\nCOLUMN DIFFERENCES:')
 
-        rl.append(f"  Discrepancies per column (max %): {stats.max_diff_percentage_cols:.5f}")
-        rl.append(f"  Count of mismatches per column:\n")
+        rl.append(
+            f'  Discrepancies per column (max %): {stats.max_diff_percentage_cols:.5f}'
+        )
+        rl.append(f'  Count of mismatches per column:\n')
         rl.append(details.mismatches_per_column.to_string(index=False))
 
-        rl.append(f"  Some examples:\n")
-        rl.append (details.discrepancies_per_col_examples.to_string(index=False, max_colwidth=64,justify='left'))
-
+        rl.append(f'  Some examples:\n')
+        rl.append(
+            details.discrepancies_per_col_examples.to_string(
+                index=False, max_colwidth=64, justify='left'
+            )
+        )
 
     # Display sample data if available
-    if details.discrepant_data_examples is not None and not details.discrepant_data_examples.empty:
-        rl.append(f"\nDISCREPANT DATA (first pairs):")
-        rl.append("Sorted by primary key and dataset:")
-        rl.append(f"\n")
-        rl.append(details.discrepant_data_examples.to_string(index=False, max_colwidth=64,justify='left'))
-        rl.append(f"\n")
-
-    rl.append("=" * 80)
-
-    return "\n".join(rl)
-
-def generate_comparison_count_report(source_table:str,
-                                     target_table:str,
-                                     stats: ComparisonStats,
-                                     details: ComparisonDiffDetails,
-                                     total_source_count:int,
-                                     total_target_count:int,
-                                     discrepancies_counters_percentage:int,
-                                     result_diff_in_counters:int,
-                                     result_equal_in_counters:int,
-                                     timezone: str,
-                                     source_query: str = None,
-                                     source_params: Dict = None,
-                                     target_query: str = None,
-                                     target_params: Dict = None) -> None:
-
+    if (
+        details.discrepant_data_examples is not None
+        and not details.discrepant_data_examples.empty
+    ):
+        rl.append(f'\nDISCREPANT DATA (first pairs):')
+        rl.append('Sorted by primary key and dataset:')
+        rl.append(f'\n')
+        rl.append(
+            details.discrepant_data_examples.to_string(
+                index=False, max_colwidth=64, justify='left'
+            )
+        )
+        rl.append(f'\n')
+
+    rl.append('=' * 80)
+
+    return '\n'.join(rl)
+
+
+def generate_comparison_count_report(
+    source_table: str,
+    target_table: str,
+    stats: ComparisonStats,
+    details: ComparisonDiffDetails,
+    total_source_count: int,
+    total_target_count: int,
+    discrepancies_counters_percentage: int,
+    result_diff_in_counters: int,
+    result_equal_in_counters: int,
+    timezone: str,
+    source_query: str = None,
+    source_params: Dict = None,
+    target_query: str = None,
+    target_params: Dict = None,
+) -> None:
     """Generates comparison report (logger output looks uuugly)"""
     rl = []
-    rl.append("=" * 80)
+    rl.append('=' * 80)
     current_datetime = datetime.now()
     rl.append(current_datetime.strftime(DATETIME_FORMAT))
-    rl.append(f"COUNT COMPARISON REPORT:")
-    rl.append(f"{source_table}")
-    rl.append(f"VS")
-    rl.append(f"{target_table}")
-    rl.append("=" * 80)
+    rl.append(f'COUNT COMPARISON REPORT:')
+    rl.append(f'{source_table}')
+    rl.append(f'VS')
+    rl.append(f'{target_table}')
+    rl.append('=' * 80)
 
     if source_query and target_query:
-        rl.append(f"timezone: {timezone}")
-        rl.append(f"  {source_query}")
+        rl.append(f'timezone: {timezone}')
+        rl.append(f'  {source_query}')
         if source_params:
-            rl.append(f"    params: {source_params}")
-        rl.append("-" * 40)
-        rl.append(f"  {target_query}")
+            rl.append(f'    params: {source_params}')
+        rl.append('-' * 40)
+        rl.append(f'  {target_query}')
         if target_params:
-            rl.append(f"    params: {target_params}")
-    rl.append("-" * 40)
-
-    rl.append(f"\nSUMMARY:")
-    rl.append(f"  Source total count: {total_source_count}")
-    rl.append(f"  Target total count: {total_target_count}")
-    rl.append(f"  Common total count: {result_equal_in_counters}")
-    rl.append(f"  Diff total count: {result_diff_in_counters}")
-    rl.append(f"  Discrepancies percentage: {discrepancies_counters_percentage:.5f}%")
-    rl.append(f"  Final discrepancies score: {discrepancies_counters_percentage:.5f}")
-    rl.append(f"  Final data quality score: {(100-discrepancies_counters_percentage):.5f}")
-    if not details.mismatches_per_column.empty :
-
-
-        rl.append(f"\nDETAIL DIFFERENCES:")
-        rl.append (details.mismatches_per_column.to_string(index=False)
-
-        )
+            rl.append(f'    params: {target_params}')
+    rl.append('-' * 40)
+
+    rl.append(f'\nSUMMARY:')
+    rl.append(f'  Source total count: {total_source_count}')
+    rl.append(f'  Target total count: {total_target_count}')
+    rl.append(f'  Common total count: {result_equal_in_counters}')
+    rl.append(f'  Diff total count: {result_diff_in_counters}')
+    rl.append(f'  Discrepancies percentage: {discrepancies_counters_percentage:.5f}%')
+    rl.append(f'  Final discrepancies score: {discrepancies_counters_percentage:.5f}')
+    rl.append(
+        f'  Final data quality score: {(100 - discrepancies_counters_percentage):.5f}'
+    )
+    if not details.mismatches_per_column.empty:
+        rl.append(f'\nDETAIL DIFFERENCES:')
+        rl.append(details.mismatches_per_column.to_string(index=False))
 
     # Display sample data if available
-    if details.discrepant_data_examples is not None and not details.discrepant_data_examples.empty:
-        rl.append(f"\nDISCREPANT DATA (first pairs):")
-        rl.append("Sorted by primary key and dataset:")
-        rl.append(f"\n")
+    if (
+        details.discrepant_data_examples is not None
+        and not details.discrepant_data_examples.empty
+    ):
+        rl.append(f'\nDISCREPANT DATA (first pairs):')
+        rl.append('Sorted by primary key and dataset:')
+        rl.append(f'\n')
         rl.append(details.discrepant_data_examples.to_string(index=False))
-        rl.append(f"\n")
-    rl.append("=" * 80)
+        rl.append(f'\n')
+    rl.append('=' * 80)
+
+    return '\n'.join(rl)
 
-    return "\n".join(rl)
 
 def safe_remove_zeros(x):
     if pd.isna(x):
@@ -522,38 +579,48 @@ def safe_remove_zeros(x):
         return int(x)
     return x
 
+
 def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
     """Prepare DataFrame for comparison by handling nulls and empty strings"""
     df = df.map(safe_remove_zeros)
 
-
     df = df.fillna(NULL_REPLACEMENT)
-    df = df.replace(r'(?i)^(None|nan|NaN|\s*)$', NULL_REPLACEMENT, regex=True)
+    df = df.replace(r'(?i)^(None|nan|NaN|NaT|\s*)$', NULL_REPLACEMENT, regex=True)
 
     df = df.astype(str)
 
     return df
 
+
 def exclude_by_keys(df, key_columns, exclude_set):
     if len(key_columns) == 1:
         exclude_values = [x[0] for x in exclude_set]
         return df[~df[key_columns[0]].isin(exclude_values)]
     else:
-        return df[~df.apply(lambda row: tuple(row[col] for col in key_columns) in exclude_set, axis=1)]
+        return df[
+            ~df.apply(
+                lambda row: tuple(row[col] for col in key_columns) in exclude_set,
+                axis=1,
+            )
+        ]
 
 
-def clean_recently_changed_data(df1:pd.DataFrame, df2:pd.DataFrame, primary_keys:List[str]):
+def clean_recently_changed_data(
+    df1: pd.DataFrame, df2: pd.DataFrame, primary_keys: List[str]
+):
     """
     Mutually removes rows with recently changed records
 
     Parameters:
     df1, df2: pandas.DataFrame
-    primary_keys: list 
+    primary_keys: list
 
     Returns:
     tuple: (df1_processed, df2_processed)
     """
-    app_logger.info(f'before exclusion recently changed rows source: {len(df1)}, target {len(df2)}')
+    app_logger.info(
+        f'before exclusion recently changed rows source: {len(df1)}, target {len(df2)}'
+    )
 
     filtered_df1 = df1.copy()
     filtered_df2 = df2.copy()
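Besides the reformatting, this hunk carries a real behavior change: `NaT` joins the null-like patterns that `prepare_dataframe` collapses into `NULL_REPLACEMENT`. A sketch of the replace step in isolation (the placeholder value below is assumed; the real constant lives in `xoverrr.constants`):

    import pandas as pd

    NULL_REPLACEMENT = '#null#'  # assumed placeholder, not the package's actual constant
    df = pd.DataFrame({'v': ['None', 'NAN', 'NaT', '   ', 'ok']})
    df = df.replace(r'(?i)^(None|nan|NaN|NaT|\s*)$', NULL_REPLACEMENT, regex=True)
    # every cell except 'ok' becomes '#null#'; (?i) catches any casing,
    # and \s* catches empty and whitespace-only strings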
@@ -561,21 +628,26 @@ def clean_recently_changed_data(df1:pd.DataFrame, df2:pd.DataFrame, primary_keys
     filtered_df1 = filtered_df1.loc[filtered_df1['xrecently_changed'] == 'y']
     filtered_df2 = filtered_df2.loc[filtered_df2['xrecently_changed'] == 'y']
 
-    excluded_from_df1_keys = _create_keys_set(filtered_df1,primary_keys)
-    excluded_from_df2_keys = _create_keys_set(filtered_df2,primary_keys)
+    excluded_from_df1_keys = _create_keys_set(filtered_df1, primary_keys)
+    excluded_from_df2_keys = _create_keys_set(filtered_df2, primary_keys)
 
     excluded_keys = excluded_from_df1_keys | excluded_from_df2_keys
-    df1_processed = exclude_by_keys(df1, primary_keys, excluded_keys).drop('xrecently_changed', axis=1)
-    df2_processed = exclude_by_keys(df2, primary_keys, excluded_keys).drop('xrecently_changed', axis=1)
+    df1_processed = exclude_by_keys(df1, primary_keys, excluded_keys).drop(
+        'xrecently_changed', axis=1
+    )
+    df2_processed = exclude_by_keys(df2, primary_keys, excluded_keys).drop(
+        'xrecently_changed', axis=1
+    )
 
-    app_logger.info(f'after exclusion recently changed rows source: {len(df1_processed)}, target {len(df2_processed)}')
+    app_logger.info(
+        f'after exclusion recently changed rows source: {len(df1_processed)}, target {len(df2_processed)}'
+    )
 
     return df1_processed, df2_processed
 
 
 def find_count_discrepancies(
-    source_counts: pd.DataFrame,
-    target_counts: pd.DataFrame
+    source_counts: pd.DataFrame, target_counts: pd.DataFrame
 ) -> pd.DataFrame:
     """Find discrepancies in daily row counts between source and target"""
     source_counts['flg'] = 'source'
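`exclude_by_keys`, reformatted in the previous hunk, is what drops those keys from both frames: for a composite key it builds a tuple per row and tests membership in the exclusion set. A small sketch with invented keys:

    import pandas as pd

    df = pd.DataFrame({'id': [1, 1, 2], 'dt': ['a', 'b', 'a'], 'v': [10, 20, 30]})
    exclude_set = {(1, 'b')}
    kept = df[
        ~df.apply(lambda row: tuple(row[col] for col in ['id', 'dt']) in exclude_set, axis=1)
    ]
    # only the (1, 'b') row is dropped; single-column keys take the faster isin() branch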
@@ -584,44 +656,41 @@ def find_count_discrepancies(
     # Find mismatches in counts per date
     all_counts = pd.concat([source_counts, target_counts])
     discrepancies = all_counts.drop_duplicates(
-        subset=['dt', 'cnt'],
-        keep=False
-    ).sort_values(
-        by=['dt', 'flg'],
-        ascending=[False, True]
-    )
+        subset=['dt', 'cnt'], keep=False
+    ).sort_values(by=['dt', 'flg'], ascending=[False, True])
 
     return discrepancies
 
+
 def create_result_message(
     source_total: int,
     target_total: int,
     discrepancies: pd.DataFrame,
-    comparison_type: str
+    comparison_type: str,
 ) -> str:
     """Create standardized result message"""
     if discrepancies.empty:
-        return f"{comparison_type} match: Source={source_total}, Target={target_total}"
+        return f'{comparison_type} match: Source={source_total}, Target={target_total}'
 
     mismatch_count = len(discrepancies)
     diff = source_total - target_total
-    diff_msg = f" (Δ={diff})" if diff != 0 else ""
+    diff_msg = f' (Δ={diff})' if diff != 0 else ''
 
     return (
-        f"{comparison_type} mismatch: Source={source_total}, Target={target_total}{diff_msg}, "
-        f"{mismatch_count} discrepancies found"
+        f'{comparison_type} mismatch: Source={source_total}, Target={target_total}{diff_msg}, '
+        f'{mismatch_count} discrepancies found'
     )
 
+
 def filter_columns(
-    df: pd.DataFrame,
-    columns: List[str],
-    exclude: Optional[List[str]] = None
+    df: pd.DataFrame, columns: List[str], exclude: Optional[List[str]] = None
 ) -> pd.DataFrame:
     """Filter DataFrame columns with optional exclusions"""
     if exclude:
         columns = [col for col in columns if col not in exclude]
     return df[columns]
 
+
 def cross_fill_missing_dates(df1, df2, date_column='dt', value_column='cnt'):
     """
     Fill missing dates between tow dataframes
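The collapsed `drop_duplicates(subset=['dt', 'cnt'], keep=False)` call is what actually detects count mismatches: a date whose count agrees in source and target yields two identical ('dt', 'cnt') pairs and is dropped, while a disagreeing date survives as a source/target row pair. A minimal sketch with invented daily counts:

    import pandas as pd

    src = pd.DataFrame({'dt': ['2024-01-01', '2024-01-02'], 'cnt': [10, 20], 'flg': 'source'})
    trg = pd.DataFrame({'dt': ['2024-01-01', '2024-01-02'], 'cnt': [10, 21], 'flg': 'target'})
    diff = (
        pd.concat([src, trg])
        .drop_duplicates(subset=['dt', 'cnt'], keep=False)
        .sort_values(by=['dt', 'flg'], ascending=[False, True])
    )
    # only 2024-01-02 survives: once with cnt=20 (source) and once with cnt=21 (target)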
@@ -640,6 +709,7 @@ def cross_fill_missing_dates(df1, df2, date_column='dt', value_column='cnt'):
 
     return df1_full, df2_full
 
+
 def format_keys(keys, max_examples):
     if keys:
         keys = {next(iter(x)) if len(x) == 1 else x for x in list(keys)[:max_examples]}
@@ -648,12 +718,14 @@ def format_keys(keys, max_examples):
     else:
         return None
 
+
 def get_dataframe_size_gb(df: pd.DataFrame) -> float:
     """Calculate DataFrame size in GB"""
     if df.empty:
         return 0.0
     return df.memory_usage(deep=True).sum() / 1024 / 1024 / 1024
 
+
 def validate_dataframe_size(df: pd.DataFrame, max_size_gb: float) -> None:
     """Validate DataFrame size and raise exception if exceeds limit"""
     if df is None:
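`memory_usage(deep=True)` returns per-column byte counts including object (string) payloads, and the chained division converts bytes to GB. A quick check, with sizes that will vary slightly by pandas version:

    import pandas as pd

    df = pd.DataFrame({'a': range(1_000_000)})
    df.memory_usage(deep=True).sum() / 1024 / 1024 / 1024
    # ~0.0075 GB: one int64 column of a million rows is 8 MB, plus a small index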
@@ -663,6 +735,6 @@ def validate_dataframe_size(df: pd.DataFrame, max_size_gb: float) -> None:
 
     if size_gb > max_size_gb:
         raise ValueError(
-            f"DataFrame size {size_gb:.2f} GB exceeds limit of {max_size_gb} GB. "
-            f"Shape: {df.shape}"
-            )
+            f'DataFrame size {size_gb:.2f} GB exceeds limit of {max_size_gb} GB. '
+            f'Shape: {df.shape}'
+        )