xoverrr 1.1.5__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xoverrr/core.py CHANGED
@@ -1,41 +1,21 @@
1
+ from typing import Dict, List, Optional, Tuple, Union
1
2
 
2
- import sys
3
- from enum import Enum, auto
4
- from typing import Optional, List, Dict, Callable, Union, Tuple, Any
5
3
  import pandas as pd
6
4
  from sqlalchemy.engine import Engine
7
- from .models import (
8
- DBMSType,
9
- DataReference,
10
- ObjectType
11
- )
12
-
13
- from .logger import app_logger
14
5
 
6
+ from . import constants as ct
7
+ from .adapters.base import BaseDatabaseAdapter
8
+ from .adapters.clickhouse import ClickHouseAdapter
15
9
  from .adapters.oracle import OracleAdapter
16
10
  from .adapters.postgres import PostgresAdapter
17
- from .adapters.clickhouse import ClickHouseAdapter
18
- from .adapters.base import BaseDatabaseAdapter
19
-
20
- from . import constants as ct
21
-
22
- from .exceptions import (
23
- MetadataError,
24
- DQCompareException
25
- )
26
- from .utils import (
27
- prepare_dataframe,
28
- compare_dataframes,
29
- clean_recently_changed_data,
30
- generate_comparison_sample_report,
31
- generate_comparison_count_report,
32
- cross_fill_missing_dates,
33
- validate_dataframe_size,
34
- ComparisonStats,
35
- ComparisonDiffDetails
36
- )
37
-
38
-
11
+ from .exceptions import DQCompareException, MetadataError
12
+ from .logger import app_logger
13
+ from .models import DataReference, DBMSType, ObjectType
14
+ from .utils import (ComparisonDiffDetails, ComparisonStats,
15
+ clean_recently_changed_data, compare_dataframes,
16
+ cross_fill_missing_dates, generate_comparison_count_report,
17
+ generate_comparison_sample_report, normalize_column_names,
18
+ prepare_dataframe, validate_dataframe_size)
39
19
 
40
20
 
41
21
  class DataQualityComparator:
@@ -48,7 +28,7 @@ class DataQualityComparator:
48
28
  source_engine: Engine,
49
29
  target_engine: Engine,
50
30
  default_exclude_recent_hours: Optional[int] = 24,
51
- timezone: str = ct.DEFAULT_TZ
31
+ timezone: str = ct.DEFAULT_TZ,
52
32
  ):
53
33
  self.source_engine = source_engine
54
34
  self.target_engine = target_engine
@@ -64,11 +44,11 @@ class DataQualityComparator:
64
44
  }
65
45
  self._reset_stats()
66
46
  from . import __version__
47
+
67
48
  app_logger.info('start')
68
49
  app_logger.info(f'Version: v{__version__}')
69
50
  app_logger.info(f'Source DB: {self.source_db_type.name}')
70
- app_logger.info(f'Target DB: {self.target_db_type.name}')
71
-
51
+ app_logger.info(f'Target DB: {self.target_db_type.name}')
72
52
 
73
53
  def reset_stats(self):
74
54
  self._reset_stats()
@@ -79,17 +59,19 @@ class DataQualityComparator:
79
59
  ct.COMPARISON_SUCCESS: 0,
80
60
  ct.COMPARISON_FAILED: 0,
81
61
  ct.COMPARISON_SKIPPED: 0,
82
- 'tables_success' : set(),
83
- 'tables_failed' : set(),
62
+ 'tables_success': set(),
63
+ 'tables_failed': set(),
84
64
  'tables_skipped': set(),
85
65
  'start_time': pd.Timestamp.now().strftime(ct.DATETIME_FORMAT),
86
- 'end_time': None
66
+ 'end_time': None,
87
67
  }
88
68
 
89
- def _update_stats(self, status: str, source_table:DataReference):
69
+ def _update_stats(self, status: str, source_table: DataReference):
90
70
  """Update comparison statistics"""
91
71
  self.comparison_stats[status] += 1
92
- self.comparison_stats['end_time'] = pd.Timestamp.now().strftime(ct.DATETIME_FORMAT)
72
+ self.comparison_stats['end_time'] = pd.Timestamp.now().strftime(
73
+ ct.DATETIME_FORMAT
74
+ )
93
75
  if source_table:
94
76
  match status:
95
77
  case ct.COMPARISON_SUCCESS:
@@ -106,7 +88,7 @@ class DataQualityComparator:
106
88
  date_column: Optional[str] = None,
107
89
  date_range: Optional[Tuple[str, str]] = None,
108
90
  tolerance_percentage: float = 0.0,
109
- max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES
91
+ max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
110
92
  ) -> Tuple[str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
111
93
 
112
94
  self._validate_inputs(source_table, target_table)
@@ -116,17 +98,21 @@ class DataQualityComparator:
116
98
  try:
117
99
  self.comparison_stats['compared'] += 1
118
100
 
119
-
120
101
  status, report, stats, details = self._compare_counts(
121
- source_table, target_table, date_column, start_date, end_date,
122
- tolerance_percentage, max_examples
102
+ source_table,
103
+ target_table,
104
+ date_column,
105
+ start_date,
106
+ end_date,
107
+ tolerance_percentage,
108
+ max_examples,
123
109
  )
124
110
 
125
111
  self._update_stats(status, source_table)
126
112
  return status, report, stats, details
127
113
 
128
114
  except Exception as e:
129
- app_logger.exception(f"Count comparison failed: {str(e)}")
115
+ app_logger.exception(f'Count comparison failed: {str(e)}')
130
116
  status = ct.COMPARISON_FAILED
131
117
  self._update_stats(status, source_table)
132
118
  return status, None, None, None
@@ -143,25 +129,25 @@ class DataQualityComparator:
143
129
  custom_primary_key: Optional[List[str]] = None,
144
130
  tolerance_percentage: float = 0.0,
145
131
  exclude_recent_hours: Optional[int] = None,
146
- max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES
132
+ max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
147
133
  ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
148
134
  """
149
135
  Compare data from custom queries with specified key columns
150
136
 
151
137
  Parameters:
152
- source_table: `DataReference`
138
+ source_table: `DataReference`
153
139
  source table to compare
154
140
  target_table: `DataReference`
155
141
  target table to compare
156
142
  custom_primary_key : `List[str]`
157
143
  List of primary key columns for comparison.
158
- exclude_columns : `Optional[List[str]] = None`
144
+ exclude_columns : `Optional[List[str]] = None`
159
145
  Columns to exclude from comparison.
160
- include_columns : `Optional[List[str]] = None`
146
+ include_columns : `Optional[List[str]] = None`
161
147
  Columns to include from comparison (default all cols)
162
- tolerance_percentage : `float`
148
+ tolerance_percentage : `float`
163
149
  Tolerance percentage for discrepancies.
164
- max_examples
150
+ max_examples
165
151
  Maximum number of discrepancy examples per column
166
152
  """
167
153
  self._validate_inputs(source_table, target_table)
@@ -169,35 +155,51 @@ class DataQualityComparator:
169
155
  exclude_hours = exclude_recent_hours or self.default_exclude_recent_hours
170
156
 
171
157
  start_date, end_date = date_range or (None, None)
172
- exclude_cols = exclude_columns or []
173
- custom_keys = custom_primary_key
174
- include_cols = include_columns or []
158
+ exclude_cols = normalize_column_names(exclude_columns or [])
159
+ custom_keys = (
160
+ normalize_column_names(custom_primary_key or [])
161
+ if custom_primary_key
162
+ else None
163
+ )
164
+ include_cols = normalize_column_names(include_columns or [])
175
165
 
176
166
  try:
177
167
  self.comparison_stats['compared'] += 1
178
168
 
179
169
  status, report, stats, details = self._compare_samples(
180
- source_table, target_table, date_column, update_column,
181
- start_date, end_date, exclude_cols,include_cols,
182
- custom_keys, tolerance_percentage, exclude_hours, max_examples
170
+ source_table,
171
+ target_table,
172
+ date_column,
173
+ update_column,
174
+ start_date,
175
+ end_date,
176
+ exclude_cols,
177
+ include_cols,
178
+ custom_keys,
179
+ tolerance_percentage,
180
+ exclude_hours,
181
+ max_examples,
183
182
  )
184
183
 
185
184
  self._update_stats(status, source_table)
186
185
  return status, report, stats, details
187
186
 
188
187
  except Exception as e:
189
- app_logger.exception(f"Sample comparison failed: {str(e)}")
188
+ app_logger.exception(f'Sample comparison failed: {str(e)}')
190
189
  status = ct.COMPARISON_FAILED
191
190
  self._update_stats(status, source_table)
192
191
  return status, None, None, None
193
192
 
194
- def _compare_counts(self, source_table: DataReference,
195
- target_table: DataReference,
196
- date_column: str,
197
- start_date: Optional[str],
198
- end_date: Optional[str],
199
- tolerance_percentage:float,
200
- max_examples:int) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
193
+ def _compare_counts(
194
+ self,
195
+ source_table: DataReference,
196
+ target_table: DataReference,
197
+ date_column: str,
198
+ start_date: Optional[str],
199
+ end_date: Optional[str],
200
+ tolerance_percentage: float,
201
+ max_examples: int,
202
+ ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
201
203
 
202
204
  try:
203
205
  source_adapter = self._get_adapter(self.source_db_type)
@@ -206,58 +208,73 @@ class DataQualityComparator:
206
208
  source_query, source_params = source_adapter.build_count_query(
207
209
  source_table, date_column, start_date, end_date
208
210
  )
209
- source_counts = self._execute_query((source_query, source_params), self.source_engine, self.timezone)
211
+ source_counts = self._execute_query(
212
+ (source_query, source_params), self.source_engine, self.timezone
213
+ )
210
214
 
211
215
  target_query, target_params = target_adapter.build_count_query(
212
216
  target_table, date_column, start_date, end_date
213
217
  )
214
- target_counts = self._execute_query((target_query, target_params), self.target_engine, self.timezone)
215
-
218
+ target_counts = self._execute_query(
219
+ (target_query, target_params), self.target_engine, self.timezone
220
+ )
216
221
 
217
- source_counts_filled, target_counts_filled = cross_fill_missing_dates(source_counts, target_counts)
222
+ source_counts_filled, target_counts_filled = cross_fill_missing_dates(
223
+ source_counts, target_counts
224
+ )
218
225
 
219
226
  merged = source_counts_filled.merge(target_counts_filled, on='dt')
220
227
  total_count_source = source_counts_filled['cnt'].sum()
221
- total_count_taget = target_counts_filled['cnt'].sum()
228
+ total_count_taget = target_counts_filled['cnt'].sum()
222
229
 
223
- if (total_count_source, total_count_taget) == (0,0):
230
+ if (total_count_source, total_count_taget) == (0, 0):
224
231
  app_logger.warning('nothing to compare to you')
225
232
  status = ct.COMPARISON_SKIPPED
226
233
  return status, None, None, None
227
234
 
228
235
  else:
229
-
230
236
  result_diff_in_counters = abs(merged['cnt_x'] - merged['cnt_y']).sum()
231
237
  result_equal_in_counters = merged[['cnt_x', 'cnt_y']].min(axis=1).sum()
232
238
 
233
- discrepancies_counters_percentage = 100*result_diff_in_counters/(result_diff_in_counters+result_equal_in_counters)
234
- stats, details = compare_dataframes(source_df=source_counts_filled,
235
- target_df=target_counts_filled,
236
- key_columns=['dt'],
237
- max_examples=max_examples)
238
-
239
- status = ct.COMPARISON_FAILED if discrepancies_counters_percentage > tolerance_percentage else ct.COMPARISON_SUCCESS
240
-
241
- report = generate_comparison_count_report(source_table.full_name,
242
- target_table.full_name,
243
- stats,
244
- details,
245
- total_count_source,
246
- total_count_taget,
247
- discrepancies_counters_percentage,
248
- result_diff_in_counters,
249
- result_equal_in_counters,
250
- self.timezone,
251
- source_query,
252
- source_params,
253
- target_query,
254
- target_params
255
- )
239
+ discrepancies_counters_percentage = (
240
+ 100
241
+ * result_diff_in_counters
242
+ / (result_diff_in_counters + result_equal_in_counters)
243
+ )
244
+ stats, details = compare_dataframes(
245
+ source_df=source_counts_filled,
246
+ target_df=target_counts_filled,
247
+ key_columns=['dt'],
248
+ max_examples=max_examples,
249
+ )
250
+
251
+ status = (
252
+ ct.COMPARISON_FAILED
253
+ if discrepancies_counters_percentage > tolerance_percentage
254
+ else ct.COMPARISON_SUCCESS
255
+ )
256
+
257
+ report = generate_comparison_count_report(
258
+ source_table.full_name,
259
+ target_table.full_name,
260
+ stats,
261
+ details,
262
+ total_count_source,
263
+ total_count_taget,
264
+ discrepancies_counters_percentage,
265
+ result_diff_in_counters,
266
+ result_equal_in_counters,
267
+ self.timezone,
268
+ source_query,
269
+ source_params,
270
+ target_query,
271
+ target_params,
272
+ )
256
273
 
257
274
  return status, report, stats, details
258
275
 
259
276
  except Exception as e:
260
- app_logger.error(f"Count comparison failed: {str(e)}")
277
+ app_logger.error(f'Count comparison failed: {str(e)}')
261
278
  raise
262
279
 
263
280
  def _compare_samples(
@@ -271,28 +288,36 @@ class DataQualityComparator:
271
288
  exclude_columns: List[str],
272
289
  include_columns: List[str],
273
290
  custom_key_columns: Optional[List[str]],
274
- tolerance_percentage:float,
291
+ tolerance_percentage: float,
275
292
  exclude_recent_hours: Optional[int],
276
- max_examples:Optional[int]
293
+ max_examples: Optional[int],
277
294
  ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
278
295
 
279
296
  try:
280
297
  source_object_type = self._get_object_type(source_table, self.source_engine)
281
298
  target_object_type = self._get_object_type(target_table, self.target_engine)
282
- app_logger.info(f'object type source: {source_object_type} vs target {target_object_type}')
299
+ app_logger.info(
300
+ f'object type source: {source_object_type} vs target {target_object_type}'
301
+ )
283
302
 
284
- source_columns_meta = self._get_metadata_cols(source_table, self.source_engine)
303
+ source_columns_meta = self._get_metadata_cols(
304
+ source_table, self.source_engine
305
+ )
285
306
  app_logger.info('source_columns meta:\n')
286
307
  app_logger.info(source_columns_meta.to_string(index=False))
287
308
 
288
- target_columns_meta = self._get_metadata_cols(target_table, self.target_engine)
309
+ target_columns_meta = self._get_metadata_cols(
310
+ target_table, self.target_engine
311
+ )
289
312
  app_logger.info('target_columns meta:\n')
290
313
  app_logger.info(target_columns_meta.to_string(index=False))
291
314
 
292
- intersect = list(set(include_columns)&set(exclude_columns))
315
+ intersect = list(set(include_columns) & set(exclude_columns))
293
316
  if intersect:
294
- app_logger.warning(f'Intersection columns between Include and exclude: {",".join(intersect)}')
295
-
317
+ app_logger.warning(
318
+ f'Intersection columns between Include and exclude: {",".join(intersect)}'
319
+ )
320
+
296
321
  key_columns = None
297
322
 
298
323
  if custom_key_columns:
@@ -300,30 +325,55 @@ class DataQualityComparator:
300
325
  source_cols = source_columns_meta['column_name'].tolist()
301
326
  target_cols = target_columns_meta['column_name'].tolist()
302
327
 
303
- missing_in_source = [col for col in custom_key_columns if col not in source_cols]
304
- missing_in_target = [col for col in custom_key_columns if col not in target_cols]
328
+ missing_in_source = [
329
+ col for col in custom_key_columns if col not in source_cols
330
+ ]
331
+ missing_in_target = [
332
+ col for col in custom_key_columns if col not in target_cols
333
+ ]
305
334
 
306
335
  if missing_in_source:
307
- raise MetadataError(f"Custom key columns missing in source: {missing_in_source}")
336
+ raise MetadataError(
337
+ f'Custom key columns missing in source: {missing_in_source}'
338
+ )
308
339
  if missing_in_target:
309
- raise MetadataError(f"Custom key columns missing in target: {missing_in_target}")
340
+ raise MetadataError(
341
+ f'Custom key columns missing in target: {missing_in_target}'
342
+ )
310
343
  else:
311
- source_pk = self._get_metadata_pk(source_table, self.source_engine) \
312
- if source_object_type == ObjectType.TABLE else pd.DataFrame({'pk_column_name': []})
313
- target_pk = self._get_metadata_pk(target_table, self.target_engine) \
314
- if target_object_type == ObjectType.TABLE else pd.DataFrame({'pk_column_name': []})
315
-
316
- if source_pk['pk_column_name'].tolist() != target_pk['pk_column_name'].tolist():
317
- app_logger.warning(f"Primary keys differ: source={source_pk['pk_column_name'].tolist()}, target={target_pk['pk_column_name'].tolist()}")
318
- key_columns = source_pk['pk_column_name'].tolist() or target_pk['pk_column_name'].tolist()
344
+ source_pk = (
345
+ self._get_metadata_pk(source_table, self.source_engine)
346
+ if source_object_type == ObjectType.TABLE
347
+ else pd.DataFrame({'pk_column_name': []})
348
+ )
349
+ target_pk = (
350
+ self._get_metadata_pk(target_table, self.target_engine)
351
+ if target_object_type == ObjectType.TABLE
352
+ else pd.DataFrame({'pk_column_name': []})
353
+ )
354
+
355
+ if (
356
+ source_pk['pk_column_name'].tolist()
357
+ != target_pk['pk_column_name'].tolist()
358
+ ):
359
+ app_logger.warning(
360
+ f'Primary keys differ: source={source_pk["pk_column_name"].tolist()}, target={target_pk["pk_column_name"].tolist()}'
361
+ )
362
+ key_columns = (
363
+ source_pk['pk_column_name'].tolist()
364
+ or target_pk['pk_column_name'].tolist()
365
+ )
319
366
  if not key_columns:
320
- raise MetadataError(f"Primary key not found in the source neither in the target and not provided")
367
+ raise MetadataError(
368
+ f'Primary key not found in the source neither in the target and not provided'
369
+ )
321
370
 
322
371
  if include_columns:
323
-
324
372
  if not set(include_columns) & set(key_columns):
325
- app_logger.warning(f'The primary key was not included in the column list.\
326
- The key column was included in the resulting query automatically. PK:{key_columns}')
373
+ app_logger.warning(
374
+ f'The primary key was not included in the column list.\
375
+ The key column was included in the resulting query automatically. PK:{key_columns}'
376
+ )
327
377
 
328
378
  include_columns = list(set(include_columns + key_columns))
329
379
 
@@ -333,12 +383,13 @@ class DataQualityComparator:
333
383
  target_columns_meta = target_columns_meta[
334
384
  target_columns_meta['column_name'].isin(include_columns)
335
385
  ]
336
-
337
- if exclude_columns:
338
386
 
387
+ if exclude_columns:
339
388
  if set(exclude_columns) & set(key_columns):
340
- app_logger.warning(f'The primary key has been excluded from the column list.\
341
- However, the key column must be present in the resulting query.s PK:{key_columns}')
389
+ app_logger.warning(
390
+ f'The primary key has been excluded from the column list.\
391
+ However, the key column must be present in the resulting query.s PK:{key_columns}'
392
+ )
342
393
 
343
394
  exclude_columns = list(set(exclude_columns) - set(key_columns))
344
395
 
@@ -349,63 +400,87 @@ class DataQualityComparator:
349
400
  ~target_columns_meta['column_name'].isin(exclude_columns)
350
401
  ]
351
402
 
352
- common_cols_df, source_only_cols, target_only_cols = self._analyze_columns_meta(source_columns_meta, target_columns_meta)
403
+ common_cols_df, source_only_cols, target_only_cols = (
404
+ self._analyze_columns_meta(source_columns_meta, target_columns_meta)
405
+ )
353
406
  common_cols = common_cols_df['column_name'].tolist()
354
407
 
355
408
  if not common_cols:
356
- raise MetadataError(f"No one column to compare, need to check tables or reduce the exclude_columns list: {','.join(exclude_columns)}")
357
-
409
+ raise MetadataError(
410
+ f'No one column to compare, need to check tables or reduce the exclude_columns list: {",".join(exclude_columns)}'
411
+ )
412
+
358
413
  source_data, source_query, source_params = self._get_table_data(
359
- self.source_engine, source_table, source_columns_meta, common_cols,
360
- date_column, update_column, start_date, end_date, exclude_recent_hours
414
+ self.source_engine,
415
+ source_table,
416
+ source_columns_meta,
417
+ common_cols,
418
+ date_column,
419
+ update_column,
420
+ start_date,
421
+ end_date,
422
+ exclude_recent_hours,
361
423
  )
362
424
 
363
425
  target_data, target_query, target_params = self._get_table_data(
364
- self.target_engine, target_table, target_columns_meta, common_cols,
365
- date_column, update_column, start_date, end_date, exclude_recent_hours
426
+ self.target_engine,
427
+ target_table,
428
+ target_columns_meta,
429
+ common_cols,
430
+ date_column,
431
+ update_column,
432
+ start_date,
433
+ end_date,
434
+ exclude_recent_hours,
366
435
  )
367
436
  status = None
368
- #special case
437
+ # special case
369
438
  if target_data.empty and source_data.empty:
370
439
  status = ct.COMPARISON_SKIPPED
371
440
  return status, None, None, None
372
441
  elif source_data.empty or target_data.empty:
373
- raise DQCompareException(f"Nothing to compare, rows returned from source: {len(source_data)}, from target: {len(target_data)}")
374
-
442
+ raise DQCompareException(
443
+ f'Nothing to compare, rows returned from source: {len(source_data)}, from target: {len(target_data)}'
444
+ )
375
445
 
376
446
  source_data = prepare_dataframe(source_data)
377
447
  target_data = prepare_dataframe(target_data)
378
448
  if update_column and exclude_recent_hours:
379
- source_data, target_data = clean_recently_changed_data(source_data, target_data, key_columns)
380
-
449
+ source_data, target_data = clean_recently_changed_data(
450
+ source_data, target_data, key_columns
451
+ )
381
452
 
382
453
  stats, details = compare_dataframes(
383
- source_data, target_data,
384
- key_columns, max_examples
454
+ source_data, target_data, key_columns, max_examples
385
455
  )
386
456
 
387
457
  if stats:
388
458
  details.skipped_source_columns = source_only_cols
389
459
  details.skipped_target_columns = target_only_cols
390
460
 
391
- report = generate_comparison_sample_report(source_table.full_name,
392
- target_table.full_name,
393
- stats,
394
- details,
395
- self.timezone,
396
- source_query,
397
- source_params,
398
- target_query,
399
- target_params
400
- )
401
- status = ct.COMPARISON_FAILED if stats.final_diff_score > tolerance_percentage else ct.COMPARISON_SUCCESS
461
+ report = generate_comparison_sample_report(
462
+ source_table.full_name,
463
+ target_table.full_name,
464
+ stats,
465
+ details,
466
+ self.timezone,
467
+ source_query,
468
+ source_params,
469
+ target_query,
470
+ target_params,
471
+ )
472
+ status = (
473
+ ct.COMPARISON_FAILED
474
+ if stats.final_diff_score > tolerance_percentage
475
+ else ct.COMPARISON_SUCCESS
476
+ )
402
477
  return status, report, stats, details
403
478
  else:
404
479
  status = ct.COMPARISON_SKIPPED
405
480
  return status, None, None, None
406
481
 
407
482
  except Exception as e:
408
- app_logger.error(f"Sample comparison failed: {str(e)}")
483
+ app_logger.error(f'Sample comparison failed: {str(e)}')
409
484
  raise
410
485
 
411
486
  def compare_custom_query(
@@ -417,25 +492,25 @@ class DataQualityComparator:
417
492
  custom_primary_key: List[str],
418
493
  exclude_columns: Optional[List[str]] = None,
419
494
  tolerance_percentage: float = 0.0,
420
- max_examples:Optional[int] = ct.DEFAULT_MAX_EXAMPLES
495
+ max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
421
496
  ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
422
497
  """
423
498
  Compare data from custom queries with specified key columns
424
499
 
425
500
  Parameters:
426
- source_query : Union[str, Tuple[str, Dict]]
501
+ source_query : Union[str, Tuple[str, Dict]]
427
502
  Source query (can be string or tuple with query and params).
428
503
  target_query : Union[str, Tuple[str, Dict]]
429
504
  Target query (can be string or tuple with query and params).
430
505
  custom_primary_key : List[str]
431
506
  List of primary key columns for comparison.
432
- exclude_columns : Optional[List[str]] = None
507
+ exclude_columns : Optional[List[str]] = None
433
508
  Columns to exclude from comparison.
434
- tolerance_percentage : float
509
+ tolerance_percentage : float
435
510
  Tolerance percentage for discrepancies.
436
511
  max_examples: int
437
- Maximum number of discrepancy examples per column
438
-
512
+ Maximum number of discrepancy examples per column
513
+
439
514
  Returns:
440
515
  ----------
441
516
  Tuple[str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]
@@ -448,8 +523,12 @@ class DataQualityComparator:
448
523
  self.comparison_stats['compared'] += 1
449
524
 
450
525
  # Execute queries
451
- source_data = self._execute_query((source_query,source_params), source_engine, timezone)
452
- target_data = self._execute_query((target_query,target_params), target_engine, timezone)
526
+ source_data = self._execute_query(
527
+ (source_query, source_params), source_engine, timezone
528
+ )
529
+ target_data = self._execute_query(
530
+ (target_query, target_params), target_engine, timezone
531
+ )
453
532
  app_logger.info('preparing source dataframe')
454
533
  source_data_prepared = prepare_dataframe(source_data)
455
534
  app_logger.info('preparing target dataframe')
@@ -457,43 +536,60 @@ class DataQualityComparator:
457
536
 
458
537
  # Exclude columns if specified
459
538
  exclude_cols = exclude_columns or []
460
- common_cols = [col for col in source_data_prepared.columns
461
- if col in target_data_prepared.columns and col not in exclude_cols]
539
+ common_cols = [
540
+ col
541
+ for col in source_data_prepared.columns
542
+ if col in target_data_prepared.columns and col not in exclude_cols
543
+ ]
462
544
 
463
545
  source_data_filtered = source_data_prepared[common_cols]
464
546
  target_data_filtered = target_data_prepared[common_cols]
465
547
  if 'xrecently_changed' in common_cols:
466
- source_data_filtered, target_data_filtered = clean_recently_changed_data(source_data_filtered, target_data_filtered, custom_primary_key)
548
+ source_data_filtered, target_data_filtered = (
549
+ clean_recently_changed_data(
550
+ source_data_filtered, target_data_filtered, custom_primary_key
551
+ )
552
+ )
467
553
  # Compare dataframes
468
554
  stats, details = compare_dataframes(
469
- source_data_filtered, target_data_filtered, custom_primary_key, max_examples
555
+ source_data_filtered,
556
+ target_data_filtered,
557
+ custom_primary_key,
558
+ max_examples,
470
559
  )
471
560
 
472
561
  if stats:
473
- report = generate_comparison_sample_report(None,
474
- None,
475
- stats,
476
- details,
477
- self.timezone,
478
- source_query,
479
- source_params,
480
- target_query,
481
- target_params
482
- )
483
- status = ct.COMPARISON_FAILED if stats.final_diff_score > tolerance_percentage else ct.COMPARISON_SUCCESS
562
+ report = generate_comparison_sample_report(
563
+ None,
564
+ None,
565
+ stats,
566
+ details,
567
+ self.timezone,
568
+ source_query,
569
+ source_params,
570
+ target_query,
571
+ target_params,
572
+ )
573
+ status = (
574
+ ct.COMPARISON_FAILED
575
+ if stats.final_diff_score > tolerance_percentage
576
+ else ct.COMPARISON_SUCCESS
577
+ )
484
578
  else:
485
579
  status = ct.COMPARISON_SKIPPED
486
580
 
487
-
488
581
  self._update_stats(status, None)
489
582
  return status, report, stats, details
490
583
 
491
584
  except Exception as e:
492
- app_logger.exception("Custom query comparison failed")
585
+ app_logger.exception('Custom query comparison failed')
493
586
  status = ct.COMPARISON_FAILED
494
587
  self._update_stats(status, None)
495
588
  return status, None, None, None
496
- def _get_metadata_cols(self, data_ref: DataReference, engine: Engine) -> pd.DataFrame:
589
+
590
+ def _get_metadata_cols(
591
+ self, data_ref: DataReference, engine: Engine
592
+ ) -> pd.DataFrame:
497
593
  """Get metadata with proper source handling"""
498
594
  adapter = self._get_adapter(DBMSType.from_engine(engine))
499
595
 
@@ -501,13 +597,12 @@ class DataQualityComparator:
501
597
  columns_meta = self._execute_query((query, params), engine)
502
598
 
503
599
  if columns_meta.empty:
504
- raise ValueError(f"Failed to get metadata for: {data_ref.full_name}")
600
+ raise ValueError(f'Failed to get metadata for: {data_ref.full_name}')
505
601
 
506
602
  return columns_meta
507
603
 
508
604
  def _get_metadata_pk(self, data_ref: DataReference, engine: Engine) -> pd.DataFrame:
509
- """Get metadata with proper source handling
510
- """
605
+ """Get metadata with proper source handling"""
511
606
  adapter = self._get_adapter(DBMSType.from_engine(engine))
512
607
 
513
608
  query, params = adapter.build_primary_key_query(data_ref)
@@ -531,19 +626,24 @@ class DataQualityComparator:
531
626
  update_column: str,
532
627
  start_date: Optional[str],
533
628
  end_date: Optional[str],
534
- exclude_recent_hours: Optional[int]
535
- ) -> Tuple[pd.DataFrame, str, Dict] :
629
+ exclude_recent_hours: Optional[int],
630
+ ) -> Tuple[pd.DataFrame, str, Dict]:
536
631
  """Retrieve and prepare table data"""
537
632
  db_type = DBMSType.from_engine(engine)
538
633
  adapter = self._get_adapter(db_type)
539
634
  app_logger.info(db_type)
540
635
 
541
636
  query, params = adapter.build_data_query_common(
542
- data_ref, columns, date_column, update_column,
543
- start_date, end_date, exclude_recent_hours
637
+ data_ref,
638
+ columns,
639
+ date_column,
640
+ update_column,
641
+ start_date,
642
+ end_date,
643
+ exclude_recent_hours,
544
644
  )
545
645
 
546
- df = self._execute_query((query,params), engine, self.timezone)
646
+ df = self._execute_query((query, params), engine, self.timezone)
547
647
 
548
648
  # Apply type conversions
549
649
  df = adapter.convert_types(df, metadata, self.timezone)
@@ -555,9 +655,11 @@ class DataQualityComparator:
555
655
  try:
556
656
  return self.adapters[db_type]
557
657
  except KeyError:
558
- raise ValueError(f"No adapter available for {db_type}")
658
+ raise ValueError(f'No adapter available for {db_type}')
559
659
 
560
- def _execute_query(self, query: Union[str, Tuple[str, Dict]], engine: Engine, timezone: str = None) -> pd.DataFrame:
660
+ def _execute_query(
661
+ self, query: Union[str, Tuple[str, Dict]], engine: Engine, timezone: str = None
662
+ ) -> pd.DataFrame:
561
663
  """Execute SQL query using appropriate adapter"""
562
664
  db_type = DBMSType.from_engine(engine)
563
665
  adapter = self._get_adapter(db_type)
@@ -566,9 +668,7 @@ class DataQualityComparator:
566
668
  return df
567
669
 
568
670
  def _analyze_columns_meta(
569
- self,
570
- source_columns_meta: pd.DataFrame,
571
- target_columns_meta: pd.DataFrame
671
+ self, source_columns_meta: pd.DataFrame, target_columns_meta: pd.DataFrame
572
672
  ) -> tuple[pd.DataFrame, list, list]:
573
673
  """Find common columns between source and target and return unique columns for each"""
574
674
 
@@ -576,8 +676,10 @@ class DataQualityComparator:
576
676
  target_columns = target_columns_meta['column_name'].tolist()
577
677
 
578
678
  common_columns = pd.merge(
579
- source_columns_meta, target_columns_meta,
580
- on='column_name', suffixes=('_source', '_target')
679
+ source_columns_meta,
680
+ target_columns_meta,
681
+ on='column_name',
682
+ suffixes=('_source', '_target'),
581
683
  )
582
684
 
583
685
  source_set = set(source_columns)
@@ -588,13 +690,9 @@ class DataQualityComparator:
588
690
 
589
691
  return common_columns, source_unique, target_unique
590
692
 
591
- def _validate_inputs(
592
- self,
593
- source: DataReference,
594
- target: DataReference
595
- ):
693
+ def _validate_inputs(self, source: DataReference, target: DataReference):
596
694
  """Validate input parameters"""
597
695
  if not isinstance(source, DataReference):
598
- raise TypeError("source must be a DataReference")
696
+ raise TypeError('source must be a DataReference')
599
697
  if not isinstance(target, DataReference):
600
- raise TypeError("target must be a DataReference")
698
+ raise TypeError('target must be a DataReference')