xoverrr 1.1.4__py3-none-any.whl → 1.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
xoverrr/core.py CHANGED
@@ -1,41 +1,21 @@
1
+ from typing import Dict, List, Optional, Tuple, Union
1
2
 
2
- import sys
3
- from enum import Enum, auto
4
- from typing import Optional, List, Dict, Callable, Union, Tuple, Any
5
3
  import pandas as pd
6
4
  from sqlalchemy.engine import Engine
7
- from .models import (
8
- DBMSType,
9
- DataReference,
10
- ObjectType
11
- )
12
-
13
- from .logger import app_logger
14
5
 
6
+ from . import constants as ct
7
+ from .adapters.base import BaseDatabaseAdapter
8
+ from .adapters.clickhouse import ClickHouseAdapter
15
9
  from .adapters.oracle import OracleAdapter
16
10
  from .adapters.postgres import PostgresAdapter
17
- from .adapters.clickhouse import ClickHouseAdapter
18
- from .adapters.base import BaseDatabaseAdapter
19
-
20
- from . import constants as ct
21
-
22
- from .exceptions import (
23
- MetadataError,
24
- DQCompareException
25
- )
26
- from .utils import (
27
- prepare_dataframe,
28
- compare_dataframes,
29
- clean_recently_changed_data,
30
- generate_comparison_sample_report,
31
- generate_comparison_count_report,
32
- cross_fill_missing_dates,
33
- validate_dataframe_size,
34
- ComparisonStats,
35
- ComparisonDiffDetails
36
- )
37
-
38
-
11
+ from .exceptions import DQCompareException, MetadataError
12
+ from .logger import app_logger
13
+ from .models import DataReference, DBMSType, ObjectType
14
+ from .utils import (ComparisonDiffDetails, ComparisonStats,
15
+ clean_recently_changed_data, compare_dataframes,
16
+ cross_fill_missing_dates, generate_comparison_count_report,
17
+ generate_comparison_sample_report, normalize_column_names,
18
+ prepare_dataframe, validate_dataframe_size)
39
19
 
40
20
 
41
21
  class DataQualityComparator:
@@ -48,7 +28,7 @@ class DataQualityComparator:
48
28
  source_engine: Engine,
49
29
  target_engine: Engine,
50
30
  default_exclude_recent_hours: Optional[int] = 24,
51
- timezone: str = ct.DEFAULT_TZ
31
+ timezone: str = ct.DEFAULT_TZ,
52
32
  ):
53
33
  self.source_engine = source_engine
54
34
  self.target_engine = target_engine
@@ -63,7 +43,12 @@ class DataQualityComparator:
63
43
  DBMSType.CLICKHOUSE: ClickHouseAdapter(),
64
44
  }
65
45
  self._reset_stats()
46
+ from . import __version__
47
+
66
48
  app_logger.info('start')
49
+ app_logger.info(f'Version: v{__version__}')
50
+ app_logger.info(f'Source DB: {self.source_db_type.name}')
51
+ app_logger.info(f'Target DB: {self.target_db_type.name}')
67
52
 
68
53
  def reset_stats(self):
69
54
  self._reset_stats()
@@ -74,17 +59,19 @@ class DataQualityComparator:
74
59
  ct.COMPARISON_SUCCESS: 0,
75
60
  ct.COMPARISON_FAILED: 0,
76
61
  ct.COMPARISON_SKIPPED: 0,
77
- 'tables_success' : set(),
78
- 'tables_failed' : set(),
62
+ 'tables_success': set(),
63
+ 'tables_failed': set(),
79
64
  'tables_skipped': set(),
80
65
  'start_time': pd.Timestamp.now().strftime(ct.DATETIME_FORMAT),
81
- 'end_time': None
66
+ 'end_time': None,
82
67
  }
83
68
 
84
- def _update_stats(self, status: str, source_table:DataReference):
69
+ def _update_stats(self, status: str, source_table: DataReference):
85
70
  """Update comparison statistics"""
86
71
  self.comparison_stats[status] += 1
87
- self.comparison_stats['end_time'] = pd.Timestamp.now().strftime(ct.DATETIME_FORMAT)
72
+ self.comparison_stats['end_time'] = pd.Timestamp.now().strftime(
73
+ ct.DATETIME_FORMAT
74
+ )
88
75
  if source_table:
89
76
  match status:
90
77
  case ct.COMPARISON_SUCCESS:
@@ -101,7 +88,7 @@ class DataQualityComparator:
101
88
  date_column: Optional[str] = None,
102
89
  date_range: Optional[Tuple[str, str]] = None,
103
90
  tolerance_percentage: float = 0.0,
104
- max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES
91
+ max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
105
92
  ) -> Tuple[str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
106
93
 
107
94
  self._validate_inputs(source_table, target_table)
@@ -111,17 +98,21 @@ class DataQualityComparator:
111
98
  try:
112
99
  self.comparison_stats['compared'] += 1
113
100
 
114
-
115
101
  status, report, stats, details = self._compare_counts(
116
- source_table, target_table, date_column, start_date, end_date,
117
- tolerance_percentage, max_examples
102
+ source_table,
103
+ target_table,
104
+ date_column,
105
+ start_date,
106
+ end_date,
107
+ tolerance_percentage,
108
+ max_examples,
118
109
  )
119
110
 
120
111
  self._update_stats(status, source_table)
121
112
  return status, report, stats, details
122
113
 
123
114
  except Exception as e:
124
- app_logger.exception(f"Count comparison failed: {str(e)}")
115
+ app_logger.exception(f'Count comparison failed: {str(e)}')
125
116
  status = ct.COMPARISON_FAILED
126
117
  self._update_stats(status, source_table)
127
118
  return status, None, None, None
@@ -138,25 +129,25 @@ class DataQualityComparator:
138
129
  custom_primary_key: Optional[List[str]] = None,
139
130
  tolerance_percentage: float = 0.0,
140
131
  exclude_recent_hours: Optional[int] = None,
141
- max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES
132
+ max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
142
133
  ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
143
134
  """
144
135
  Compare data from custom queries with specified key columns
145
136
 
146
137
  Parameters:
147
- source_table: `DataReference`
138
+ source_table: `DataReference`
148
139
  source table to compare
149
140
  target_table: `DataReference`
150
141
  target table to compare
151
142
  custom_primary_key : `List[str]`
152
143
  List of primary key columns for comparison.
153
- exclude_columns : `Optional[List[str]] = None`
144
+ exclude_columns : `Optional[List[str]] = None`
154
145
  Columns to exclude from comparison.
155
- include_columns : `Optional[List[str]] = None`
146
+ include_columns : `Optional[List[str]] = None`
156
147
  Columns to include from comparison (default all cols)
157
- tolerance_percentage : `float`
148
+ tolerance_percentage : `float`
158
149
  Tolerance percentage for discrepancies.
159
- max_examples
150
+ max_examples
160
151
  Maximum number of discrepancy examples per column
161
152
  """
162
153
  self._validate_inputs(source_table, target_table)
@@ -164,35 +155,51 @@ class DataQualityComparator:
164
155
  exclude_hours = exclude_recent_hours or self.default_exclude_recent_hours
165
156
 
166
157
  start_date, end_date = date_range or (None, None)
167
- exclude_cols = exclude_columns or []
168
- custom_keys = custom_primary_key
169
- include_cols = include_columns or []
158
+ exclude_cols = normalize_column_names(exclude_columns or [])
159
+ custom_keys = (
160
+ normalize_column_names(custom_primary_key or [])
161
+ if custom_primary_key
162
+ else None
163
+ )
164
+ include_cols = normalize_column_names(include_columns or [])
170
165
 
171
166
  try:
172
167
  self.comparison_stats['compared'] += 1
173
168
 
174
169
  status, report, stats, details = self._compare_samples(
175
- source_table, target_table, date_column, update_column,
176
- start_date, end_date, exclude_cols,include_cols,
177
- custom_keys, tolerance_percentage, exclude_hours, max_examples
170
+ source_table,
171
+ target_table,
172
+ date_column,
173
+ update_column,
174
+ start_date,
175
+ end_date,
176
+ exclude_cols,
177
+ include_cols,
178
+ custom_keys,
179
+ tolerance_percentage,
180
+ exclude_hours,
181
+ max_examples,
178
182
  )
179
183
 
180
184
  self._update_stats(status, source_table)
181
185
  return status, report, stats, details
182
186
 
183
187
  except Exception as e:
184
- app_logger.exception(f"Sample comparison failed: {str(e)}")
188
+ app_logger.exception(f'Sample comparison failed: {str(e)}')
185
189
  status = ct.COMPARISON_FAILED
186
190
  self._update_stats(status, source_table)
187
191
  return status, None, None, None
188
192
 
189
- def _compare_counts(self, source_table: DataReference,
190
- target_table: DataReference,
191
- date_column: str,
192
- start_date: Optional[str],
193
- end_date: Optional[str],
194
- tolerance_percentage:float,
195
- max_examples:int) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
193
+ def _compare_counts(
194
+ self,
195
+ source_table: DataReference,
196
+ target_table: DataReference,
197
+ date_column: str,
198
+ start_date: Optional[str],
199
+ end_date: Optional[str],
200
+ tolerance_percentage: float,
201
+ max_examples: int,
202
+ ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
196
203
 
197
204
  try:
198
205
  source_adapter = self._get_adapter(self.source_db_type)
@@ -201,59 +208,73 @@ class DataQualityComparator:
201
208
  source_query, source_params = source_adapter.build_count_query(
202
209
  source_table, date_column, start_date, end_date
203
210
  )
204
- source_counts = self._execute_query((source_query, source_params), self.source_engine, self.timezone)
211
+ source_counts = self._execute_query(
212
+ (source_query, source_params), self.source_engine, self.timezone
213
+ )
205
214
 
206
215
  target_query, target_params = target_adapter.build_count_query(
207
216
  target_table, date_column, start_date, end_date
208
217
  )
209
- target_counts = self._execute_query((target_query, target_params), self.target_engine, self.timezone)
218
+ target_counts = self._execute_query(
219
+ (target_query, target_params), self.target_engine, self.timezone
220
+ )
210
221
 
211
- source_counts_filled, target_counts_filled = cross_fill_missing_dates(source_counts, target_counts)
212
- source_counts_filled['dt'] = pd.to_datetime(source_counts_filled['dt'], format='%Y-%m-%d')
213
- target_counts_filled['dt'] = pd.to_datetime(target_counts_filled['dt'], format='%Y-%m-%d')
222
+ source_counts_filled, target_counts_filled = cross_fill_missing_dates(
223
+ source_counts, target_counts
224
+ )
214
225
 
215
226
  merged = source_counts_filled.merge(target_counts_filled, on='dt')
216
227
  total_count_source = source_counts_filled['cnt'].sum()
217
- total_count_taget = target_counts_filled['cnt'].sum()
228
+ total_count_taget = target_counts_filled['cnt'].sum()
218
229
 
219
- if (total_count_source, total_count_taget) == (0,0):
230
+ if (total_count_source, total_count_taget) == (0, 0):
220
231
  app_logger.warning('nothing to compare to you')
221
232
  status = ct.COMPARISON_SKIPPED
222
233
  return status, None, None, None
223
234
 
224
235
  else:
225
-
226
236
  result_diff_in_counters = abs(merged['cnt_x'] - merged['cnt_y']).sum()
227
237
  result_equal_in_counters = merged[['cnt_x', 'cnt_y']].min(axis=1).sum()
228
238
 
229
- discrepancies_counters_percentage = 100*result_diff_in_counters/(result_diff_in_counters+result_equal_in_counters)
230
- stats, details = compare_dataframes(source_df=source_counts_filled,
231
- target_df=target_counts_filled,
232
- key_columns=['dt'],
233
- max_examples=max_examples)
234
-
235
- status = ct.COMPARISON_FAILED if discrepancies_counters_percentage > tolerance_percentage else ct.COMPARISON_SUCCESS
236
-
237
- report = generate_comparison_count_report(source_table.full_name,
238
- target_table.full_name,
239
- stats,
240
- details,
241
- total_count_source,
242
- total_count_taget,
243
- discrepancies_counters_percentage,
244
- result_diff_in_counters,
245
- result_equal_in_counters,
246
- self.timezone,
247
- source_query,
248
- source_params,
249
- target_query,
250
- target_params
251
- )
239
+ discrepancies_counters_percentage = (
240
+ 100
241
+ * result_diff_in_counters
242
+ / (result_diff_in_counters + result_equal_in_counters)
243
+ )
244
+ stats, details = compare_dataframes(
245
+ source_df=source_counts_filled,
246
+ target_df=target_counts_filled,
247
+ key_columns=['dt'],
248
+ max_examples=max_examples,
249
+ )
250
+
251
+ status = (
252
+ ct.COMPARISON_FAILED
253
+ if discrepancies_counters_percentage > tolerance_percentage
254
+ else ct.COMPARISON_SUCCESS
255
+ )
256
+
257
+ report = generate_comparison_count_report(
258
+ source_table.full_name,
259
+ target_table.full_name,
260
+ stats,
261
+ details,
262
+ total_count_source,
263
+ total_count_taget,
264
+ discrepancies_counters_percentage,
265
+ result_diff_in_counters,
266
+ result_equal_in_counters,
267
+ self.timezone,
268
+ source_query,
269
+ source_params,
270
+ target_query,
271
+ target_params,
272
+ )
252
273
 
253
274
  return status, report, stats, details
254
275
 
255
276
  except Exception as e:
256
- app_logger.error(f"Count comparison failed: {str(e)}")
277
+ app_logger.error(f'Count comparison failed: {str(e)}')
257
278
  raise
258
279
 
259
280
  def _compare_samples(
@@ -267,28 +288,36 @@ class DataQualityComparator:
267
288
  exclude_columns: List[str],
268
289
  include_columns: List[str],
269
290
  custom_key_columns: Optional[List[str]],
270
- tolerance_percentage:float,
291
+ tolerance_percentage: float,
271
292
  exclude_recent_hours: Optional[int],
272
- max_examples:Optional[int]
293
+ max_examples: Optional[int],
273
294
  ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
274
295
 
275
296
  try:
276
297
  source_object_type = self._get_object_type(source_table, self.source_engine)
277
298
  target_object_type = self._get_object_type(target_table, self.target_engine)
278
- app_logger.info(f'object type source: {source_object_type} vs target {target_object_type}')
299
+ app_logger.info(
300
+ f'object type source: {source_object_type} vs target {target_object_type}'
301
+ )
279
302
 
280
- source_columns_meta = self._get_metadata_cols(source_table, self.source_engine)
303
+ source_columns_meta = self._get_metadata_cols(
304
+ source_table, self.source_engine
305
+ )
281
306
  app_logger.info('source_columns meta:\n')
282
307
  app_logger.info(source_columns_meta.to_string(index=False))
283
308
 
284
- target_columns_meta = self._get_metadata_cols(target_table, self.target_engine)
309
+ target_columns_meta = self._get_metadata_cols(
310
+ target_table, self.target_engine
311
+ )
285
312
  app_logger.info('target_columns meta:\n')
286
313
  app_logger.info(target_columns_meta.to_string(index=False))
287
314
 
288
- intersect = list(set(include_columns)&set(exclude_columns))
315
+ intersect = list(set(include_columns) & set(exclude_columns))
289
316
  if intersect:
290
- app_logger.warning(f'Intersection columns between Include and exclude: {",".join(intersect)}')
291
-
317
+ app_logger.warning(
318
+ f'Intersection columns between Include and exclude: {",".join(intersect)}'
319
+ )
320
+
292
321
  key_columns = None
293
322
 
294
323
  if custom_key_columns:
@@ -296,30 +325,55 @@ class DataQualityComparator:
296
325
  source_cols = source_columns_meta['column_name'].tolist()
297
326
  target_cols = target_columns_meta['column_name'].tolist()
298
327
 
299
- missing_in_source = [col for col in custom_key_columns if col not in source_cols]
300
- missing_in_target = [col for col in custom_key_columns if col not in target_cols]
328
+ missing_in_source = [
329
+ col for col in custom_key_columns if col not in source_cols
330
+ ]
331
+ missing_in_target = [
332
+ col for col in custom_key_columns if col not in target_cols
333
+ ]
301
334
 
302
335
  if missing_in_source:
303
- raise MetadataError(f"Custom key columns missing in source: {missing_in_source}")
336
+ raise MetadataError(
337
+ f'Custom key columns missing in source: {missing_in_source}'
338
+ )
304
339
  if missing_in_target:
305
- raise MetadataError(f"Custom key columns missing in target: {missing_in_target}")
340
+ raise MetadataError(
341
+ f'Custom key columns missing in target: {missing_in_target}'
342
+ )
306
343
  else:
307
- source_pk = self._get_metadata_pk(source_table, self.source_engine) \
308
- if source_object_type == ObjectType.TABLE else pd.DataFrame({'pk_column_name': []})
309
- target_pk = self._get_metadata_pk(target_table, self.target_engine) \
310
- if target_object_type == ObjectType.TABLE else pd.DataFrame({'pk_column_name': []})
311
-
312
- if source_pk['pk_column_name'].tolist() != target_pk['pk_column_name'].tolist():
313
- app_logger.warning(f"Primary keys differ: source={source_pk['pk_column_name'].tolist()}, target={target_pk['pk_column_name'].tolist()}")
314
- key_columns = source_pk['pk_column_name'].tolist() or target_pk['pk_column_name'].tolist()
344
+ source_pk = (
345
+ self._get_metadata_pk(source_table, self.source_engine)
346
+ if source_object_type == ObjectType.TABLE
347
+ else pd.DataFrame({'pk_column_name': []})
348
+ )
349
+ target_pk = (
350
+ self._get_metadata_pk(target_table, self.target_engine)
351
+ if target_object_type == ObjectType.TABLE
352
+ else pd.DataFrame({'pk_column_name': []})
353
+ )
354
+
355
+ if (
356
+ source_pk['pk_column_name'].tolist()
357
+ != target_pk['pk_column_name'].tolist()
358
+ ):
359
+ app_logger.warning(
360
+ f'Primary keys differ: source={source_pk["pk_column_name"].tolist()}, target={target_pk["pk_column_name"].tolist()}'
361
+ )
362
+ key_columns = (
363
+ source_pk['pk_column_name'].tolist()
364
+ or target_pk['pk_column_name'].tolist()
365
+ )
315
366
  if not key_columns:
316
- raise MetadataError(f"Primary key not found in the source neither in the target and not provided")
367
+ raise MetadataError(
368
+ f'Primary key not found in the source neither in the target and not provided'
369
+ )
317
370
 
318
371
  if include_columns:
319
-
320
372
  if not set(include_columns) & set(key_columns):
321
- app_logger.warning(f'The primary key was not included in the column list.\
322
- The key column was included in the resulting query automatically. PK:{key_columns}')
373
+ app_logger.warning(
374
+ f'The primary key was not included in the column list.\
375
+ The key column was included in the resulting query automatically. PK:{key_columns}'
376
+ )
323
377
 
324
378
  include_columns = list(set(include_columns + key_columns))
325
379
 
@@ -329,12 +383,13 @@ class DataQualityComparator:
329
383
  target_columns_meta = target_columns_meta[
330
384
  target_columns_meta['column_name'].isin(include_columns)
331
385
  ]
332
-
333
- if exclude_columns:
334
386
 
387
+ if exclude_columns:
335
388
  if set(exclude_columns) & set(key_columns):
336
- app_logger.warning(f'The primary key has been excluded from the column list.\
337
- However, the key column must be present in the resulting query.s PK:{key_columns}')
389
+ app_logger.warning(
390
+ f'The primary key has been excluded from the column list.\
391
+ However, the key column must be present in the resulting query.s PK:{key_columns}'
392
+ )
338
393
 
339
394
  exclude_columns = list(set(exclude_columns) - set(key_columns))
340
395
 
@@ -345,63 +400,87 @@ class DataQualityComparator:
345
400
  ~target_columns_meta['column_name'].isin(exclude_columns)
346
401
  ]
347
402
 
348
- common_cols_df, source_only_cols, target_only_cols = self._analyze_columns_meta(source_columns_meta, target_columns_meta)
403
+ common_cols_df, source_only_cols, target_only_cols = (
404
+ self._analyze_columns_meta(source_columns_meta, target_columns_meta)
405
+ )
349
406
  common_cols = common_cols_df['column_name'].tolist()
350
407
 
351
408
  if not common_cols:
352
- raise MetadataError(f"No one column to compare, need to check tables or reduce the exclude_columns list: {','.join(exclude_columns)}")
353
-
409
+ raise MetadataError(
410
+ f'No one column to compare, need to check tables or reduce the exclude_columns list: {",".join(exclude_columns)}'
411
+ )
412
+
354
413
  source_data, source_query, source_params = self._get_table_data(
355
- self.source_engine, source_table, source_columns_meta, common_cols,
356
- date_column, update_column, start_date, end_date, exclude_recent_hours
414
+ self.source_engine,
415
+ source_table,
416
+ source_columns_meta,
417
+ common_cols,
418
+ date_column,
419
+ update_column,
420
+ start_date,
421
+ end_date,
422
+ exclude_recent_hours,
357
423
  )
358
424
 
359
425
  target_data, target_query, target_params = self._get_table_data(
360
- self.target_engine, target_table, target_columns_meta, common_cols,
361
- date_column, update_column, start_date, end_date, exclude_recent_hours
426
+ self.target_engine,
427
+ target_table,
428
+ target_columns_meta,
429
+ common_cols,
430
+ date_column,
431
+ update_column,
432
+ start_date,
433
+ end_date,
434
+ exclude_recent_hours,
362
435
  )
363
436
  status = None
364
- #special case
437
+ # special case
365
438
  if target_data.empty and source_data.empty:
366
439
  status = ct.COMPARISON_SKIPPED
367
440
  return status, None, None, None
368
441
  elif source_data.empty or target_data.empty:
369
- raise DQCompareException(f"Nothing to compare, rows returned from source: {len(source_data)}, from target: {len(target_data)}")
370
-
442
+ raise DQCompareException(
443
+ f'Nothing to compare, rows returned from source: {len(source_data)}, from target: {len(target_data)}'
444
+ )
371
445
 
372
446
  source_data = prepare_dataframe(source_data)
373
447
  target_data = prepare_dataframe(target_data)
374
448
  if update_column and exclude_recent_hours:
375
- source_data, target_data = clean_recently_changed_data(source_data, target_data, key_columns)
376
-
449
+ source_data, target_data = clean_recently_changed_data(
450
+ source_data, target_data, key_columns
451
+ )
377
452
 
378
453
  stats, details = compare_dataframes(
379
- source_data, target_data,
380
- key_columns, max_examples
454
+ source_data, target_data, key_columns, max_examples
381
455
  )
382
456
 
383
457
  if stats:
384
458
  details.skipped_source_columns = source_only_cols
385
459
  details.skipped_target_columns = target_only_cols
386
460
 
387
- report = generate_comparison_sample_report(source_table.full_name,
388
- target_table.full_name,
389
- stats,
390
- details,
391
- self.timezone,
392
- source_query,
393
- source_params,
394
- target_query,
395
- target_params
396
- )
397
- status = ct.COMPARISON_FAILED if stats.final_diff_score > tolerance_percentage else ct.COMPARISON_SUCCESS
461
+ report = generate_comparison_sample_report(
462
+ source_table.full_name,
463
+ target_table.full_name,
464
+ stats,
465
+ details,
466
+ self.timezone,
467
+ source_query,
468
+ source_params,
469
+ target_query,
470
+ target_params,
471
+ )
472
+ status = (
473
+ ct.COMPARISON_FAILED
474
+ if stats.final_diff_score > tolerance_percentage
475
+ else ct.COMPARISON_SUCCESS
476
+ )
398
477
  return status, report, stats, details
399
478
  else:
400
479
  status = ct.COMPARISON_SKIPPED
401
480
  return status, None, None, None
402
481
 
403
482
  except Exception as e:
404
- app_logger.error(f"Sample comparison failed: {str(e)}")
483
+ app_logger.error(f'Sample comparison failed: {str(e)}')
405
484
  raise
406
485
 
407
486
  def compare_custom_query(
@@ -413,25 +492,25 @@ class DataQualityComparator:
413
492
  custom_primary_key: List[str],
414
493
  exclude_columns: Optional[List[str]] = None,
415
494
  tolerance_percentage: float = 0.0,
416
- max_examples:Optional[int] = ct.DEFAULT_MAX_EXAMPLES
495
+ max_examples: Optional[int] = ct.DEFAULT_MAX_EXAMPLES,
417
496
  ) -> Tuple[str, str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]:
418
497
  """
419
498
  Compare data from custom queries with specified key columns
420
499
 
421
500
  Parameters:
422
- source_query : Union[str, Tuple[str, Dict]]
501
+ source_query : Union[str, Tuple[str, Dict]]
423
502
  Source query (can be string or tuple with query and params).
424
503
  target_query : Union[str, Tuple[str, Dict]]
425
504
  Target query (can be string or tuple with query and params).
426
505
  custom_primary_key : List[str]
427
506
  List of primary key columns for comparison.
428
- exclude_columns : Optional[List[str]] = None
507
+ exclude_columns : Optional[List[str]] = None
429
508
  Columns to exclude from comparison.
430
- tolerance_percentage : float
509
+ tolerance_percentage : float
431
510
  Tolerance percentage for discrepancies.
432
511
  max_examples: int
433
- Maximum number of discrepancy examples per column
434
-
512
+ Maximum number of discrepancy examples per column
513
+
435
514
  Returns:
436
515
  ----------
437
516
  Tuple[str, Optional[ComparisonStats], Optional[ComparisonDiffDetails]]
@@ -444,8 +523,12 @@ class DataQualityComparator:
444
523
  self.comparison_stats['compared'] += 1
445
524
 
446
525
  # Execute queries
447
- source_data = self._execute_query((source_query,source_params), source_engine, timezone)
448
- target_data = self._execute_query((target_query,target_params), target_engine, timezone)
526
+ source_data = self._execute_query(
527
+ (source_query, source_params), source_engine, timezone
528
+ )
529
+ target_data = self._execute_query(
530
+ (target_query, target_params), target_engine, timezone
531
+ )
449
532
  app_logger.info('preparing source dataframe')
450
533
  source_data_prepared = prepare_dataframe(source_data)
451
534
  app_logger.info('preparing target dataframe')
@@ -453,43 +536,60 @@ class DataQualityComparator:
453
536
 
454
537
  # Exclude columns if specified
455
538
  exclude_cols = exclude_columns or []
456
- common_cols = [col for col in source_data_prepared.columns
457
- if col in target_data_prepared.columns and col not in exclude_cols]
539
+ common_cols = [
540
+ col
541
+ for col in source_data_prepared.columns
542
+ if col in target_data_prepared.columns and col not in exclude_cols
543
+ ]
458
544
 
459
545
  source_data_filtered = source_data_prepared[common_cols]
460
546
  target_data_filtered = target_data_prepared[common_cols]
461
547
  if 'xrecently_changed' in common_cols:
462
- source_data_filtered, target_data_filtered = clean_recently_changed_data(source_data_filtered, target_data_filtered, custom_primary_key)
548
+ source_data_filtered, target_data_filtered = (
549
+ clean_recently_changed_data(
550
+ source_data_filtered, target_data_filtered, custom_primary_key
551
+ )
552
+ )
463
553
  # Compare dataframes
464
554
  stats, details = compare_dataframes(
465
- source_data_filtered, target_data_filtered, custom_primary_key, max_examples
555
+ source_data_filtered,
556
+ target_data_filtered,
557
+ custom_primary_key,
558
+ max_examples,
466
559
  )
467
560
 
468
561
  if stats:
469
- report = generate_comparison_sample_report(None,
470
- None,
471
- stats,
472
- details,
473
- self.timezone,
474
- source_query,
475
- source_params,
476
- target_query,
477
- target_params
478
- )
479
- status = ct.COMPARISON_FAILED if stats.final_diff_score > tolerance_percentage else ct.COMPARISON_SUCCESS
562
+ report = generate_comparison_sample_report(
563
+ None,
564
+ None,
565
+ stats,
566
+ details,
567
+ self.timezone,
568
+ source_query,
569
+ source_params,
570
+ target_query,
571
+ target_params,
572
+ )
573
+ status = (
574
+ ct.COMPARISON_FAILED
575
+ if stats.final_diff_score > tolerance_percentage
576
+ else ct.COMPARISON_SUCCESS
577
+ )
480
578
  else:
481
579
  status = ct.COMPARISON_SKIPPED
482
580
 
483
-
484
581
  self._update_stats(status, None)
485
582
  return status, report, stats, details
486
583
 
487
584
  except Exception as e:
488
- app_logger.exception("Custom query comparison failed")
585
+ app_logger.exception('Custom query comparison failed')
489
586
  status = ct.COMPARISON_FAILED
490
587
  self._update_stats(status, None)
491
588
  return status, None, None, None
492
- def _get_metadata_cols(self, data_ref: DataReference, engine: Engine) -> pd.DataFrame:
589
+
590
+ def _get_metadata_cols(
591
+ self, data_ref: DataReference, engine: Engine
592
+ ) -> pd.DataFrame:
493
593
  """Get metadata with proper source handling"""
494
594
  adapter = self._get_adapter(DBMSType.from_engine(engine))
495
595
 
@@ -497,13 +597,12 @@ class DataQualityComparator:
497
597
  columns_meta = self._execute_query((query, params), engine)
498
598
 
499
599
  if columns_meta.empty:
500
- raise ValueError(f"Failed to get metadata for: {data_ref.full_name}")
600
+ raise ValueError(f'Failed to get metadata for: {data_ref.full_name}')
501
601
 
502
602
  return columns_meta
503
603
 
504
604
  def _get_metadata_pk(self, data_ref: DataReference, engine: Engine) -> pd.DataFrame:
505
- """Get metadata with proper source handling
506
- """
605
+ """Get metadata with proper source handling"""
507
606
  adapter = self._get_adapter(DBMSType.from_engine(engine))
508
607
 
509
608
  query, params = adapter.build_primary_key_query(data_ref)
@@ -527,19 +626,24 @@ class DataQualityComparator:
527
626
  update_column: str,
528
627
  start_date: Optional[str],
529
628
  end_date: Optional[str],
530
- exclude_recent_hours: Optional[int]
531
- ) -> Tuple[pd.DataFrame, str, Dict] :
629
+ exclude_recent_hours: Optional[int],
630
+ ) -> Tuple[pd.DataFrame, str, Dict]:
532
631
  """Retrieve and prepare table data"""
533
632
  db_type = DBMSType.from_engine(engine)
534
633
  adapter = self._get_adapter(db_type)
535
634
  app_logger.info(db_type)
536
635
 
537
636
  query, params = adapter.build_data_query_common(
538
- data_ref, columns, date_column, update_column,
539
- start_date, end_date, exclude_recent_hours
637
+ data_ref,
638
+ columns,
639
+ date_column,
640
+ update_column,
641
+ start_date,
642
+ end_date,
643
+ exclude_recent_hours,
540
644
  )
541
645
 
542
- df = self._execute_query((query,params), engine, self.timezone)
646
+ df = self._execute_query((query, params), engine, self.timezone)
543
647
 
544
648
  # Apply type conversions
545
649
  df = adapter.convert_types(df, metadata, self.timezone)
@@ -551,9 +655,11 @@ class DataQualityComparator:
551
655
  try:
552
656
  return self.adapters[db_type]
553
657
  except KeyError:
554
- raise ValueError(f"No adapter available for {db_type}")
658
+ raise ValueError(f'No adapter available for {db_type}')
555
659
 
556
- def _execute_query(self, query: Union[str, Tuple[str, Dict]], engine: Engine, timezone: str = None) -> pd.DataFrame:
660
+ def _execute_query(
661
+ self, query: Union[str, Tuple[str, Dict]], engine: Engine, timezone: str = None
662
+ ) -> pd.DataFrame:
557
663
  """Execute SQL query using appropriate adapter"""
558
664
  db_type = DBMSType.from_engine(engine)
559
665
  adapter = self._get_adapter(db_type)
@@ -562,9 +668,7 @@ class DataQualityComparator:
562
668
  return df
563
669
 
564
670
  def _analyze_columns_meta(
565
- self,
566
- source_columns_meta: pd.DataFrame,
567
- target_columns_meta: pd.DataFrame
671
+ self, source_columns_meta: pd.DataFrame, target_columns_meta: pd.DataFrame
568
672
  ) -> tuple[pd.DataFrame, list, list]:
569
673
  """Find common columns between source and target and return unique columns for each"""
570
674
 
@@ -572,8 +676,10 @@ class DataQualityComparator:
572
676
  target_columns = target_columns_meta['column_name'].tolist()
573
677
 
574
678
  common_columns = pd.merge(
575
- source_columns_meta, target_columns_meta,
576
- on='column_name', suffixes=('_source', '_target')
679
+ source_columns_meta,
680
+ target_columns_meta,
681
+ on='column_name',
682
+ suffixes=('_source', '_target'),
577
683
  )
578
684
 
579
685
  source_set = set(source_columns)
@@ -584,13 +690,9 @@ class DataQualityComparator:
584
690
 
585
691
  return common_columns, source_unique, target_unique
586
692
 
587
- def _validate_inputs(
588
- self,
589
- source: DataReference,
590
- target: DataReference
591
- ):
693
+ def _validate_inputs(self, source: DataReference, target: DataReference):
592
694
  """Validate input parameters"""
593
695
  if not isinstance(source, DataReference):
594
- raise TypeError("source must be a DataReference")
696
+ raise TypeError('source must be a DataReference')
595
697
  if not isinstance(target, DataReference):
596
- raise TypeError("target must be a DataReference")
698
+ raise TypeError('target must be a DataReference')