xoverrr 1.1.4__tar.gz → 1.1.5__tar.gz

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xoverrr
3
- Version: 1.1.4
3
+ Version: 1.1.5
4
4
  Summary: A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
5
5
  Author-email: Dmitry Ischenko <hotmori@gmail.com>
6
6
  License: MIT
@@ -40,6 +40,70 @@ Dynamic: license-file
40
40
 
41
41
  A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
42
42
 
43
+ ## Usage Example
44
+ **Sample comparison** (Greenplum vs Oracle):
45
+
46
+ ```python
47
+ from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
48
+ import os
49
+ from datetime import date, timedelta
50
+
51
+ USER_ORA = os.getenv('USER_ORA', '')
52
+ PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
53
+
54
+ USER_GP = os.getenv('USER_GP', '')
55
+ PASSWORD_GP = os.getenv('PASSWORD_GP', '')
56
+
57
+ HOST_ORA = os.getenv('HOST_ORA', '')
58
+ HOST_GP = os.getenv('HOST_GP', '')
59
+
60
+ def create_src_engine(user, password, host):
61
+ """Source engine (Oracle)"""
62
+ os.environ['NLS_LANG'] = '.AL32UTF8'
63
+ return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
64
+
65
+ def create_trg_engine(user, password, host):
66
+ """Target engine (Postgres/Greenplum)"""
67
+ connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
68
+ engine = create_engine(connection_string)
69
+ return engine
70
+
71
+
72
+ src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST_ORA)
73
+ trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST_GP)
74
+
75
+ comparator = DataQualityComparator(
76
+ source_engine=src_engine,
77
+ target_engine=trg_engine,
78
+ timezone='Europe/Athens'
79
+ )
80
+
81
+ source = DataReference("users", "schema1")
82
+ target = DataReference("users", "schema2")
83
+
84
+ FORMAT = '%Y-%m-%d'
85
+ recent_range_end = date.today()
86
+ recent_range_begin = recent_range_end - timedelta(days=1)
87
+
88
+ status, report, stats, details = comparator.compare_sample(
89
+ source,
90
+ target,
91
+ date_column="created_at",
92
+ update_column="modified_date",
93
+ exclude_columns=["audit_timestamp", "internal_id"],
94
+ exclude_recent_hours=3,
95
+ date_range=(
96
+ recent_range_begin.strftime(FORMAT),
97
+ recent_range_end.strftime(FORMAT)
98
+ ),
99
+ tolerance_percentage=0
100
+ )
101
+
102
+ print(report)
103
+ if status == COMPARISON_FAILED:
104
+ raise Exception("Sample check failed")
105
+ ```
106
+
43
107
  ## Key Features
44
108
  - **Multi‑DBMS support**: Oracle, PostgreSQL (+ Greenplum), ClickHouse (extensible via adapter layer) — tables and views.
45
109
  - **Universal connections**: Provide SQLAlchemy Engine objects for source and target databases.
@@ -273,71 +337,3 @@ Logs include timing information and structured context:
273
337
  - If `final_diff_score ≤ tolerance`: status = `COMPARISON_SUCCESS`
274
338
  - Enables configuration of acceptable discrepancy levels.
275
339
 
276
- ---
277
-
278
- ## Usage Example
279
- **Sample comparison** (Greenplum vs Oracle):
280
-
281
- ```python
282
- from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
283
- import os
284
- from datetime import date, timedelta
285
-
286
- USER_ORA = os.getenv('USER_ORA', '')
287
- PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
288
-
289
- USER_GP = os.getenv('USER_GP', '')
290
- PASSWORD_GP = os.getenv('PASSWORD_GP', '')
291
-
292
- HOST = os.getenv('HOST', '')
293
-
294
- def create_src_engine(user, password, host):
295
- """Source engine (Oracle)"""
296
- os.environ['NLS_LANG'] = '.AL32UTF8'
297
- return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
298
-
299
- def create_trg_engine(user, password, host):
300
- """Target engine (Postgres/Greenplum)"""
301
- connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
302
- engine = create_engine(connection_string)
303
- return engine
304
-
305
-
306
-
307
- src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST)
308
- trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST)
309
-
310
- comparator = DataQualityComparator(
311
- source_engine=src_engine,
312
- target_engine=trg_engine,
313
- timezone='Asia/Yekaterinburg'
314
- )
315
-
316
- source = DataReference("users", "schema1")
317
- target = DataReference("users", "schema2")
318
-
319
- FORMAT = '%Y-%m-%d'
320
- recent_range_end = date.today()
321
- recent_range_begin = recent_range_end - timedelta(days=1)
322
-
323
- status, report, stats, details = comparator.compare_sample(
324
- source,
325
- target,
326
- date_column="created_at",
327
- update_column="modified_date",
328
- exclude_columns=["audit_timestamp", "internal_id"],
329
- exclude_recent_hours=24,
330
- date_range=(
331
- recent_range_begin.strftime(FORMAT),
332
- recent_range_end.strftime(FORMAT)
333
- ),
334
- tolerance_percentage=0
335
- )
336
-
337
- print(report)
338
- if status == COMPARISON_FAILED:
339
- raise Exception("Sample check failed")
340
- ```
341
-
342
- ---
343
-
@@ -2,6 +2,70 @@
2
2
 
3
3
  A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
4
4
 
5
+ ## Usage Example
6
+ **Sample comparison** (Greenplum vs Oracle):
7
+
8
+ ```python
9
+ from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
10
+ import os
11
+ from datetime import date, timedelta
12
+
13
+ USER_ORA = os.getenv('USER_ORA', '')
14
+ PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
15
+
16
+ USER_GP = os.getenv('USER_GP', '')
17
+ PASSWORD_GP = os.getenv('PASSWORD_GP', '')
18
+
19
+ HOST_ORA = os.getenv('HOST_ORA', '')
20
+ HOST_GP = os.getenv('HOST_GP', '')
21
+
22
+ def create_src_engine(user, password, host):
23
+ """Source engine (Oracle)"""
24
+ os.environ['NLS_LANG'] = '.AL32UTF8'
25
+ return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
26
+
27
+ def create_trg_engine(user, password, host):
28
+ """Target engine (Postgres/Greenplum)"""
29
+ connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
30
+ engine = create_engine(connection_string)
31
+ return engine
32
+
33
+
34
+ src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST_ORA)
35
+ trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST_GP)
36
+
37
+ comparator = DataQualityComparator(
38
+ source_engine=src_engine,
39
+ target_engine=trg_engine,
40
+ timezone='Europe/Athens'
41
+ )
42
+
43
+ source = DataReference("users", "schema1")
44
+ target = DataReference("users", "schema2")
45
+
46
+ FORMAT = '%Y-%m-%d'
47
+ recent_range_end = date.today()
48
+ recent_range_begin = recent_range_end - timedelta(days=1)
49
+
50
+ status, report, stats, details = comparator.compare_sample(
51
+ source,
52
+ target,
53
+ date_column="created_at",
54
+ update_column="modified_date",
55
+ exclude_columns=["audit_timestamp", "internal_id"],
56
+ exclude_recent_hours=3,
57
+ date_range=(
58
+ recent_range_begin.strftime(FORMAT),
59
+ recent_range_end.strftime(FORMAT)
60
+ ),
61
+ tolerance_percentage=0
62
+ )
63
+
64
+ print(report)
65
+ if status == COMPARISON_FAILED:
66
+ raise Exception("Sample check failed")
67
+ ```
68
+
5
69
  ## Key Features
6
70
  - **Multi‑DBMS support**: Oracle, PostgreSQL (+ Greenplum), ClickHouse (extensible via adapter layer) — tables and views.
7
71
  - **Universal connections**: Provide SQLAlchemy Engine objects for source and target databases.
@@ -235,71 +299,3 @@ Logs include timing information and structured context:
235
299
  - If `final_diff_score ≤ tolerance`: status = `COMPARISON_SUCCESS`
236
300
  - Enables configuration of acceptable discrepancy levels.
237
301
 
238
- ---
239
-
240
- ## Usage Example
241
- **Sample comparison** (Greenplum vs Oracle):
242
-
243
- ```python
244
- from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
245
- import os
246
- from datetime import date, timedelta
247
-
248
- USER_ORA = os.getenv('USER_ORA', '')
249
- PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
250
-
251
- USER_GP = os.getenv('USER_GP', '')
252
- PASSWORD_GP = os.getenv('PASSWORD_GP', '')
253
-
254
- HOST = os.getenv('HOST', '')
255
-
256
- def create_src_engine(user, password, host):
257
- """Source engine (Oracle)"""
258
- os.environ['NLS_LANG'] = '.AL32UTF8'
259
- return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
260
-
261
- def create_trg_engine(user, password, host):
262
- """Target engine (Postgres/Greenplum)"""
263
- connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
264
- engine = create_engine(connection_string)
265
- return engine
266
-
267
-
268
-
269
- src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST)
270
- trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST)
271
-
272
- comparator = DataQualityComparator(
273
- source_engine=src_engine,
274
- target_engine=trg_engine,
275
- timezone='Asia/Yekaterinburg'
276
- )
277
-
278
- source = DataReference("users", "schema1")
279
- target = DataReference("users", "schema2")
280
-
281
- FORMAT = '%Y-%m-%d'
282
- recent_range_end = date.today()
283
- recent_range_begin = recent_range_end - timedelta(days=1)
284
-
285
- status, report, stats, details = comparator.compare_sample(
286
- source,
287
- target,
288
- date_column="created_at",
289
- update_column="modified_date",
290
- exclude_columns=["audit_timestamp", "internal_id"],
291
- exclude_recent_hours=24,
292
- date_range=(
293
- recent_range_begin.strftime(FORMAT),
294
- recent_range_end.strftime(FORMAT)
295
- ),
296
- tolerance_percentage=0
297
- )
298
-
299
- print(report)
300
- if status == COMPARISON_FAILED:
301
- raise Exception("Sample check failed")
302
- ```
303
-
304
- ---
305
-
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
5
5
 
6
6
  [project]
7
7
  name = "xoverrr"
8
- version = "1.1.4"
8
+ version = "1.1.5"
9
9
  description = "A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting."
10
10
  readme = "README.md"
11
11
  requires-python = ">=3.9"
@@ -14,4 +14,4 @@ __all__ = [
14
14
  "COMPARISON_SKIPPED",
15
15
  ]
16
16
 
17
- __version__ = "1.1.4"
17
+ __version__ = "1.1.5"
@@ -101,7 +101,7 @@ class ClickHouseAdapter(BaseDatabaseAdapter):
101
101
  start_date: Optional[str], end_date: Optional[str]) -> Tuple[str, Dict]:
102
102
  query = f"""
103
103
  SELECT
104
- toDate({date_column}) as dt,
104
+ formatDateTime(toDate({date_column}), '%%Y-%%m-%%d') as dt,
105
105
  count(*) as cnt
106
106
  FROM {data_ref.full_name}
107
107
  WHERE 1=1
@@ -161,10 +161,14 @@ class ClickHouseAdapter(BaseDatabaseAdapter):
161
161
 
162
162
  return None, None
163
163
 
164
- def _get_type_conversion_rules(self, timezone:str ) -> Dict[str, Callable]:
164
+ def _get_type_conversion_rules(self, timezone: str) -> Dict[str, Callable]:
165
165
  return {
166
- r'datetime\(': lambda x: pd.to_datetime(x, utc=True, errors='coerce').dt.tz_convert(timezone).dt.tz_localize(None).strftime(DATETIME_FORMAT).str.replace(r'\s00:00:00$', '', regex=True),
167
- r'datetime64': lambda x: pd.to_datetime(x, utc=True, errors='coerce').dt.tz_convert(timezone).dt.tz_localize(None).strftime(DATETIME_FORMAT).str.replace(r'\s00:00:00$', '', regex=True),
168
- r'date': lambda x: pd.to_datetime(x, errors='coerce').dt.strftime(DATE_FORMAT).str.replace(r'\s00:00:00$', '', regex=True),
169
- r'uint64|uint8|float|decimal|int32': lambda x: x.astype(str).str.replace(r'\.0+$', '', regex=True),
170
- }
166
+ r'datetime64|datetime': lambda x: pd.to_datetime(x, utc=True, errors='coerce')
167
+ .dt.tz_convert(timezone)
168
+ .dt.strftime(DATETIME_FORMAT)
169
+ .str.replace(r'\s00:00:00$', '', regex=True),
170
+ r'date': lambda x: pd.to_datetime(x, errors='coerce')
171
+ .dt.strftime(DATE_FORMAT)
172
+ .str.replace(r'\s00:00:00$', '', regex=True),
173
+ r'uint64|uint8|float|decimal|int32': lambda x: x.astype(str).str.replace(r'\.0+$', '', regex=True),
174
+ }
@@ -1,7 +1,7 @@
1
1
  import pandas as pd
2
2
  from typing import Optional, Dict, Callable, List, Tuple, Union
3
- from datetime import datetime, timedelta
4
- from ..constants import DATE_FORMAT,DATETIME_FORMAT
3
+
4
+ from ..constants import DATETIME_FORMAT
5
5
  from .base import BaseDatabaseAdapter, Engine
6
6
  from ..models import DataReference, ObjectType
7
7
  from ..exceptions import QueryExecutionError
@@ -63,7 +63,12 @@ class DataQualityComparator:
63
63
  DBMSType.CLICKHOUSE: ClickHouseAdapter(),
64
64
  }
65
65
  self._reset_stats()
66
+ from . import __version__
66
67
  app_logger.info('start')
68
+ app_logger.info(f'Version: v{__version__}')
69
+ app_logger.info(f'Source DB: {self.source_db_type.name}')
70
+ app_logger.info(f'Target DB: {self.target_db_type.name}')
71
+
67
72
 
68
73
  def reset_stats(self):
69
74
  self._reset_stats()
@@ -208,9 +213,8 @@ class DataQualityComparator:
208
213
  )
209
214
  target_counts = self._execute_query((target_query, target_params), self.target_engine, self.timezone)
210
215
 
216
+
211
217
  source_counts_filled, target_counts_filled = cross_fill_missing_dates(source_counts, target_counts)
212
- source_counts_filled['dt'] = pd.to_datetime(source_counts_filled['dt'], format='%Y-%m-%d')
213
- target_counts_filled['dt'] = pd.to_datetime(target_counts_filled['dt'], format='%Y-%m-%d')
214
218
 
215
219
  merged = source_counts_filled.merge(target_counts_filled, on='dt')
216
220
  total_count_source = source_counts_filled['cnt'].sum()
@@ -3,13 +3,9 @@ import numpy as np
3
3
  from typing import Dict, Any, List, Optional, Tuple, defaultdict
4
4
  from datetime import datetime
5
5
 
6
- try:
7
- from .constants import NULL_REPLACEMENT, DEFAULT_MAX_EXAMPLES, DATETIME_FORMAT
8
- from .logger import app_logger
9
- except ImportError:
10
- # for cases when used as standalone script
11
- from constants import NULL_REPLACEMENT, DEFAULT_MAX_EXAMPLES, DATETIME_FORMAT
12
- from logger import app_logger
6
+
7
+ from .constants import NULL_REPLACEMENT, DEFAULT_MAX_EXAMPLES, DATETIME_FORMAT
8
+ from .logger import app_logger
13
9
 
14
10
  from dataclasses import dataclass, field
15
11
 
@@ -528,7 +524,7 @@ def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
528
524
 
529
525
 
530
526
  df = df.fillna(NULL_REPLACEMENT)
531
- df = df.replace(r'(?i)^(None|nan|NaN|\s*)$', NULL_REPLACEMENT, regex=True)
527
+ df = df.replace(r'(?i)^(None|nan|NaN|NaT|\s*)$', NULL_REPLACEMENT, regex=True)
532
528
 
533
529
  df = df.astype(str)
534
530
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xoverrr
3
- Version: 1.1.4
3
+ Version: 1.1.5
4
4
  Summary: A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
5
5
  Author-email: Dmitry Ischenko <hotmori@gmail.com>
6
6
  License: MIT
@@ -40,6 +40,70 @@ Dynamic: license-file
40
40
 
41
41
  A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
42
42
 
43
+ ## Usage Example
44
+ **Sample comparison** (Greenplum vs Oracle):
45
+
46
+ ```python
47
+ from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
48
+ import os
49
+ from datetime import date, timedelta
50
+
51
+ USER_ORA = os.getenv('USER_ORA', '')
52
+ PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
53
+
54
+ USER_GP = os.getenv('USER_GP', '')
55
+ PASSWORD_GP = os.getenv('PASSWORD_GP', '')
56
+
57
+ HOST_ORA = os.getenv('HOST_ORA', '')
58
+ HOST_GP = os.getenv('HOST_GP', '')
59
+
60
+ def create_src_engine(user, password, host):
61
+ """Source engine (Oracle)"""
62
+ os.environ['NLS_LANG'] = '.AL32UTF8'
63
+ return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
64
+
65
+ def create_trg_engine(user, password, host):
66
+ """Target engine (Postgres/Greenplum)"""
67
+ connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
68
+ engine = create_engine(connection_string)
69
+ return engine
70
+
71
+
72
+ src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST_ORA)
73
+ trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST_GP)
74
+
75
+ comparator = DataQualityComparator(
76
+ source_engine=src_engine,
77
+ target_engine=trg_engine,
78
+ timezone='Europe/Athens'
79
+ )
80
+
81
+ source = DataReference("users", "schema1")
82
+ target = DataReference("users", "schema2")
83
+
84
+ FORMAT = '%Y-%m-%d'
85
+ recent_range_end = date.today()
86
+ recent_range_begin = recent_range_end - timedelta(days=1)
87
+
88
+ status, report, stats, details = comparator.compare_sample(
89
+ source,
90
+ target,
91
+ date_column="created_at",
92
+ update_column="modified_date",
93
+ exclude_columns=["audit_timestamp", "internal_id"],
94
+ exclude_recent_hours=3,
95
+ date_range=(
96
+ recent_range_begin.strftime(FORMAT),
97
+ recent_range_end.strftime(FORMAT)
98
+ ),
99
+ tolerance_percentage=0
100
+ )
101
+
102
+ print(report)
103
+ if status == COMPARISON_FAILED:
104
+ raise Exception("Sample check failed")
105
+ ```
106
+
43
107
  ## Key Features
44
108
  - **Multi‑DBMS support**: Oracle, PostgreSQL (+ Greenplum), ClickHouse (extensible via adapter layer) — tables and views.
45
109
  - **Universal connections**: Provide SQLAlchemy Engine objects for source and target databases.
@@ -273,71 +337,3 @@ Logs include timing information and structured context:
273
337
  - If `final_diff_score ≤ tolerance`: status = `COMPARISON_SUCCESS`
274
338
  - Enables configuration of acceptable discrepancy levels.
275
339
 
276
- ---
277
-
278
- ## Usage Example
279
- **Sample comparison** (Greenplum vs Oracle):
280
-
281
- ```python
282
- from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
283
- import os
284
- from datetime import date, timedelta
285
-
286
- USER_ORA = os.getenv('USER_ORA', '')
287
- PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
288
-
289
- USER_GP = os.getenv('USER_GP', '')
290
- PASSWORD_GP = os.getenv('PASSWORD_GP', '')
291
-
292
- HOST = os.getenv('HOST', '')
293
-
294
- def create_src_engine(user, password, host):
295
- """Source engine (Oracle)"""
296
- os.environ['NLS_LANG'] = '.AL32UTF8'
297
- return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
298
-
299
- def create_trg_engine(user, password, host):
300
- """Target engine (Postgres/Greenplum)"""
301
- connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
302
- engine = create_engine(connection_string)
303
- return engine
304
-
305
-
306
-
307
- src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST)
308
- trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST)
309
-
310
- comparator = DataQualityComparator(
311
- source_engine=src_engine,
312
- target_engine=trg_engine,
313
- timezone='Asia/Yekaterinburg'
314
- )
315
-
316
- source = DataReference("users", "schema1")
317
- target = DataReference("users", "schema2")
318
-
319
- FORMAT = '%Y-%m-%d'
320
- recent_range_end = date.today()
321
- recent_range_begin = recent_range_end - timedelta(days=1)
322
-
323
- status, report, stats, details = comparator.compare_sample(
324
- source,
325
- target,
326
- date_column="created_at",
327
- update_column="modified_date",
328
- exclude_columns=["audit_timestamp", "internal_id"],
329
- exclude_recent_hours=24,
330
- date_range=(
331
- recent_range_begin.strftime(FORMAT),
332
- recent_range_end.strftime(FORMAT)
333
- ),
334
- tolerance_percentage=0
335
- )
336
-
337
- print(report)
338
- if status == COMPARISON_FAILED:
339
- raise Exception("Sample check failed")
340
- ```
341
-
342
- ---
343
-
File without changes
File without changes
File without changes
File without changes