xoverrr 1.1.4__tar.gz → 1.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xoverrr-1.1.4/src/xoverrr.egg-info → xoverrr-1.1.5}/PKG-INFO +65 -69
- {xoverrr-1.1.4 → xoverrr-1.1.5}/README.md +64 -68
- {xoverrr-1.1.4 → xoverrr-1.1.5}/pyproject.toml +1 -1
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/__init__.py +1 -1
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/adapters/clickhouse.py +11 -7
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/adapters/oracle.py +2 -2
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/core.py +6 -2
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/utils.py +4 -8
- {xoverrr-1.1.4 → xoverrr-1.1.5/src/xoverrr.egg-info}/PKG-INFO +65 -69
- {xoverrr-1.1.4 → xoverrr-1.1.5}/LICENSE +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/setup.cfg +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/adapters/__init__.py +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/adapters/base.py +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/adapters/postgres.py +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/constants.py +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/exceptions.py +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/logger.py +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr/models.py +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr.egg-info/SOURCES.txt +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr.egg-info/dependency_links.txt +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr.egg-info/requires.txt +0 -0
- {xoverrr-1.1.4 → xoverrr-1.1.5}/src/xoverrr.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xoverrr
|
|
3
|
-
Version: 1.1.4
|
|
3
|
+
Version: 1.1.5
|
|
4
4
|
Summary: A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
|
|
5
5
|
Author-email: Dmitry Ischenko <hotmori@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -40,6 +40,70 @@ Dynamic: license-file
|
|
|
40
40
|
|
|
41
41
|
A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
|
|
42
42
|
|
|
43
|
+
## Usage Example
|
|
44
|
+
**Sample comparison** (Greenplum vs Oracle):
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
|
|
48
|
+
import os
|
|
49
|
+
from datetime import date, timedelta
|
|
50
|
+
|
|
51
|
+
USER_ORA = os.getenv('USER_ORA', '')
|
|
52
|
+
PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
|
|
53
|
+
|
|
54
|
+
USER_GP = os.getenv('USER_GP', '')
|
|
55
|
+
PASSWORD_GP = os.getenv('PASSWORD_GP', '')
|
|
56
|
+
|
|
57
|
+
HOST_ORA = os.getenv('HOST_ORA', '')
|
|
58
|
+
HOST_GP = os.getenv('HOST_GP', '')
|
|
59
|
+
|
|
60
|
+
def create_src_engine(user, password, host):
|
|
61
|
+
"""Source engine (Oracle)"""
|
|
62
|
+
os.environ['NLS_LANG'] = '.AL32UTF8'
|
|
63
|
+
return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
|
|
64
|
+
|
|
65
|
+
def create_trg_engine(user, password, host):
|
|
66
|
+
"""Target engine (Postgres/Greenplum)"""
|
|
67
|
+
connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
|
|
68
|
+
engine = create_engine(connection_string)
|
|
69
|
+
return engine
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST_ORA)
|
|
73
|
+
trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST_GP)
|
|
74
|
+
|
|
75
|
+
comparator = DataQualityComparator(
|
|
76
|
+
source_engine=src_engine,
|
|
77
|
+
target_engine=trg_engine,
|
|
78
|
+
timezone='Europe/Athens'
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
source = DataReference("users", "schema1")
|
|
82
|
+
target = DataReference("users", "schema2")
|
|
83
|
+
|
|
84
|
+
FORMAT = '%Y-%m-%d'
|
|
85
|
+
recent_range_end = date.today()
|
|
86
|
+
recent_range_begin = recent_range_end - timedelta(days=1)
|
|
87
|
+
|
|
88
|
+
status, report, stats, details = comparator.compare_sample(
|
|
89
|
+
source,
|
|
90
|
+
target,
|
|
91
|
+
date_column="created_at",
|
|
92
|
+
update_column="modified_date",
|
|
93
|
+
exclude_columns=["audit_timestamp", "internal_id"],
|
|
94
|
+
exclude_recent_hours=3,
|
|
95
|
+
date_range=(
|
|
96
|
+
recent_range_begin.strftime(FORMAT),
|
|
97
|
+
recent_range_end.strftime(FORMAT)
|
|
98
|
+
),
|
|
99
|
+
tolerance_percentage=0
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
print(report)
|
|
103
|
+
if status == COMPARISON_FAILED:
|
|
104
|
+
raise Exception("Sample check failed")
|
|
105
|
+
```
|
|
106
|
+
|
|
43
107
|
## Key Features
|
|
44
108
|
- **Multi‑DBMS support**: Oracle, PostgreSQL (+ Greenplum), ClickHouse (extensible via adapter layer) — tables and views.
|
|
45
109
|
- **Universal connections**: Provide SQLAlchemy Engine objects for source and target databases.
|
|
@@ -273,71 +337,3 @@ Logs include timing information and structured context:
|
|
|
273
337
|
- If `final_diff_score ≤ tolerance`: status = `COMPARISON_SUCCESS`
|
|
274
338
|
- Enables configuration of acceptable discrepancy levels.
|
|
275
339
|
|
|
276
|
-
---
|
|
277
|
-
|
|
278
|
-
## Usage Example
|
|
279
|
-
**Sample comparison** (Greenplum vs Oracle):
|
|
280
|
-
|
|
281
|
-
```python
|
|
282
|
-
from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
|
|
283
|
-
import os
|
|
284
|
-
from datetime import date, timedelta
|
|
285
|
-
|
|
286
|
-
USER_ORA = os.getenv('USER_ORA', '')
|
|
287
|
-
PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
|
|
288
|
-
|
|
289
|
-
USER_GP = os.getenv('USER_GP', '')
|
|
290
|
-
PASSWORD_GP = os.getenv('PASSWORD_GP', '')
|
|
291
|
-
|
|
292
|
-
HOST = os.getenv('HOST', '')
|
|
293
|
-
|
|
294
|
-
def create_src_engine(user, password, host):
|
|
295
|
-
"""Source engine (Oracle)"""
|
|
296
|
-
os.environ['NLS_LANG'] = '.AL32UTF8'
|
|
297
|
-
return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
|
|
298
|
-
|
|
299
|
-
def create_trg_engine(user, password, host):
|
|
300
|
-
"""Target engine (Postgres/Greenplum)"""
|
|
301
|
-
connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
|
|
302
|
-
engine = create_engine(connection_string)
|
|
303
|
-
return engine
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST)
|
|
308
|
-
trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST)
|
|
309
|
-
|
|
310
|
-
comparator = DataQualityComparator(
|
|
311
|
-
source_engine=src_engine,
|
|
312
|
-
target_engine=trg_engine,
|
|
313
|
-
timezone='Asia/Yekaterinburg'
|
|
314
|
-
)
|
|
315
|
-
|
|
316
|
-
source = DataReference("users", "schema1")
|
|
317
|
-
target = DataReference("users", "schema2")
|
|
318
|
-
|
|
319
|
-
FORMAT = '%Y-%m-%d'
|
|
320
|
-
recent_range_end = date.today()
|
|
321
|
-
recent_range_begin = recent_range_end - timedelta(days=1)
|
|
322
|
-
|
|
323
|
-
status, report, stats, details = comparator.compare_sample(
|
|
324
|
-
source,
|
|
325
|
-
target,
|
|
326
|
-
date_column="created_at",
|
|
327
|
-
update_column="modified_date",
|
|
328
|
-
exclude_columns=["audit_timestamp", "internal_id"],
|
|
329
|
-
exclude_recent_hours=24,
|
|
330
|
-
date_range=(
|
|
331
|
-
recent_range_begin.strftime(FORMAT),
|
|
332
|
-
recent_range_end.strftime(FORMAT)
|
|
333
|
-
),
|
|
334
|
-
tolerance_percentage=0
|
|
335
|
-
)
|
|
336
|
-
|
|
337
|
-
print(report)
|
|
338
|
-
if status == COMPARISON_FAILED:
|
|
339
|
-
raise Exception("Sample check failed")
|
|
340
|
-
```
|
|
341
|
-
|
|
342
|
-
---
|
|
343
|
-
|
|
@@ -2,6 +2,70 @@
|
|
|
2
2
|
|
|
3
3
|
A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
|
|
4
4
|
|
|
5
|
+
## Usage Example
|
|
6
|
+
**Sample comparison** (Greenplum vs Oracle):
|
|
7
|
+
|
|
8
|
+
```python
|
|
9
|
+
from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
|
|
10
|
+
import os
|
|
11
|
+
from datetime import date, timedelta
|
|
12
|
+
|
|
13
|
+
USER_ORA = os.getenv('USER_ORA', '')
|
|
14
|
+
PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
|
|
15
|
+
|
|
16
|
+
USER_GP = os.getenv('USER_GP', '')
|
|
17
|
+
PASSWORD_GP = os.getenv('PASSWORD_GP', '')
|
|
18
|
+
|
|
19
|
+
HOST_ORA = os.getenv('HOST_ORA', '')
|
|
20
|
+
HOST_GP = os.getenv('HOST_GP', '')
|
|
21
|
+
|
|
22
|
+
def create_src_engine(user, password, host):
|
|
23
|
+
"""Source engine (Oracle)"""
|
|
24
|
+
os.environ['NLS_LANG'] = '.AL32UTF8'
|
|
25
|
+
return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
|
|
26
|
+
|
|
27
|
+
def create_trg_engine(user, password, host):
|
|
28
|
+
"""Target engine (Postgres/Greenplum)"""
|
|
29
|
+
connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
|
|
30
|
+
engine = create_engine(connection_string)
|
|
31
|
+
return engine
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST_ORA)
|
|
35
|
+
trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST_GP)
|
|
36
|
+
|
|
37
|
+
comparator = DataQualityComparator(
|
|
38
|
+
source_engine=src_engine,
|
|
39
|
+
target_engine=trg_engine,
|
|
40
|
+
timezone='Europe/Athens'
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
source = DataReference("users", "schema1")
|
|
44
|
+
target = DataReference("users", "schema2")
|
|
45
|
+
|
|
46
|
+
FORMAT = '%Y-%m-%d'
|
|
47
|
+
recent_range_end = date.today()
|
|
48
|
+
recent_range_begin = recent_range_end - timedelta(days=1)
|
|
49
|
+
|
|
50
|
+
status, report, stats, details = comparator.compare_sample(
|
|
51
|
+
source,
|
|
52
|
+
target,
|
|
53
|
+
date_column="created_at",
|
|
54
|
+
update_column="modified_date",
|
|
55
|
+
exclude_columns=["audit_timestamp", "internal_id"],
|
|
56
|
+
exclude_recent_hours=3,
|
|
57
|
+
date_range=(
|
|
58
|
+
recent_range_begin.strftime(FORMAT),
|
|
59
|
+
recent_range_end.strftime(FORMAT)
|
|
60
|
+
),
|
|
61
|
+
tolerance_percentage=0
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
print(report)
|
|
65
|
+
if status == COMPARISON_FAILED:
|
|
66
|
+
raise Exception("Sample check failed")
|
|
67
|
+
```
|
|
68
|
+
|
|
5
69
|
## Key Features
|
|
6
70
|
- **Multi‑DBMS support**: Oracle, PostgreSQL (+ Greenplum), ClickHouse (extensible via adapter layer) — tables and views.
|
|
7
71
|
- **Universal connections**: Provide SQLAlchemy Engine objects for source and target databases.
|
|
@@ -235,71 +299,3 @@ Logs include timing information and structured context:
|
|
|
235
299
|
- If `final_diff_score ≤ tolerance`: status = `COMPARISON_SUCCESS`
|
|
236
300
|
- Enables configuration of acceptable discrepancy levels.
|
|
237
301
|
|
|
238
|
-
---
|
|
239
|
-
|
|
240
|
-
## Usage Example
|
|
241
|
-
**Sample comparison** (Greenplum vs Oracle):
|
|
242
|
-
|
|
243
|
-
```python
|
|
244
|
-
from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
|
|
245
|
-
import os
|
|
246
|
-
from datetime import date, timedelta
|
|
247
|
-
|
|
248
|
-
USER_ORA = os.getenv('USER_ORA', '')
|
|
249
|
-
PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
|
|
250
|
-
|
|
251
|
-
USER_GP = os.getenv('USER_GP', '')
|
|
252
|
-
PASSWORD_GP = os.getenv('PASSWORD_GP', '')
|
|
253
|
-
|
|
254
|
-
HOST = os.getenv('HOST', '')
|
|
255
|
-
|
|
256
|
-
def create_src_engine(user, password, host):
|
|
257
|
-
"""Source engine (Oracle)"""
|
|
258
|
-
os.environ['NLS_LANG'] = '.AL32UTF8'
|
|
259
|
-
return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
|
|
260
|
-
|
|
261
|
-
def create_trg_engine(user, password, host):
|
|
262
|
-
"""Target engine (Postgres/Greenplum)"""
|
|
263
|
-
connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
|
|
264
|
-
engine = create_engine(connection_string)
|
|
265
|
-
return engine
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST)
|
|
270
|
-
trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST)
|
|
271
|
-
|
|
272
|
-
comparator = DataQualityComparator(
|
|
273
|
-
source_engine=src_engine,
|
|
274
|
-
target_engine=trg_engine,
|
|
275
|
-
timezone='Asia/Yekaterinburg'
|
|
276
|
-
)
|
|
277
|
-
|
|
278
|
-
source = DataReference("users", "schema1")
|
|
279
|
-
target = DataReference("users", "schema2")
|
|
280
|
-
|
|
281
|
-
FORMAT = '%Y-%m-%d'
|
|
282
|
-
recent_range_end = date.today()
|
|
283
|
-
recent_range_begin = recent_range_end - timedelta(days=1)
|
|
284
|
-
|
|
285
|
-
status, report, stats, details = comparator.compare_sample(
|
|
286
|
-
source,
|
|
287
|
-
target,
|
|
288
|
-
date_column="created_at",
|
|
289
|
-
update_column="modified_date",
|
|
290
|
-
exclude_columns=["audit_timestamp", "internal_id"],
|
|
291
|
-
exclude_recent_hours=24,
|
|
292
|
-
date_range=(
|
|
293
|
-
recent_range_begin.strftime(FORMAT),
|
|
294
|
-
recent_range_end.strftime(FORMAT)
|
|
295
|
-
),
|
|
296
|
-
tolerance_percentage=0
|
|
297
|
-
)
|
|
298
|
-
|
|
299
|
-
print(report)
|
|
300
|
-
if status == COMPARISON_FAILED:
|
|
301
|
-
raise Exception("Sample check failed")
|
|
302
|
-
```
|
|
303
|
-
|
|
304
|
-
---
|
|
305
|
-
|
|
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "xoverrr"
|
|
8
|
-
version = "1.1.4"
|
|
8
|
+
version = "1.1.5"
|
|
9
9
|
description = "A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting."
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
requires-python = ">=3.9"
|
|
@@ -101,7 +101,7 @@ class ClickHouseAdapter(BaseDatabaseAdapter):
|
|
|
101
101
|
start_date: Optional[str], end_date: Optional[str]) -> Tuple[str, Dict]:
|
|
102
102
|
query = f"""
|
|
103
103
|
SELECT
|
|
104
|
-
toDate({date_column}) as dt,
|
|
104
|
+
formatDateTime(toDate({date_column}), '%%Y-%%m-%%d') as dt,
|
|
105
105
|
count(*) as cnt
|
|
106
106
|
FROM {data_ref.full_name}
|
|
107
107
|
WHERE 1=1
|
|
@@ -161,10 +161,14 @@ class ClickHouseAdapter(BaseDatabaseAdapter):
|
|
|
161
161
|
|
|
162
162
|
return None, None
|
|
163
163
|
|
|
164
|
-
def _get_type_conversion_rules(self, timezone:str
|
|
164
|
+
def _get_type_conversion_rules(self, timezone: str) -> Dict[str, Callable]:
|
|
165
165
|
return {
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
166
|
+
r'datetime64|datetime': lambda x: pd.to_datetime(x, utc=True, errors='coerce')
|
|
167
|
+
.dt.tz_convert(timezone)
|
|
168
|
+
.dt.strftime(DATETIME_FORMAT)
|
|
169
|
+
.str.replace(r'\s00:00:00$', '', regex=True),
|
|
170
|
+
r'date': lambda x: pd.to_datetime(x, errors='coerce')
|
|
171
|
+
.dt.strftime(DATE_FORMAT)
|
|
172
|
+
.str.replace(r'\s00:00:00$', '', regex=True),
|
|
173
|
+
r'uint64|uint8|float|decimal|int32': lambda x: x.astype(str).str.replace(r'\.0+$', '', regex=True),
|
|
174
|
+
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import pandas as pd
|
|
2
2
|
from typing import Optional, Dict, Callable, List, Tuple, Union
|
|
3
|
-
|
|
4
|
-
from ..constants import
|
|
3
|
+
|
|
4
|
+
from ..constants import DATETIME_FORMAT
|
|
5
5
|
from .base import BaseDatabaseAdapter, Engine
|
|
6
6
|
from ..models import DataReference, ObjectType
|
|
7
7
|
from ..exceptions import QueryExecutionError
|
|
@@ -63,7 +63,12 @@ class DataQualityComparator:
|
|
|
63
63
|
DBMSType.CLICKHOUSE: ClickHouseAdapter(),
|
|
64
64
|
}
|
|
65
65
|
self._reset_stats()
|
|
66
|
+
from . import __version__
|
|
66
67
|
app_logger.info('start')
|
|
68
|
+
app_logger.info(f'Version: v{__version__}')
|
|
69
|
+
app_logger.info(f'Source DB: {self.source_db_type.name}')
|
|
70
|
+
app_logger.info(f'Target DB: {self.target_db_type.name}')
|
|
71
|
+
|
|
67
72
|
|
|
68
73
|
def reset_stats(self):
|
|
69
74
|
self._reset_stats()
|
|
@@ -208,9 +213,8 @@ class DataQualityComparator:
|
|
|
208
213
|
)
|
|
209
214
|
target_counts = self._execute_query((target_query, target_params), self.target_engine, self.timezone)
|
|
210
215
|
|
|
216
|
+
|
|
211
217
|
source_counts_filled, target_counts_filled = cross_fill_missing_dates(source_counts, target_counts)
|
|
212
|
-
source_counts_filled['dt'] = pd.to_datetime(source_counts_filled['dt'], format='%Y-%m-%d')
|
|
213
|
-
target_counts_filled['dt'] = pd.to_datetime(target_counts_filled['dt'], format='%Y-%m-%d')
|
|
214
218
|
|
|
215
219
|
merged = source_counts_filled.merge(target_counts_filled, on='dt')
|
|
216
220
|
total_count_source = source_counts_filled['cnt'].sum()
|
|
@@ -3,13 +3,9 @@ import numpy as np
|
|
|
3
3
|
from typing import Dict, Any, List, Optional, Tuple, defaultdict
|
|
4
4
|
from datetime import datetime
|
|
5
5
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
except ImportError:
|
|
10
|
-
# for cases when used as standalone script
|
|
11
|
-
from constants import NULL_REPLACEMENT, DEFAULT_MAX_EXAMPLES, DATETIME_FORMAT
|
|
12
|
-
from logger import app_logger
|
|
6
|
+
|
|
7
|
+
from .constants import NULL_REPLACEMENT, DEFAULT_MAX_EXAMPLES, DATETIME_FORMAT
|
|
8
|
+
from .logger import app_logger
|
|
13
9
|
|
|
14
10
|
from dataclasses import dataclass, field
|
|
15
11
|
|
|
@@ -528,7 +524,7 @@ def prepare_dataframe(df: pd.DataFrame) -> pd.DataFrame:
|
|
|
528
524
|
|
|
529
525
|
|
|
530
526
|
df = df.fillna(NULL_REPLACEMENT)
|
|
531
|
-
df = df.replace(r'(?i)^(None|nan|NaN|\s*)$', NULL_REPLACEMENT, regex=True)
|
|
527
|
+
df = df.replace(r'(?i)^(None|nan|NaN|NaT|\s*)$', NULL_REPLACEMENT, regex=True)
|
|
532
528
|
|
|
533
529
|
df = df.astype(str)
|
|
534
530
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xoverrr
|
|
3
|
-
Version: 1.1.4
|
|
3
|
+
Version: 1.1.5
|
|
4
4
|
Summary: A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
|
|
5
5
|
Author-email: Dmitry Ischenko <hotmori@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -40,6 +40,70 @@ Dynamic: license-file
|
|
|
40
40
|
|
|
41
41
|
A tool for cross-database and intra-source data comparison with detailed discrepancy analysis and reporting.
|
|
42
42
|
|
|
43
|
+
## Usage Example
|
|
44
|
+
**Sample comparison** (Greenplum vs Oracle):
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
|
|
48
|
+
import os
|
|
49
|
+
from datetime import date, timedelta
|
|
50
|
+
|
|
51
|
+
USER_ORA = os.getenv('USER_ORA', '')
|
|
52
|
+
PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
|
|
53
|
+
|
|
54
|
+
USER_GP = os.getenv('USER_GP', '')
|
|
55
|
+
PASSWORD_GP = os.getenv('PASSWORD_GP', '')
|
|
56
|
+
|
|
57
|
+
HOST_ORA = os.getenv('HOST_ORA', '')
|
|
58
|
+
HOST_GP = os.getenv('HOST_GP', '')
|
|
59
|
+
|
|
60
|
+
def create_src_engine(user, password, host):
|
|
61
|
+
"""Source engine (Oracle)"""
|
|
62
|
+
os.environ['NLS_LANG'] = '.AL32UTF8'
|
|
63
|
+
return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
|
|
64
|
+
|
|
65
|
+
def create_trg_engine(user, password, host):
|
|
66
|
+
"""Target engine (Postgres/Greenplum)"""
|
|
67
|
+
connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
|
|
68
|
+
engine = create_engine(connection_string)
|
|
69
|
+
return engine
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST_ORA)
|
|
73
|
+
trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST_GP)
|
|
74
|
+
|
|
75
|
+
comparator = DataQualityComparator(
|
|
76
|
+
source_engine=src_engine,
|
|
77
|
+
target_engine=trg_engine,
|
|
78
|
+
timezone='Europe/Athens'
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
source = DataReference("users", "schema1")
|
|
82
|
+
target = DataReference("users", "schema2")
|
|
83
|
+
|
|
84
|
+
FORMAT = '%Y-%m-%d'
|
|
85
|
+
recent_range_end = date.today()
|
|
86
|
+
recent_range_begin = recent_range_end - timedelta(days=1)
|
|
87
|
+
|
|
88
|
+
status, report, stats, details = comparator.compare_sample(
|
|
89
|
+
source,
|
|
90
|
+
target,
|
|
91
|
+
date_column="created_at",
|
|
92
|
+
update_column="modified_date",
|
|
93
|
+
exclude_columns=["audit_timestamp", "internal_id"],
|
|
94
|
+
exclude_recent_hours=3,
|
|
95
|
+
date_range=(
|
|
96
|
+
recent_range_begin.strftime(FORMAT),
|
|
97
|
+
recent_range_end.strftime(FORMAT)
|
|
98
|
+
),
|
|
99
|
+
tolerance_percentage=0
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
print(report)
|
|
103
|
+
if status == COMPARISON_FAILED:
|
|
104
|
+
raise Exception("Sample check failed")
|
|
105
|
+
```
|
|
106
|
+
|
|
43
107
|
## Key Features
|
|
44
108
|
- **Multi‑DBMS support**: Oracle, PostgreSQL (+ Greenplum), ClickHouse (extensible via adapter layer) — tables and views.
|
|
45
109
|
- **Universal connections**: Provide SQLAlchemy Engine objects for source and target databases.
|
|
@@ -273,71 +337,3 @@ Logs include timing information and structured context:
|
|
|
273
337
|
- If `final_diff_score ≤ tolerance`: status = `COMPARISON_SUCCESS`
|
|
274
338
|
- Enables configuration of acceptable discrepancy levels.
|
|
275
339
|
|
|
276
|
-
---
|
|
277
|
-
|
|
278
|
-
## Usage Example
|
|
279
|
-
**Sample comparison** (Greenplum vs Oracle):
|
|
280
|
-
|
|
281
|
-
```python
|
|
282
|
-
from xoverrr import DataQualityComparator, DataReference, COMPARISON_SUCCESS, COMPARISON_FAILED, COMPARISON_SKIPPED
|
|
283
|
-
import os
|
|
284
|
-
from datetime import date, timedelta
|
|
285
|
-
|
|
286
|
-
USER_ORA = os.getenv('USER_ORA', '')
|
|
287
|
-
PASSWORD_ORA = os.getenv('PASSWORD_ORA', '')
|
|
288
|
-
|
|
289
|
-
USER_GP = os.getenv('USER_GP', '')
|
|
290
|
-
PASSWORD_GP = os.getenv('PASSWORD_GP', '')
|
|
291
|
-
|
|
292
|
-
HOST = os.getenv('HOST', '')
|
|
293
|
-
|
|
294
|
-
def create_src_engine(user, password, host):
|
|
295
|
-
"""Source engine (Oracle)"""
|
|
296
|
-
os.environ['NLS_LANG'] = '.AL32UTF8'
|
|
297
|
-
return create_engine(f'oracle+oracledb://{user}:{password}@{host}:1521/?service_name=dwh')
|
|
298
|
-
|
|
299
|
-
def create_trg_engine(user, password, host):
|
|
300
|
-
"""Target engine (Postgres/Greenplum)"""
|
|
301
|
-
connection_string = f'postgresql+psycopg2://{user}:{password}@{host}:5432/adb'
|
|
302
|
-
engine = create_engine(connection_string)
|
|
303
|
-
return engine
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
src_engine = create_src_engine(USER_ORA, PASSWORD_ORA, HOST)
|
|
308
|
-
trg_engine = create_trg_engine(USER_GP, PASSWORD_GP, HOST)
|
|
309
|
-
|
|
310
|
-
comparator = DataQualityComparator(
|
|
311
|
-
source_engine=src_engine,
|
|
312
|
-
target_engine=trg_engine,
|
|
313
|
-
timezone='Asia/Yekaterinburg'
|
|
314
|
-
)
|
|
315
|
-
|
|
316
|
-
source = DataReference("users", "schema1")
|
|
317
|
-
target = DataReference("users", "schema2")
|
|
318
|
-
|
|
319
|
-
FORMAT = '%Y-%m-%d'
|
|
320
|
-
recent_range_end = date.today()
|
|
321
|
-
recent_range_begin = recent_range_end - timedelta(days=1)
|
|
322
|
-
|
|
323
|
-
status, report, stats, details = comparator.compare_sample(
|
|
324
|
-
source,
|
|
325
|
-
target,
|
|
326
|
-
date_column="created_at",
|
|
327
|
-
update_column="modified_date",
|
|
328
|
-
exclude_columns=["audit_timestamp", "internal_id"],
|
|
329
|
-
exclude_recent_hours=24,
|
|
330
|
-
date_range=(
|
|
331
|
-
recent_range_begin.strftime(FORMAT),
|
|
332
|
-
recent_range_end.strftime(FORMAT)
|
|
333
|
-
),
|
|
334
|
-
tolerance_percentage=0
|
|
335
|
-
)
|
|
336
|
-
|
|
337
|
-
print(report)
|
|
338
|
-
if status == COMPARISON_FAILED:
|
|
339
|
-
raise Exception("Sample check failed")
|
|
340
|
-
```
|
|
341
|
-
|
|
342
|
-
---
|
|
343
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|