dcs-sdk 1.4.8__py3-none-any.whl → 1.4.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/databases/base.py +3 -0
- data_diff/hashdiff_tables.py +1 -1
- data_diff/table_segment.py +74 -10
- data_diff/utils.py +3 -0
- dcs_sdk/__version__.py +1 -1
- dcs_sdk/sdk/data_diff/data_differ.py +11 -4
- {dcs_sdk-1.4.8.dist-info → dcs_sdk-1.4.9.dist-info}/METADATA +2 -2
- {dcs_sdk-1.4.8.dist-info → dcs_sdk-1.4.9.dist-info}/RECORD +10 -10
- {dcs_sdk-1.4.8.dist-info → dcs_sdk-1.4.9.dist-info}/WHEEL +0 -0
- {dcs_sdk-1.4.8.dist-info → dcs_sdk-1.4.9.dist-info}/entry_points.txt +0 -0
data_diff/databases/base.py
CHANGED
|
@@ -54,6 +54,7 @@ from typing_extensions import Self
|
|
|
54
54
|
from data_diff.abcs.compiler import AbstractCompiler, Compilable
|
|
55
55
|
from data_diff.abcs.database_types import (
|
|
56
56
|
JSON,
|
|
57
|
+
ArithAlphanumeric,
|
|
57
58
|
Array,
|
|
58
59
|
Boolean,
|
|
59
60
|
ColType,
|
|
@@ -753,6 +754,8 @@ class BaseDialect(abc.ABC):
|
|
|
753
754
|
return f"'{v.decode()}'"
|
|
754
755
|
elif isinstance(v, Code):
|
|
755
756
|
return v.code
|
|
757
|
+
elif isinstance(v, ArithAlphanumeric):
|
|
758
|
+
return f"'{v._str}'"
|
|
756
759
|
return repr(v)
|
|
757
760
|
|
|
758
761
|
def constant_values(self, rows) -> str:
|
data_diff/hashdiff_tables.py
CHANGED
|
@@ -522,7 +522,7 @@ class HashDiffer(TableDiffer):
|
|
|
522
522
|
|
|
523
523
|
if count1 == 0 and count2 == 0:
|
|
524
524
|
logger.debug(
|
|
525
|
-
"Uneven distribution of keys detected in segment
|
|
525
|
+
"Uneven distribution of keys detected in segment {}..{} (big gaps in the key column). "
|
|
526
526
|
"For better performance, we recommend to increase the bisection-threshold.",
|
|
527
527
|
table1.min_key,
|
|
528
528
|
table1.max_key,
|
data_diff/table_segment.py
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
import logging
|
|
16
16
|
import time
|
|
17
|
+
from decimal import Decimal
|
|
17
18
|
from itertools import product
|
|
18
19
|
from typing import Container, Dict, List, Optional, Sequence, Tuple
|
|
19
20
|
|
|
@@ -24,7 +25,18 @@ from typing_extensions import Self
|
|
|
24
25
|
|
|
25
26
|
from data_diff.abcs.database_types import DbKey, DbPath, DbTime, IKey, NumericType
|
|
26
27
|
from data_diff.databases.base import Database
|
|
27
|
-
from data_diff.queries.api import
|
|
28
|
+
from data_diff.queries.api import (
|
|
29
|
+
SKIP,
|
|
30
|
+
Code,
|
|
31
|
+
Count,
|
|
32
|
+
Expr,
|
|
33
|
+
and_,
|
|
34
|
+
max_,
|
|
35
|
+
min_,
|
|
36
|
+
or_,
|
|
37
|
+
table,
|
|
38
|
+
this,
|
|
39
|
+
)
|
|
28
40
|
from data_diff.queries.extras import (
|
|
29
41
|
ApplyFuncAndNormalizeAsString,
|
|
30
42
|
Checksum,
|
|
@@ -54,6 +66,10 @@ def split_key_space(min_key: DbKey, max_key: DbKey, count: int) -> List[DbKey]:
|
|
|
54
66
|
assert type(min_key) is type(max_key)
|
|
55
67
|
checkpoints = min_key.range(max_key, count)
|
|
56
68
|
else:
|
|
69
|
+
if isinstance(min_key, Decimal):
|
|
70
|
+
min_key = float(min_key)
|
|
71
|
+
if isinstance(max_key, Decimal):
|
|
72
|
+
max_key = float(max_key)
|
|
57
73
|
checkpoints = split_space(min_key, max_key, count)
|
|
58
74
|
|
|
59
75
|
assert all(min_key < x < max_key for x in checkpoints)
|
|
@@ -288,17 +304,65 @@ class TableSegment:
|
|
|
288
304
|
|
|
289
305
|
return result
|
|
290
306
|
|
|
291
|
-
def get_sample_data(self, limit: int = 100) -> list:
|
|
292
|
-
|
|
307
|
+
# def get_sample_data(self, limit: int = 100) -> list:
|
|
308
|
+
# "Download all the relevant values of the segment from the database"
|
|
309
|
+
|
|
310
|
+
# exprs = []
|
|
311
|
+
# for c in self.key_columns:
|
|
312
|
+
# quoted = self.database.dialect.quote(c)
|
|
313
|
+
# exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
|
|
314
|
+
# if self.where:
|
|
315
|
+
# select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
|
|
316
|
+
# self.key_columns
|
|
317
|
+
# else:
|
|
318
|
+
# select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
|
|
319
|
+
|
|
320
|
+
# start_time = time.monotonic()
|
|
321
|
+
# result = self.database.query(select, List[Tuple])
|
|
322
|
+
# query_time_ms = (time.monotonic() - start_time) * 1000
|
|
323
|
+
# self._update_stats("row_fetch_queries_stats", query_time_ms)
|
|
324
|
+
|
|
325
|
+
def get_sample_data(self, limit: int = 100, sample_keys: Optional[List[List[DbKey]]] = None) -> list:
|
|
326
|
+
"""
|
|
327
|
+
Download relevant values of the segment from the database.
|
|
328
|
+
If `sample_keys` is provided, it filters rows matching those composite keys.
|
|
329
|
+
|
|
330
|
+
Parameters:
|
|
331
|
+
limit (int): Maximum number of rows to return (default: 100).
|
|
332
|
+
sample_keys (Optional[List[List[DbKey]]]): List of composite keys to filter rows.
|
|
333
|
+
Each inner list must match the number of key_columns.
|
|
334
|
+
|
|
335
|
+
Returns:
|
|
336
|
+
list: List of tuples containing the queried row data.
|
|
337
|
+
"""
|
|
338
|
+
select = self.make_select().select(*self._relevant_columns_repr)
|
|
293
339
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
340
|
+
filters = []
|
|
341
|
+
|
|
342
|
+
if sample_keys:
|
|
343
|
+
key_exprs = []
|
|
344
|
+
for key_values in sample_keys:
|
|
345
|
+
and_exprs = []
|
|
346
|
+
for col, val in safezip(self.key_columns, key_values):
|
|
347
|
+
quoted = self.database.dialect.quote(col)
|
|
348
|
+
schema = self._schema[col]
|
|
349
|
+
if val is None:
|
|
350
|
+
and_exprs.append(Code(quoted + " IS NULL"))
|
|
351
|
+
continue
|
|
352
|
+
mk_v = schema.make_value(val)
|
|
353
|
+
constant_val = self.database.dialect._constant_value(mk_v)
|
|
354
|
+
where_expr = f"{quoted} = {constant_val}"
|
|
355
|
+
and_exprs.append(Code(where_expr))
|
|
356
|
+
if and_exprs:
|
|
357
|
+
key_exprs.append(and_(*and_exprs))
|
|
358
|
+
if key_exprs:
|
|
359
|
+
filters.append(or_(*key_exprs))
|
|
360
|
+
if filters or self.where:
|
|
361
|
+
select = select.where(*filters)
|
|
300
362
|
else:
|
|
301
|
-
|
|
363
|
+
logger.warning("No filters applied; fetching up to {} rows without key restrictions", limit)
|
|
364
|
+
|
|
365
|
+
select = select.limit(limit)
|
|
302
366
|
|
|
303
367
|
start_time = time.monotonic()
|
|
304
368
|
result = self.database.query(select, List[Tuple])
|
data_diff/utils.py
CHANGED
|
@@ -482,6 +482,9 @@ def number_to_human(n):
|
|
|
482
482
|
|
|
483
483
|
|
|
484
484
|
def split_space(start, end, count) -> List[int]:
|
|
485
|
+
if isinstance(start, float) or isinstance(end, float):
|
|
486
|
+
step = (end - start) / (count + 1)
|
|
487
|
+
return [start + step * i for i in range(1, count + 1)]
|
|
485
488
|
size = end - start
|
|
486
489
|
assert count <= size, (count, size)
|
|
487
490
|
return list(range(start, end, (size + 1) // (count + 1)))[1 : count + 1]
|
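dcs_sdk/__version__.py
CHANGED
|
(the +1 -1 hunk for this file is not present in this listing)
|
dcs_sdk/sdk/data_diff/data_differ.py
CHANGED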
|
@@ -395,8 +395,10 @@ class DBTableDiffer:
|
|
|
395
395
|
error_message = f"Target table '{target_dataset.get('table_name')}' is empty"
|
|
396
396
|
is_table_empty = True
|
|
397
397
|
if not is_table_empty and not self.config.schema_diff:
|
|
398
|
+
pks_len = len(self.table1.key_columns)
|
|
398
399
|
table_1_sample_data = self.table1.with_schema().get_sample_data(limit=100)
|
|
399
|
-
|
|
400
|
+
sample_keys = [list(row[:pks_len]) for row in table_1_sample_data]
|
|
401
|
+
table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100, sample_keys=sample_keys)
|
|
400
402
|
self.diff_iter = diff_tables(
|
|
401
403
|
self.table1,
|
|
402
404
|
self.table2,
|
|
@@ -598,9 +600,14 @@ class DBTableDiffer:
|
|
|
598
600
|
columns_order_wise=columns_order_wise_target if columns_order_wise_target else [],
|
|
599
601
|
)
|
|
600
602
|
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
603
|
+
sample_value_column_names_src = list(self.table1.key_columns) + list(self.table1.extra_columns)
|
|
604
|
+
sample_value_column_names_tgt = list(self.table2.key_columns) + list(self.table2.extra_columns)
|
|
605
|
+
sample_value_source_dicts = [
|
|
606
|
+
dict(zip(sample_value_column_names_src, row)) for row in table_1_sample_data
|
|
607
|
+
]
|
|
608
|
+
sample_value_target_dicts = [
|
|
609
|
+
dict(zip(sample_value_column_names_tgt, row)) for row in table_2_sample_data
|
|
610
|
+
]
|
|
604
611
|
|
|
605
612
|
def get_pk(row, key_columns):
|
|
606
613
|
return tuple(row[k] for k in key_columns)
|
|
{dcs_sdk-1.4.8.dist-info → dcs_sdk-1.4.9.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: dcs-sdk
|
|
3
|
-
Version: 1.4.8
|
|
3
|
+
Version: 1.4.9
|
|
4
4
|
Summary: SDK for DataChecks
|
|
5
5
|
Author: Waterdip Labs
|
|
6
6
|
Author-email: hello@waterdip.ai
|
|
@@ -60,7 +60,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
|
|
|
60
60
|
Description-Content-Type: text/markdown
|
|
61
61
|
|
|
62
62
|
<h1 align="center">
|
|
63
|
-
DCS SDK v1.4.8
|
|
63
|
+
DCS SDK v1.4.9
|
|
64
64
|
</h1>
|
|
65
65
|
|
|
66
66
|
> SDK for DataChecks
|
|
{dcs_sdk-1.4.8.dist-info → dcs_sdk-1.4.9.dist-info}/RECORD
CHANGED
|
@@ -6,7 +6,7 @@ data_diff/abcs/database_types.py,sha256=dHE6K6UtqFwX6LTjOqJu1OOb_XJBXODeMcZQ0cds
|
|
|
6
6
|
data_diff/config.py,sha256=uRcoVVhPjVZqgQNwr18v6sPq06cGXDLemTUyitU57zA,4998
|
|
7
7
|
data_diff/databases/__init__.py,sha256=NrBm1Paj7jkHZ_hQCD-4-Q1eeDdh3v9_bz1DkPDOv9g,1680
|
|
8
8
|
data_diff/databases/_connect.py,sha256=nGsmtzDSKN8CK8zMkdcGZz0iExzkJDYw-PGebIkmQgc,11151
|
|
9
|
-
data_diff/databases/base.py,sha256=
|
|
9
|
+
data_diff/databases/base.py,sha256=Fd_OzWOL97YiTXE_oYsBthXVVLvJGbU5Y4JgR77Sra0,51904
|
|
10
10
|
data_diff/databases/bigquery.py,sha256=PDwSkmWRW26gUl2SMOyIsiYtgrqghptYqG8_SaaiXb4,14709
|
|
11
11
|
data_diff/databases/clickhouse.py,sha256=5DsW8UpyYsWI8I3AlPUvHWYdWdWpqzRsvsVwgqEyaLw,7554
|
|
12
12
|
data_diff/databases/databricks.py,sha256=6cdwfAspg1GIgWFlQByvFcW_Hz1mnJeI9_2kZhOP8b4,9334
|
|
@@ -24,7 +24,7 @@ data_diff/databases/vertica.py,sha256=2dSDZp6qOEvUVPldI5Tgn7Sm3dCpC3vNXJL3qb3FDv
|
|
|
24
24
|
data_diff/diff_tables.py,sha256=Ey88gUr9Wh8UVsgRlBCY3CACIYfHL52PxJSrd821aqg,20060
|
|
25
25
|
data_diff/errors.py,sha256=4Yru8yOwyuDuBlTABnGCvJMSpe6-rbLJpNnVHeTTyHU,745
|
|
26
26
|
data_diff/format.py,sha256=QFDjdZaBVf_N-jfKiX4ppOUdpXTPZXmv1j0pc1RiOoc,10245
|
|
27
|
-
data_diff/hashdiff_tables.py,sha256=
|
|
27
|
+
data_diff/hashdiff_tables.py,sha256=K-JoMimAwACB3mqQLIdv5P5joYWyRMSZKcbDJWz5dlk,27993
|
|
28
28
|
data_diff/info_tree.py,sha256=yHtFSoXuu6oBafLYOYQjUSKlB-DnAAd08U9HOEAdTPI,2799
|
|
29
29
|
data_diff/joindiff_tables.py,sha256=fyrEYjyh2BX1vGibwVZLYM1V6JJTOY-uGXY-KInvMkw,17612
|
|
30
30
|
data_diff/lexicographic_space.py,sha256=bBoCbbH1Mla9jNOq1b5RuwjAxSVU7gWkra673tPBwXQ,8305
|
|
@@ -37,20 +37,20 @@ data_diff/queries/base.py,sha256=pT-iaII7Nlu-w-Cuq9fhoNKX7-GSxkQ3Fk8K-tMkk60,964
|
|
|
37
37
|
data_diff/queries/extras.py,sha256=aUm-ifj3BMlz4o4bbuHtmnvHZuptYAKGS5yWTHmNpvc,1270
|
|
38
38
|
data_diff/query_utils.py,sha256=R7ZfRwcvv9Zf4zWXNln4tr_OxLmDI7CPmmCahYfHxlo,2101
|
|
39
39
|
data_diff/schema.py,sha256=QoYSSB3k-svLXz680uRgsI4qjii8BFKOOQvheqtgEbs,2413
|
|
40
|
-
data_diff/table_segment.py,sha256
|
|
40
|
+
data_diff/table_segment.py,sha256=-zYdppv8lW8ZfDRzZvmARm1fYxGHhAYPurR0r0GmWh8,21350
|
|
41
41
|
data_diff/thread_utils.py,sha256=_692ERjnWfHKaZsLdg7CNfkKiRd66y7_kpgDwzntp44,3831
|
|
42
|
-
data_diff/utils.py,sha256=
|
|
42
|
+
data_diff/utils.py,sha256=IHKaFfyizOiA1By0sfnTra7CJvgEDwr5KPGjev0kx0A,23328
|
|
43
43
|
data_diff/version.py,sha256=Wk0ovyBlLEF2UaWLWEcVBLFElREtIxi7TU1hD3CuTFI,634
|
|
44
44
|
dcs_sdk/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
45
45
|
dcs_sdk/__main__.py,sha256=Qn8stIaQGrdLjHQ-H7xO0T-brtq5RWZoWU9QvqoarV8,683
|
|
46
|
-
dcs_sdk/__version__.py,sha256=
|
|
46
|
+
dcs_sdk/__version__.py,sha256=lS23jYuOvSiv7SJ1XHD4d1O93eassDRAcs2EuasDZ-0,633
|
|
47
47
|
dcs_sdk/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
48
48
|
dcs_sdk/cli/cli.py,sha256=LyrRk972OL9pTqrvBeXWBu5rUDAN17lQ1g8FdSRW_8M,4299
|
|
49
49
|
dcs_sdk/sdk/__init__.py,sha256=skrZcgWWJBL6NXTUERywJ3qRJRemgpDXyW7lPg1FJk8,2107
|
|
50
50
|
dcs_sdk/sdk/config/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
51
51
|
dcs_sdk/sdk/config/config_loader.py,sha256=Sg0gvleGrOcDaX41amgw6Uj6c_s4Zel6gQalIp9CltI,21416
|
|
52
52
|
dcs_sdk/sdk/data_diff/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
|
|
53
|
-
dcs_sdk/sdk/data_diff/data_differ.py,sha256=
|
|
53
|
+
dcs_sdk/sdk/data_diff/data_differ.py,sha256=8WBDz-q-Li5OyJcG_a6JRnDPCJBvIsMki-DrD4EgDzQ,35575
|
|
54
54
|
dcs_sdk/sdk/rules/__init__.py,sha256=_BkKcE_jfdDQI_ECdOamJaefMKEXrKpYjPpnBQXl_Xs,657
|
|
55
55
|
dcs_sdk/sdk/rules/rules_mappping.py,sha256=fxakVkf7B2cVkYSO946LTim_HmMsl6lBDBqZjTTsSPI,1292
|
|
56
56
|
dcs_sdk/sdk/rules/rules_repository.py,sha256=x0Rli-wdnHAmXm5526go_qC3P-eFRt-4L7fs4hNqC-g,7564
|
|
@@ -65,7 +65,7 @@ dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py,sha256=puAWP
|
|
|
65
65
|
dcs_sdk/sdk/utils/table.py,sha256=mXArkEYmxf7mz_DJvyHVHIN9RyxaF708I70n52JrptA,18268
|
|
66
66
|
dcs_sdk/sdk/utils/themes.py,sha256=Meo2Yldv4uyPpEqI7qdA28Aa6sxtwUU1dLKKm4QavjM,1403
|
|
67
67
|
dcs_sdk/sdk/utils/utils.py,sha256=vF2zAvgt__Y8limicWTEWRyn41SBVJN81ZCTBRy6hQg,11907
|
|
68
|
-
dcs_sdk-1.4.
|
|
69
|
-
dcs_sdk-1.4.
|
|
70
|
-
dcs_sdk-1.4.
|
|
71
|
-
dcs_sdk-1.4.
|
|
68
|
+
dcs_sdk-1.4.9.dist-info/METADATA,sha256=JUBldmhqJC8gu9G2eftTjkcf8rgWsPgsPQkk5tlkHWw,6221
|
|
69
|
+
dcs_sdk-1.4.9.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
|
70
|
+
dcs_sdk-1.4.9.dist-info/entry_points.txt,sha256=zQtrZL7YuaKtt6WPwihCTV1BRXnqBkaY6zUGdYJbBSg,49
|
|
71
|
+
dcs_sdk-1.4.9.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|