dcs-sdk 1.4.7__tar.gz → 1.4.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/PKG-INFO +2 -2
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/README.md +1 -1
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/abcs/database_types.py +1 -1
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/base.py +3 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/mssql.py +16 -8
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/sybase.py +29 -24
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/hashdiff_tables.py +1 -1
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/table_segment.py +75 -54
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/utils.py +3 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/__version__.py +1 -1
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/data_diff/data_differ.py +11 -4
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/pyproject.toml +1 -1
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/__main__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/abcs/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/abcs/compiler.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/config.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/_connect.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/bigquery.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/clickhouse.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/databricks.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/duckdb.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/mysql.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/oracle.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/postgresql.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/presto.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/redshift.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/snowflake.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/trino.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/vertica.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/diff_tables.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/errors.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/format.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/info_tree.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/joindiff_tables.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/lexicographic_space.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/parse_time.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/py.typed +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/queries/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/queries/api.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/queries/ast_classes.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/queries/base.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/queries/extras.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/query_utils.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/schema.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/thread_utils.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/version.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/__main__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/cli/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/cli/cli.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/config/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/config/config_loader.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/data_diff/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/rules_mappping.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/rules_repository.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/schema_rules.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/serializer.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/__init__.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/base_provider.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/table.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/themes.py +0 -0
- {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: dcs-sdk
|
|
3
|
-
Version: 1.4.7
|
|
3
|
+
Version: 1.4.9
|
|
4
4
|
Summary: SDK for DataChecks
|
|
5
5
|
Author: Waterdip Labs
|
|
6
6
|
Author-email: hello@waterdip.ai
|
|
@@ -60,7 +60,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
|
|
|
60
60
|
Description-Content-Type: text/markdown
|
|
61
61
|
|
|
62
62
|
<h1 align="center">
|
|
63
|
-
DCS SDK v1.4.7
|
|
63
|
+
DCS SDK v1.4.9
|
|
64
64
|
</h1>
|
|
65
65
|
|
|
66
66
|
> SDK for DataChecks
|
|
@@ -22,7 +22,7 @@ import attrs
|
|
|
22
22
|
from data_diff.utils import ArithAlphanumeric, ArithUnicodeString, ArithUUID, Unknown
|
|
23
23
|
|
|
24
24
|
DbPath = Tuple[str, ...]
|
|
25
|
-
DbKey = Union[int, str, bytes, ArithUUID, ArithAlphanumeric]
|
|
25
|
+
DbKey = Union[int, str, bytes, ArithUUID, ArithAlphanumeric, ArithUnicodeString]
|
|
26
26
|
DbTime = datetime
|
|
27
27
|
|
|
28
28
|
N = TypeVar("N")
|
|
@@ -54,6 +54,7 @@ from typing_extensions import Self
|
|
|
54
54
|
from data_diff.abcs.compiler import AbstractCompiler, Compilable
|
|
55
55
|
from data_diff.abcs.database_types import (
|
|
56
56
|
JSON,
|
|
57
|
+
ArithAlphanumeric,
|
|
57
58
|
Array,
|
|
58
59
|
Boolean,
|
|
59
60
|
ColType,
|
|
@@ -753,6 +754,8 @@ class BaseDialect(abc.ABC):
|
|
|
753
754
|
return f"'{v.decode()}'"
|
|
754
755
|
elif isinstance(v, Code):
|
|
755
756
|
return v.code
|
|
757
|
+
elif isinstance(v, ArithAlphanumeric):
|
|
758
|
+
return f"'{v._str}'"
|
|
756
759
|
return repr(v)
|
|
757
760
|
|
|
758
761
|
def constant_values(self, rows) -> str:
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import re
|
|
15
16
|
from typing import Any, ClassVar, Dict, Optional, Type
|
|
16
17
|
|
|
17
18
|
import attrs
|
|
@@ -118,15 +119,16 @@ class Dialect(BaseDialect):
|
|
|
118
119
|
WHERE name = CURRENT_USER"""
|
|
119
120
|
|
|
120
121
|
def to_string(self, s: str) -> str:
|
|
121
|
-
# Both convert(varchar(max), …) and convert(text, …) do work.
|
|
122
|
-
import re
|
|
123
|
-
|
|
124
122
|
s_temp = re.sub(r'["\[\]`]', "", s)
|
|
125
123
|
col_info = self.get_column_raw_info(s_temp)
|
|
124
|
+
ch_len = (col_info and col_info.character_maximum_length) or None
|
|
125
|
+
if not ch_len:
|
|
126
|
+
ch_len = 2500
|
|
127
|
+
ch_len = max(ch_len, 2500)
|
|
126
128
|
if col_info and col_info.data_type in ["nvarchar", "nchar", "ntext"]:
|
|
127
|
-
return f"CONVERT(NVARCHAR(
|
|
129
|
+
return f"CONVERT(NVARCHAR({ch_len}), {s})"
|
|
128
130
|
|
|
129
|
-
return f"CONVERT(VARCHAR(
|
|
131
|
+
return f"CONVERT(VARCHAR({ch_len}), {s})"
|
|
130
132
|
|
|
131
133
|
def type_repr(self, t) -> str:
|
|
132
134
|
try:
|
|
@@ -165,9 +167,9 @@ class Dialect(BaseDialect):
|
|
|
165
167
|
|
|
166
168
|
# select_query = re.sub(r"TRIM\(([\w]+)\)", r"TRIM(CAST(\1 AS NVARCHAR(MAX)))", select_query)
|
|
167
169
|
|
|
168
|
-
select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", r"LTRIM(RTRIM(CAST([\1] AS VARCHAR(8000))))", select_query)
|
|
170
|
+
# select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", r"LTRIM(RTRIM(CAST([\1] AS VARCHAR(8000))))", select_query)
|
|
169
171
|
|
|
170
|
-
select_query = re.sub(r"TRIM\(([\w]+)\)", r"LTRIM(RTRIM(CAST(\1 AS VARCHAR(8000))))", select_query)
|
|
172
|
+
# select_query = re.sub(r"TRIM\(([\w]+)\)", r"LTRIM(RTRIM(CAST(\1 AS VARCHAR(8000))))", select_query)
|
|
171
173
|
|
|
172
174
|
return f"{select_query} {result}"
|
|
173
175
|
|
|
@@ -206,8 +208,14 @@ class Dialect(BaseDialect):
|
|
|
206
208
|
return tuple(name.split("."))
|
|
207
209
|
|
|
208
210
|
def normalize_uuid(self, value, coltype):
|
|
211
|
+
s_temp = re.sub(r'["\[\]`]', "", value)
|
|
212
|
+
col_info = self.get_column_raw_info(s_temp)
|
|
213
|
+
ch_len = (col_info and col_info.character_maximum_length) or None
|
|
214
|
+
if not ch_len:
|
|
215
|
+
ch_len = 2500
|
|
216
|
+
ch_len = max(ch_len, 2500)
|
|
209
217
|
if isinstance(coltype, String_UUID):
|
|
210
|
-
return f"CAST({value} AS VARCHAR(
|
|
218
|
+
return f"CAST({value} AS VARCHAR({ch_len}))"
|
|
211
219
|
return f"CAST({value} AS VARCHAR(36))"
|
|
212
220
|
|
|
213
221
|
|
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
|
|
15
|
+
import re
|
|
15
16
|
import time
|
|
16
17
|
from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type
|
|
17
18
|
|
|
@@ -140,17 +141,17 @@ class Dialect(BaseDialect):
|
|
|
140
141
|
WHERE name = CURRENT_USER"""
|
|
141
142
|
|
|
142
143
|
def to_string(self, s: str, coltype: str = None) -> str:
|
|
143
|
-
if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
|
|
144
|
-
# Sybase IQ or FreeTDS detected as IQ: Use VARCHAR(2500)
|
|
145
|
-
return f"CAST({s} AS VARCHAR(2500))"
|
|
146
|
-
# Sybase ASE or FreeTDS detected as ASE: Handle nvarchar
|
|
147
|
-
import re
|
|
148
|
-
|
|
149
144
|
s_temp = re.sub(r'["\[\]`]', "", s)
|
|
150
145
|
raw_col_info = self.get_column_raw_info(s_temp)
|
|
146
|
+
ch_len = (raw_col_info and raw_col_info.character_maximum_length) or None
|
|
147
|
+
if not ch_len:
|
|
148
|
+
ch_len = 2500
|
|
149
|
+
ch_len = max(ch_len, 2500)
|
|
150
|
+
if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
|
|
151
|
+
return f"CAST({s} AS VARCHAR({ch_len}))"
|
|
151
152
|
if raw_col_info and raw_col_info.data_type in ["nvarchar", "nchar", "ntext"]:
|
|
152
|
-
return f"CAST({s} AS NVARCHAR(
|
|
153
|
-
return f"CAST({s} AS VARCHAR(
|
|
153
|
+
return f"CAST({s} AS NVARCHAR({ch_len}))"
|
|
154
|
+
return f"CAST({s} AS VARCHAR({ch_len}))"
|
|
154
155
|
|
|
155
156
|
def type_repr(self, t) -> str:
|
|
156
157
|
try:
|
|
@@ -173,17 +174,15 @@ class Dialect(BaseDialect):
|
|
|
173
174
|
limit: Optional[int] = None,
|
|
174
175
|
has_order_by: Optional[bool] = None,
|
|
175
176
|
) -> str:
|
|
176
|
-
import re
|
|
177
|
+
# import re
|
|
177
178
|
|
|
178
|
-
def safe_trim(match):
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
select_query = re.sub(r"TRIM\(
|
|
185
|
-
|
|
186
|
-
select_query = re.sub(r"TRIM\(([\w]+)\)", safe_trim, select_query)
|
|
179
|
+
# def safe_trim(match):
|
|
180
|
+
# column_name = match.group(1)
|
|
181
|
+
# if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
|
|
182
|
+
# return f"TRIM(CAST({column_name} AS VARCHAR(2500)))"
|
|
183
|
+
# return f"TRIM(CAST({column_name} AS NVARCHAR(5000)))"
|
|
184
|
+
# select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", safe_trim, select_query)
|
|
185
|
+
# select_query = re.sub(r"TRIM\(([\w]+)\)", safe_trim, select_query)
|
|
187
186
|
|
|
188
187
|
if limit is not None:
|
|
189
188
|
select_query = select_query.replace("SELECT", f"SELECT TOP {limit}", 1)
|
|
@@ -225,8 +224,8 @@ class Dialect(BaseDialect):
|
|
|
225
224
|
f"END"
|
|
226
225
|
)
|
|
227
226
|
if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
|
|
228
|
-
return f"CAST({value} AS VARCHAR(
|
|
229
|
-
return f"CAST({value} AS NVARCHAR(
|
|
227
|
+
return f"CAST({value} AS VARCHAR(100))"
|
|
228
|
+
return f"CAST({value} AS NVARCHAR(100))"
|
|
230
229
|
|
|
231
230
|
def normalize_number(self, value: str, coltype: FractionalType) -> str:
|
|
232
231
|
return self.to_string(f"CAST({value} AS DECIMAL(38, {coltype.precision}))")
|
|
@@ -326,13 +325,19 @@ class Dialect(BaseDialect):
|
|
|
326
325
|
return " || ".join(items)
|
|
327
326
|
|
|
328
327
|
def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
|
|
328
|
+
s_temp = re.sub(r'["\[\]`]', "", value)
|
|
329
|
+
raw_col_info = self.get_column_raw_info(s_temp)
|
|
330
|
+
ch_len = (raw_col_info and raw_col_info.character_maximum_length) or None
|
|
331
|
+
if not ch_len:
|
|
332
|
+
ch_len = 2500
|
|
333
|
+
ch_len = max(ch_len, 2500)
|
|
329
334
|
if isinstance(coltype, String_UUID):
|
|
330
335
|
if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
|
|
331
|
-
return f"CAST({value} AS VARCHAR(
|
|
332
|
-
return f"CAST({value} AS NVARCHAR(
|
|
336
|
+
return f"CAST({value} AS VARCHAR({ch_len}))" # IQ: Match column length
|
|
337
|
+
return f"CAST({value} AS NVARCHAR({ch_len}))" # ASE: Match column length
|
|
333
338
|
if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
|
|
334
|
-
return f"CONVERT(VARCHAR(
|
|
335
|
-
return f"CONVERT(NVARCHAR(
|
|
339
|
+
return f"CONVERT(VARCHAR({ch_len}), {value})"
|
|
340
|
+
return f"CONVERT(NVARCHAR({ch_len}), {value})"
|
|
336
341
|
|
|
337
342
|
def parse_table_name(self, name: str) -> DbPath:
|
|
338
343
|
"Parse the given table name into a DbPath"
|
|
@@ -522,7 +522,7 @@ class HashDiffer(TableDiffer):
|
|
|
522
522
|
|
|
523
523
|
if count1 == 0 and count2 == 0:
|
|
524
524
|
logger.debug(
|
|
525
|
-
"Uneven distribution of keys detected in segment
|
|
525
|
+
"Uneven distribution of keys detected in segment {}..{} (big gaps in the key column). "
|
|
526
526
|
"For better performance, we recommend to increase the bisection-threshold.",
|
|
527
527
|
table1.min_key,
|
|
528
528
|
table1.max_key,
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
|
|
15
15
|
import logging
|
|
16
16
|
import time
|
|
17
|
+
from decimal import Decimal
|
|
17
18
|
from itertools import product
|
|
18
19
|
from typing import Container, Dict, List, Optional, Sequence, Tuple
|
|
19
20
|
|
|
@@ -24,7 +25,18 @@ from typing_extensions import Self
|
|
|
24
25
|
|
|
25
26
|
from data_diff.abcs.database_types import DbKey, DbPath, DbTime, IKey, NumericType
|
|
26
27
|
from data_diff.databases.base import Database
|
|
27
|
-
from data_diff.queries.api import
|
|
28
|
+
from data_diff.queries.api import (
|
|
29
|
+
SKIP,
|
|
30
|
+
Code,
|
|
31
|
+
Count,
|
|
32
|
+
Expr,
|
|
33
|
+
and_,
|
|
34
|
+
max_,
|
|
35
|
+
min_,
|
|
36
|
+
or_,
|
|
37
|
+
table,
|
|
38
|
+
this,
|
|
39
|
+
)
|
|
28
40
|
from data_diff.queries.extras import (
|
|
29
41
|
ApplyFuncAndNormalizeAsString,
|
|
30
42
|
Checksum,
|
|
@@ -54,6 +66,10 @@ def split_key_space(min_key: DbKey, max_key: DbKey, count: int) -> List[DbKey]:
|
|
|
54
66
|
assert type(min_key) is type(max_key)
|
|
55
67
|
checkpoints = min_key.range(max_key, count)
|
|
56
68
|
else:
|
|
69
|
+
if isinstance(min_key, Decimal):
|
|
70
|
+
min_key = float(min_key)
|
|
71
|
+
if isinstance(max_key, Decimal):
|
|
72
|
+
max_key = float(max_key)
|
|
57
73
|
checkpoints = split_space(min_key, max_key, count)
|
|
58
74
|
|
|
59
75
|
assert all(min_key < x < max_key for x in checkpoints)
|
|
@@ -288,17 +304,65 @@ class TableSegment:
|
|
|
288
304
|
|
|
289
305
|
return result
|
|
290
306
|
|
|
291
|
-
def get_sample_data(self, limit: int = 100) -> list:
|
|
292
|
-
|
|
307
|
+
# def get_sample_data(self, limit: int = 100) -> list:
|
|
308
|
+
# "Download all the relevant values of the segment from the database"
|
|
309
|
+
|
|
310
|
+
# exprs = []
|
|
311
|
+
# for c in self.key_columns:
|
|
312
|
+
# quoted = self.database.dialect.quote(c)
|
|
313
|
+
# exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
|
|
314
|
+
# if self.where:
|
|
315
|
+
# select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
|
|
316
|
+
# self.key_columns
|
|
317
|
+
# else:
|
|
318
|
+
# select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
|
|
319
|
+
|
|
320
|
+
# start_time = time.monotonic()
|
|
321
|
+
# result = self.database.query(select, List[Tuple])
|
|
322
|
+
# query_time_ms = (time.monotonic() - start_time) * 1000
|
|
323
|
+
# self._update_stats("row_fetch_queries_stats", query_time_ms)
|
|
324
|
+
|
|
325
|
+
def get_sample_data(self, limit: int = 100, sample_keys: Optional[List[List[DbKey]]] = None) -> list:
|
|
326
|
+
"""
|
|
327
|
+
Download relevant values of the segment from the database.
|
|
328
|
+
If `sample_keys` is provided, it filters rows matching those composite keys.
|
|
329
|
+
|
|
330
|
+
Parameters:
|
|
331
|
+
limit (int): Maximum number of rows to return (default: 100).
|
|
332
|
+
sample_keys (Optional[List[List[DbKey]]]): List of composite keys to filter rows.
|
|
333
|
+
Each inner list must match the number of key_columns.
|
|
334
|
+
|
|
335
|
+
Returns:
|
|
336
|
+
list: List of tuples containing the queried row data.
|
|
337
|
+
"""
|
|
338
|
+
select = self.make_select().select(*self._relevant_columns_repr)
|
|
293
339
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
340
|
+
filters = []
|
|
341
|
+
|
|
342
|
+
if sample_keys:
|
|
343
|
+
key_exprs = []
|
|
344
|
+
for key_values in sample_keys:
|
|
345
|
+
and_exprs = []
|
|
346
|
+
for col, val in safezip(self.key_columns, key_values):
|
|
347
|
+
quoted = self.database.dialect.quote(col)
|
|
348
|
+
schema = self._schema[col]
|
|
349
|
+
if val is None:
|
|
350
|
+
and_exprs.append(Code(quoted + " IS NULL"))
|
|
351
|
+
continue
|
|
352
|
+
mk_v = schema.make_value(val)
|
|
353
|
+
constant_val = self.database.dialect._constant_value(mk_v)
|
|
354
|
+
where_expr = f"{quoted} = {constant_val}"
|
|
355
|
+
and_exprs.append(Code(where_expr))
|
|
356
|
+
if and_exprs:
|
|
357
|
+
key_exprs.append(and_(*and_exprs))
|
|
358
|
+
if key_exprs:
|
|
359
|
+
filters.append(or_(*key_exprs))
|
|
360
|
+
if filters or self.where:
|
|
361
|
+
select = select.where(*filters)
|
|
300
362
|
else:
|
|
301
|
-
|
|
363
|
+
logger.warning("No filters applied; fetching up to {} rows without key restrictions", limit)
|
|
364
|
+
|
|
365
|
+
select = select.limit(limit)
|
|
302
366
|
|
|
303
367
|
start_time = time.monotonic()
|
|
304
368
|
result = self.database.query(select, List[Tuple])
|
|
@@ -317,52 +381,9 @@ class TableSegment:
|
|
|
317
381
|
|
|
318
382
|
return split_compound_key_space(self.min_key, self.max_key, count)
|
|
319
383
|
|
|
320
|
-
def choose_checkpoints(self, count: int) -> List[Tuple[DbKey]]:
|
|
321
|
-
"""Returns count evenly spaced checkpoints (total segments ~= count), works for multi-key."""
|
|
322
|
-
assert self.is_bounded, "Cannot split unbounded key space"
|
|
323
|
-
|
|
324
|
-
if count < 1:
|
|
325
|
-
return [self.min_key, self.max_key]
|
|
326
|
-
|
|
327
|
-
# Check if all keys are ArithString (includes ArithAlphanumeric)
|
|
328
|
-
if all(isinstance(k, (ArithString, ArithUnicodeString)) for k in self.min_key) and all(
|
|
329
|
-
isinstance(k, (ArithString, ArithUnicodeString)) for k in self.max_key
|
|
330
|
-
):
|
|
331
|
-
# Use split_key_space for each key dimension
|
|
332
|
-
checkpoints_per_dim = [split_key_space(mn, mx, count) for mn, mx in safezip(self.min_key, self.max_key)]
|
|
333
|
-
# Create a mesh of checkpoints using create_mesh_from_points
|
|
334
|
-
return [tuple(start) for start, _ in create_mesh_from_points(*checkpoints_per_dim)]
|
|
335
|
-
else:
|
|
336
|
-
# Fallback to numeric interpolation for non-ArithString keys
|
|
337
|
-
def interpolate_key(fraction: float) -> Tuple[DbKey, ...]:
|
|
338
|
-
return tuple(int(mn + (mx - mn) * fraction) for mn, mx in zip(self.min_key, self.max_key))
|
|
339
|
-
|
|
340
|
-
return [interpolate_key(i / count) for i in range(count + 1)]
|
|
341
|
-
|
|
342
|
-
# def choose_checkpoints(
|
|
343
|
-
# self, max_key_range_per_segment: int = 1_000_000, total_rows: Optional[int] = None
|
|
344
|
-
# ) -> List[List[DbKey]]:
|
|
345
|
-
# """Suggests checkpoints to split by, including start and end.
|
|
346
|
-
|
|
347
|
-
# Uses linear interpolation across the entire compound key space to ensure segment
|
|
348
|
-
# sizes remain under `max_segment_size`, even for multi-column primary keys.
|
|
349
|
-
# """
|
|
350
|
-
# key_range = self.max_key[0] - self.min_key[0]
|
|
351
|
-
# segment_count = max(1, key_range // max_key_range_per_segment)
|
|
352
|
-
# segment_count = min(segment_count, 500) # Cap it for safety
|
|
353
|
-
|
|
354
|
-
# def interpolate_key(fraction: float) -> Tuple[DbKey, ...]:
|
|
355
|
-
# return tuple(int(mn + (mx - mn) * fraction) for mn, mx in zip(self.min_key, self.max_key))
|
|
356
|
-
|
|
357
|
-
# return [interpolate_key(i / segment_count) for i in range(segment_count + 1)]
|
|
358
|
-
|
|
359
384
|
def segment_by_checkpoints(self, checkpoints: List[List[DbKey]]) -> List["TableSegment"]:
|
|
360
385
|
"Split the current TableSegment to a bunch of smaller ones, separated by the given checkpoints"
|
|
361
|
-
|
|
362
|
-
# return [self.new_key_bounds(min_key=s, max_key=e) for s, e in create_mesh_from_points(*checkpoints)]
|
|
363
|
-
return [
|
|
364
|
-
self.new_key_bounds(min_key=start, max_key=end) for start, end in zip(checkpoints[:-1], checkpoints[1:])
|
|
365
|
-
]
|
|
386
|
+
return [self.new_key_bounds(min_key=s, max_key=e) for s, e in create_mesh_from_points(*checkpoints)]
|
|
366
387
|
|
|
367
388
|
def new(self, **kwargs) -> Self:
|
|
368
389
|
"""Creates a copy of the instance using 'replace()'"""
|
|
@@ -482,6 +482,9 @@ def number_to_human(n):
|
|
|
482
482
|
|
|
483
483
|
|
|
484
484
|
def split_space(start, end, count) -> List[int]:
|
|
485
|
+
if isinstance(start, float) or isinstance(end, float):
|
|
486
|
+
step = (end - start) / (count + 1)
|
|
487
|
+
return [start + step * i for i in range(1, count + 1)]
|
|
485
488
|
size = end - start
|
|
486
489
|
assert count <= size, (count, size)
|
|
487
490
|
return list(range(start, end, (size + 1) // (count + 1)))[1 : count + 1]
|
|
@@ -395,8 +395,10 @@ class DBTableDiffer:
|
|
|
395
395
|
error_message = f"Target table '{target_dataset.get('table_name')}' is empty"
|
|
396
396
|
is_table_empty = True
|
|
397
397
|
if not is_table_empty and not self.config.schema_diff:
|
|
398
|
+
pks_len = len(self.table1.key_columns)
|
|
398
399
|
table_1_sample_data = self.table1.with_schema().get_sample_data(limit=100)
|
|
399
|
-
|
|
400
|
+
sample_keys = [list(row[:pks_len]) for row in table_1_sample_data]
|
|
401
|
+
table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100, sample_keys=sample_keys)
|
|
400
402
|
self.diff_iter = diff_tables(
|
|
401
403
|
self.table1,
|
|
402
404
|
self.table2,
|
|
@@ -598,9 +600,14 @@ class DBTableDiffer:
|
|
|
598
600
|
columns_order_wise=columns_order_wise_target if columns_order_wise_target else [],
|
|
599
601
|
)
|
|
600
602
|
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
603
|
+
sample_value_column_names_src = list(self.table1.key_columns) + list(self.table1.extra_columns)
|
|
604
|
+
sample_value_column_names_tgt = list(self.table2.key_columns) + list(self.table2.extra_columns)
|
|
605
|
+
sample_value_source_dicts = [
|
|
606
|
+
dict(zip(sample_value_column_names_src, row)) for row in table_1_sample_data
|
|
607
|
+
]
|
|
608
|
+
sample_value_target_dicts = [
|
|
609
|
+
dict(zip(sample_value_column_names_tgt, row)) for row in table_2_sample_data
|
|
610
|
+
]
|
|
604
611
|
|
|
605
612
|
def get_pk(row, key_columns):
|
|
606
613
|
return tuple(row[k] for k in key_columns)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py
RENAMED
|
File without changes
|
|
File without changes
|
{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|