PyPI - dcs-sdk - Versions diffs - 1.4.7__tar.gz → 1.4.9__tar.gz - Mend

dcs-sdk 1.4.7 (tar.gz) → 1.4.9 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: dcs-sdk
-Version: 1.4.7
+Version: 1.4.9
 Summary: SDK for DataChecks
 Author: Waterdip Labs
 Author-email: hello@waterdip.ai
@@ -60,7 +60,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
 Description-Content-Type: text/markdown
 <h1 align="center">
-  DCS SDK v1.4.7
+  DCS SDK v1.4.9
 </h1>
 > SDK for DataChecks

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/README.md RENAMED Viewed

@@ -1,5 +1,5 @@
 <h1 align="center">
-  DCS SDK v1.4.7
+  DCS SDK v1.4.9
 </h1>
 > SDK for DataChecks

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/abcs/database_types.py RENAMED Viewed

@@ -22,7 +22,7 @@ import attrs
 from data_diff.utils import ArithAlphanumeric, ArithUnicodeString, ArithUUID, Unknown
 DbPath = Tuple[str, ...]
-DbKey = Union[int, str, bytes, ArithUUID, ArithAlphanumeric]
+DbKey = Union[int, str, bytes, ArithUUID, ArithAlphanumeric, ArithUnicodeString]
 DbTime = datetime
 N = TypeVar("N")

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/base.py RENAMED Viewed

@@ -54,6 +54,7 @@ from typing_extensions import Self
 from data_diff.abcs.compiler import AbstractCompiler, Compilable
 from data_diff.abcs.database_types import (
     JSON,
+    ArithAlphanumeric,
     Array,
     Boolean,
     ColType,
@@ -753,6 +754,8 @@ class BaseDialect(abc.ABC):
             return f"'{v.decode()}'"
         elif isinstance(v, Code):
             return v.code
+        elif isinstance(v, ArithAlphanumeric):
+            return f"'{v._str}'"
         return repr(v)
     def constant_values(self, rows) -> str:

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/mssql.py RENAMED Viewed

@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import re
 from typing import Any, ClassVar, Dict, Optional, Type
 import attrs
@@ -118,15 +119,16 @@ class Dialect(BaseDialect):
         WHERE name = CURRENT_USER"""
     def to_string(self, s: str) -> str:
-        # Both convert(varchar(max), …) and convert(text, …) do work.
-        import re
         s_temp = re.sub(r'["\[\]`]', "", s)
         col_info = self.get_column_raw_info(s_temp)
+        ch_len = (col_info and col_info.character_maximum_length) or None
+        if not ch_len:
+            ch_len = 2500
+        ch_len = max(ch_len, 2500)
         if col_info and col_info.data_type in ["nvarchar", "nchar", "ntext"]:
-            return f"CONVERT(NVARCHAR(MAX), {s})"
+            return f"CONVERT(NVARCHAR({ch_len}), {s})"
-        return f"CONVERT(VARCHAR(MAX), {s})"
+        return f"CONVERT(VARCHAR({ch_len}), {s})"
     def type_repr(self, t) -> str:
         try:
@@ -165,9 +167,9 @@ class Dialect(BaseDialect):
         # select_query = re.sub(r"TRIM\(([\w]+)\)", r"TRIM(CAST(\1 AS NVARCHAR(MAX)))", select_query)
-        select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", r"LTRIM(RTRIM(CAST([\1] AS VARCHAR(8000))))", select_query)
+        # select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", r"LTRIM(RTRIM(CAST([\1] AS VARCHAR(8000))))", select_query)
-        select_query = re.sub(r"TRIM\(([\w]+)\)", r"LTRIM(RTRIM(CAST(\1 AS VARCHAR(8000))))", select_query)
+        # select_query = re.sub(r"TRIM\(([\w]+)\)", r"LTRIM(RTRIM(CAST(\1 AS VARCHAR(8000))))", select_query)
         return f"{select_query} {result}"
@@ -206,8 +208,14 @@ class Dialect(BaseDialect):
         return tuple(name.split("."))
     def normalize_uuid(self, value, coltype):
+        s_temp = re.sub(r'["\[\]`]', "", value)
+        col_info = self.get_column_raw_info(s_temp)
+        ch_len = (col_info and col_info.character_maximum_length) or None
+        if not ch_len:
+            ch_len = 2500
+        ch_len = max(ch_len, 2500)
         if isinstance(coltype, String_UUID):
-            return f"CAST({value} AS VARCHAR(MAX))"
+            return f"CAST({value} AS VARCHAR({ch_len}))"
         return f"CAST({value} AS VARCHAR(36))"

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/sybase.py RENAMED Viewed

@@ -12,6 +12,7 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import re
 import time
 from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type
@@ -140,17 +141,17 @@ class Dialect(BaseDialect):
         WHERE name = CURRENT_USER"""
     def to_string(self, s: str, coltype: str = None) -> str:
-        if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
-            # Sybase IQ or FreeTDS detected as IQ: Use VARCHAR(2500)
-            return f"CAST({s} AS VARCHAR(2500))"
-        # Sybase ASE or FreeTDS detected as ASE: Handle nvarchar
-        import re
         s_temp = re.sub(r'["\[\]`]', "", s)
         raw_col_info = self.get_column_raw_info(s_temp)
+        ch_len = (raw_col_info and raw_col_info.character_maximum_length) or None
+        if not ch_len:
+            ch_len = 2500
+        ch_len = max(ch_len, 2500)
+        if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
+            return f"CAST({s} AS VARCHAR({ch_len}))"
         if raw_col_info and raw_col_info.data_type in ["nvarchar", "nchar", "ntext"]:
-            return f"CAST({s} AS NVARCHAR(5000))"  # ASE max for nvarchar
-        return f"CAST({s} AS VARCHAR(2500))"
+            return f"CAST({s} AS NVARCHAR({ch_len}))"
+        return f"CAST({s} AS VARCHAR({ch_len}))"
     def type_repr(self, t) -> str:
         try:
@@ -173,17 +174,15 @@ class Dialect(BaseDialect):
         limit: Optional[int] = None,
         has_order_by: Optional[bool] = None,
     ) -> str:
-        import re
+        # import re
-        def safe_trim(match):
-            column_name = match.group(1)
-            if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
-                return f"TRIM(CAST({column_name} AS VARCHAR(2500)))"
-            return f"TRIM(CAST({column_name} AS NVARCHAR(5000)))"
-        select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", safe_trim, select_query)
-        select_query = re.sub(r"TRIM\(([\w]+)\)", safe_trim, select_query)
+        # def safe_trim(match):
+        #     column_name = match.group(1)
+        #     if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
+        #         return f"TRIM(CAST({column_name} AS VARCHAR(2500)))"
+        #     return f"TRIM(CAST({column_name} AS NVARCHAR(5000)))"
+        # select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", safe_trim, select_query)
+        # select_query = re.sub(r"TRIM\(([\w]+)\)", safe_trim, select_query)
         if limit is not None:
             select_query = select_query.replace("SELECT", f"SELECT TOP {limit}", 1)
@@ -225,8 +224,8 @@ class Dialect(BaseDialect):
                 f"END"
             )
         if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
-            return f"CAST({value} AS VARCHAR(2500))"
-        return f"CAST({value} AS NVARCHAR(5000))"
+            return f"CAST({value} AS VARCHAR(100))"
+        return f"CAST({value} AS NVARCHAR(100))"
     def normalize_number(self, value: str, coltype: FractionalType) -> str:
         return self.to_string(f"CAST({value} AS DECIMAL(38, {coltype.precision}))")
@@ -326,13 +325,19 @@ class Dialect(BaseDialect):
         return " || ".join(items)
     def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
+        s_temp = re.sub(r'["\[\]`]', "", value)
+        raw_col_info = self.get_column_raw_info(s_temp)
+        ch_len = (raw_col_info and raw_col_info.character_maximum_length) or None
+        if not ch_len:
+            ch_len = 2500
+        ch_len = max(ch_len, 2500)
         if isinstance(coltype, String_UUID):
             if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
-                return f"CAST({value} AS VARCHAR(2500))"  # IQ: Match column length
-            return f"CAST({value} AS NVARCHAR(5000))"  # ASE: Match column length
+                return f"CAST({value} AS VARCHAR({ch_len}))"  # IQ: Match column length
+            return f"CAST({value} AS NVARCHAR({ch_len}))"  # ASE: Match column length
         if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
-            return f"CONVERT(VARCHAR(36), {value})"
-        return f"CONVERT(NVARCHAR(36), {value})"
+            return f"CONVERT(VARCHAR({ch_len}), {value})"
+        return f"CONVERT(NVARCHAR({ch_len}), {value})"
     def parse_table_name(self, name: str) -> DbPath:
         "Parse the given table name into a DbPath"

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/hashdiff_tables.py RENAMED Viewed

@@ -522,7 +522,7 @@ class HashDiffer(TableDiffer):
         if count1 == 0 and count2 == 0:
             logger.debug(
-                "Uneven distribution of keys detected in segment %s..%s (big gaps in the key column). "
+                "Uneven distribution of keys detected in segment {}..{} (big gaps in the key column). "
                 "For better performance, we recommend to increase the bisection-threshold.",
                 table1.min_key,
                 table1.max_key,

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/table_segment.py RENAMED Viewed

@@ -14,6 +14,7 @@
 import logging
 import time
+from decimal import Decimal
 from itertools import product
 from typing import Container, Dict, List, Optional, Sequence, Tuple
@@ -24,7 +25,18 @@ from typing_extensions import Self
 from data_diff.abcs.database_types import DbKey, DbPath, DbTime, IKey, NumericType
 from data_diff.databases.base import Database
-from data_diff.queries.api import SKIP, Code, Count, Expr, max_, min_, table, this
+from data_diff.queries.api import (
+    SKIP,
+    Code,
+    Count,
+    Expr,
+    and_,
+    max_,
+    min_,
+    or_,
+    table,
+    this,
+)
 from data_diff.queries.extras import (
     ApplyFuncAndNormalizeAsString,
     Checksum,
@@ -54,6 +66,10 @@ def split_key_space(min_key: DbKey, max_key: DbKey, count: int) -> List[DbKey]:
         assert type(min_key) is type(max_key)
         checkpoints = min_key.range(max_key, count)
     else:
+        if isinstance(min_key, Decimal):
+            min_key = float(min_key)
+        if isinstance(max_key, Decimal):
+            max_key = float(max_key)
         checkpoints = split_space(min_key, max_key, count)
     assert all(min_key < x < max_key for x in checkpoints)
@@ -288,17 +304,65 @@ class TableSegment:
         return result
-    def get_sample_data(self, limit: int = 100) -> list:
-        "Download all the relevant values of the segment from the database"
+    # def get_sample_data(self, limit: int = 100) -> list:
+    #     "Download all the relevant values of the segment from the database"
+    #     exprs = []
+    #     for c in self.key_columns:
+    #         quoted = self.database.dialect.quote(c)
+    #         exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
+    #     if self.where:
+    #         select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
+    #         self.key_columns
+    #     else:
+    #         select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
+    #     start_time = time.monotonic()
+    #     result = self.database.query(select, List[Tuple])
+    #     query_time_ms = (time.monotonic() - start_time) * 1000
+    #     self._update_stats("row_fetch_queries_stats", query_time_ms)
+    def get_sample_data(self, limit: int = 100, sample_keys: Optional[List[List[DbKey]]] = None) -> list:
+        """
+        Download relevant values of the segment from the database.
+        If `sample_keys` is provided, it filters rows matching those composite keys.
+        Parameters:
+            limit (int): Maximum number of rows to return (default: 100).
+            sample_keys (Optional[List[List[DbKey]]]): List of composite keys to filter rows.
+                Each inner list must match the number of key_columns.
+        Returns:
+            list: List of tuples containing the queried row data.
+        """
+        select = self.make_select().select(*self._relevant_columns_repr)
-        exprs = []
-        for c in self.key_columns:
-            quoted = self.database.dialect.quote(c)
-            exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
-        if self.where:
-            select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
+        filters = []
+        if sample_keys:
+            key_exprs = []
+            for key_values in sample_keys:
+                and_exprs = []
+                for col, val in safezip(self.key_columns, key_values):
+                    quoted = self.database.dialect.quote(col)
+                    schema = self._schema[col]
+                    if val is None:
+                        and_exprs.append(Code(quoted + " IS NULL"))
+                        continue
+                    mk_v = schema.make_value(val)
+                    constant_val = self.database.dialect._constant_value(mk_v)
+                    where_expr = f"{quoted} = {constant_val}"
+                    and_exprs.append(Code(where_expr))
+                if and_exprs:
+                    key_exprs.append(and_(*and_exprs))
+            if key_exprs:
+                filters.append(or_(*key_exprs))
+        if filters or self.where:
+            select = select.where(*filters)
         else:
-            select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
+            logger.warning("No filters applied; fetching up to {} rows without key restrictions", limit)
+        select = select.limit(limit)
         start_time = time.monotonic()
         result = self.database.query(select, List[Tuple])
@@ -317,52 +381,9 @@ class TableSegment:
         return split_compound_key_space(self.min_key, self.max_key, count)
-    def choose_checkpoints(self, count: int) -> List[Tuple[DbKey]]:
-        """Returns count evenly spaced checkpoints (total segments ~= count), works for multi-key."""
-        assert self.is_bounded, "Cannot split unbounded key space"
-        if count < 1:
-            return [self.min_key, self.max_key]
-        # Check if all keys are ArithString (includes ArithAlphanumeric)
-        if all(isinstance(k, (ArithString, ArithUnicodeString)) for k in self.min_key) and all(
-            isinstance(k, (ArithString, ArithUnicodeString)) for k in self.max_key
-        ):
-            # Use split_key_space for each key dimension
-            checkpoints_per_dim = [split_key_space(mn, mx, count) for mn, mx in safezip(self.min_key, self.max_key)]
-            # Create a mesh of checkpoints using create_mesh_from_points
-            return [tuple(start) for start, _ in create_mesh_from_points(*checkpoints_per_dim)]
-        else:
-            # Fallback to numeric interpolation for non-ArithString keys
-            def interpolate_key(fraction: float) -> Tuple[DbKey, ...]:
-                return tuple(int(mn + (mx - mn) * fraction) for mn, mx in zip(self.min_key, self.max_key))
-            return [interpolate_key(i / count) for i in range(count + 1)]
-    # def choose_checkpoints(
-    #     self, max_key_range_per_segment: int = 1_000_000, total_rows: Optional[int] = None
-    # ) -> List[List[DbKey]]:
-    #     """Suggests checkpoints to split by, including start and end.
-    #     Uses linear interpolation across the entire compound key space to ensure segment
-    #     sizes remain under `max_segment_size`, even for multi-column primary keys.
-    #     """
-    #     key_range = self.max_key[0] - self.min_key[0]
-    #     segment_count = max(1, key_range // max_key_range_per_segment)
-    #     segment_count = min(segment_count, 500)  # Cap it for safety
-    #     def interpolate_key(fraction: float) -> Tuple[DbKey, ...]:
-    #         return tuple(int(mn + (mx - mn) * fraction) for mn, mx in zip(self.min_key, self.max_key))
-    #     return [interpolate_key(i / segment_count) for i in range(segment_count + 1)]
     def segment_by_checkpoints(self, checkpoints: List[List[DbKey]]) -> List["TableSegment"]:
         "Split the current TableSegment to a bunch of smaller ones, separated by the given checkpoints"
-        # return [self.new_key_bounds(min_key=s, max_key=e) for s, e in create_mesh_from_points(*checkpoints)]
-        return [
-            self.new_key_bounds(min_key=start, max_key=end) for start, end in zip(checkpoints[:-1], checkpoints[1:])
-        ]
+        return [self.new_key_bounds(min_key=s, max_key=e) for s, e in create_mesh_from_points(*checkpoints)]
     def new(self, **kwargs) -> Self:
         """Creates a copy of the instance using 'replace()'"""

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/utils.py RENAMED Viewed

@@ -482,6 +482,9 @@ def number_to_human(n):
 def split_space(start, end, count) -> List[int]:
+    if isinstance(start, float) or isinstance(end, float):
+        step = (end - start) / (count + 1)
+        return [start + step * i for i in range(1, count + 1)]
     size = end - start
     assert count <= size, (count, size)
     return list(range(start, end, (size + 1) // (count + 1)))[1 : count + 1]

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/__version__.py RENAMED Viewed

@@ -12,4 +12,4 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-__version__ = "1.4.7"
+__version__ = "1.4.9"

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/data_diff/data_differ.py RENAMED Viewed

@@ -395,8 +395,10 @@ class DBTableDiffer:
                     error_message = f"Target table '{target_dataset.get('table_name')}' is empty"
                 is_table_empty = True
             if not is_table_empty and not self.config.schema_diff:
+                pks_len = len(self.table1.key_columns)
                 table_1_sample_data = self.table1.with_schema().get_sample_data(limit=100)
-                table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100)
+                sample_keys = [list(row[:pks_len]) for row in table_1_sample_data]
+                table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100, sample_keys=sample_keys)
                 self.diff_iter = diff_tables(
                     self.table1,
                     self.table2,
@@ -598,9 +600,14 @@ class DBTableDiffer:
                     columns_order_wise=columns_order_wise_target if columns_order_wise_target else [],
                 )
-                sample_value_column_names = list(self.table1.key_columns) + list(self.table1.extra_columns)
-                sample_value_source_dicts = [dict(zip(sample_value_column_names, row)) for row in table_1_sample_data]
-                sample_value_target_dicts = [dict(zip(sample_value_column_names, row)) for row in table_2_sample_data]
+                sample_value_column_names_src = list(self.table1.key_columns) + list(self.table1.extra_columns)
+                sample_value_column_names_tgt = list(self.table2.key_columns) + list(self.table2.extra_columns)
+                sample_value_source_dicts = [
+                    dict(zip(sample_value_column_names_src, row)) for row in table_1_sample_data
+                ]
+                sample_value_target_dicts = [
+                    dict(zip(sample_value_column_names_tgt, row)) for row in table_2_sample_data
+                ]
                 def get_pk(row, key_columns):
                     return tuple(row[k] for k in key_columns)

{dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcs-sdk"
-version = "1.4.7"
+version = "1.4.9"
 description = "SDK for DataChecks"
 authors = ["Waterdip Labs <hello@waterdip.ai>"]
 readme = "README.md"