dcs-sdk 1.4.7__tar.gz → 1.4.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/PKG-INFO +2 -2
  2. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/README.md +1 -1
  3. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/abcs/database_types.py +1 -1
  4. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/base.py +3 -0
  5. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/mssql.py +16 -8
  6. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/sybase.py +29 -24
  7. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/hashdiff_tables.py +1 -1
  8. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/table_segment.py +75 -54
  9. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/utils.py +3 -0
  10. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/__version__.py +1 -1
  11. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/data_diff/data_differ.py +11 -4
  12. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/pyproject.toml +1 -1
  13. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/__init__.py +0 -0
  14. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/__main__.py +0 -0
  15. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/abcs/__init__.py +0 -0
  16. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/abcs/compiler.py +0 -0
  17. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/config.py +0 -0
  18. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/__init__.py +0 -0
  19. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/_connect.py +0 -0
  20. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/bigquery.py +0 -0
  21. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/clickhouse.py +0 -0
  22. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/databricks.py +0 -0
  23. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/duckdb.py +0 -0
  24. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/mysql.py +0 -0
  25. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/oracle.py +0 -0
  26. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/postgresql.py +0 -0
  27. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/presto.py +0 -0
  28. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/redshift.py +0 -0
  29. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/snowflake.py +0 -0
  30. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/trino.py +0 -0
  31. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/databases/vertica.py +0 -0
  32. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/diff_tables.py +0 -0
  33. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/errors.py +0 -0
  34. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/format.py +0 -0
  35. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/info_tree.py +0 -0
  36. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/joindiff_tables.py +0 -0
  37. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/lexicographic_space.py +0 -0
  38. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/parse_time.py +0 -0
  39. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/py.typed +0 -0
  40. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/queries/__init__.py +0 -0
  41. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/queries/api.py +0 -0
  42. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/queries/ast_classes.py +0 -0
  43. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/queries/base.py +0 -0
  44. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/queries/extras.py +0 -0
  45. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/query_utils.py +0 -0
  46. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/schema.py +0 -0
  47. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/thread_utils.py +0 -0
  48. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/data_diff/version.py +0 -0
  49. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/__init__.py +0 -0
  50. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/__main__.py +0 -0
  51. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/cli/__init__.py +0 -0
  52. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/cli/cli.py +0 -0
  53. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/__init__.py +0 -0
  54. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/config/__init__.py +0 -0
  55. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/config/config_loader.py +0 -0
  56. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/data_diff/__init__.py +0 -0
  57. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/__init__.py +0 -0
  58. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/rules_mappping.py +0 -0
  59. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/rules_repository.py +0 -0
  60. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/schema_rules.py +0 -0
  61. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/__init__.py +0 -0
  62. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/serializer.py +0 -0
  63. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/__init__.py +0 -0
  64. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/base_provider.py +0 -0
  65. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +0 -0
  66. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +0 -0
  67. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +0 -0
  68. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/table.py +0 -0
  69. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/themes.py +0 -0
  70. {dcs_sdk-1.4.7 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dcs-sdk
3
- Version: 1.4.7
3
+ Version: 1.4.9
4
4
  Summary: SDK for DataChecks
5
5
  Author: Waterdip Labs
6
6
  Author-email: hello@waterdip.ai
@@ -60,7 +60,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
60
60
  Description-Content-Type: text/markdown
61
61
 
62
62
  <h1 align="center">
63
- DCS SDK v1.4.7
63
+ DCS SDK v1.4.9
64
64
  </h1>
65
65
 
66
66
  > SDK for DataChecks
@@ -1,5 +1,5 @@
1
1
  <h1 align="center">
2
- DCS SDK v1.4.7
2
+ DCS SDK v1.4.9
3
3
  </h1>
4
4
 
5
5
  > SDK for DataChecks
@@ -22,7 +22,7 @@ import attrs
22
22
  from data_diff.utils import ArithAlphanumeric, ArithUnicodeString, ArithUUID, Unknown
23
23
 
24
24
  DbPath = Tuple[str, ...]
25
- DbKey = Union[int, str, bytes, ArithUUID, ArithAlphanumeric]
25
+ DbKey = Union[int, str, bytes, ArithUUID, ArithAlphanumeric, ArithUnicodeString]
26
26
  DbTime = datetime
27
27
 
28
28
  N = TypeVar("N")
@@ -54,6 +54,7 @@ from typing_extensions import Self
54
54
  from data_diff.abcs.compiler import AbstractCompiler, Compilable
55
55
  from data_diff.abcs.database_types import (
56
56
  JSON,
57
+ ArithAlphanumeric,
57
58
  Array,
58
59
  Boolean,
59
60
  ColType,
@@ -753,6 +754,8 @@ class BaseDialect(abc.ABC):
753
754
  return f"'{v.decode()}'"
754
755
  elif isinstance(v, Code):
755
756
  return v.code
757
+ elif isinstance(v, ArithAlphanumeric):
758
+ return f"'{v._str}'"
756
759
  return repr(v)
757
760
 
758
761
  def constant_values(self, rows) -> str:
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import re
15
16
  from typing import Any, ClassVar, Dict, Optional, Type
16
17
 
17
18
  import attrs
@@ -118,15 +119,16 @@ class Dialect(BaseDialect):
118
119
  WHERE name = CURRENT_USER"""
119
120
 
120
121
  def to_string(self, s: str) -> str:
121
- # Both convert(varchar(max), …) and convert(text, …) do work.
122
- import re
123
-
124
122
  s_temp = re.sub(r'["\[\]`]', "", s)
125
123
  col_info = self.get_column_raw_info(s_temp)
124
+ ch_len = (col_info and col_info.character_maximum_length) or None
125
+ if not ch_len:
126
+ ch_len = 2500
127
+ ch_len = max(ch_len, 2500)
126
128
  if col_info and col_info.data_type in ["nvarchar", "nchar", "ntext"]:
127
- return f"CONVERT(NVARCHAR(MAX), {s})"
129
+ return f"CONVERT(NVARCHAR({ch_len}), {s})"
128
130
 
129
- return f"CONVERT(VARCHAR(MAX), {s})"
131
+ return f"CONVERT(VARCHAR({ch_len}), {s})"
130
132
 
131
133
  def type_repr(self, t) -> str:
132
134
  try:
@@ -165,9 +167,9 @@ class Dialect(BaseDialect):
165
167
 
166
168
  # select_query = re.sub(r"TRIM\(([\w]+)\)", r"TRIM(CAST(\1 AS NVARCHAR(MAX)))", select_query)
167
169
 
168
- select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", r"LTRIM(RTRIM(CAST([\1] AS VARCHAR(8000))))", select_query)
170
+ # select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", r"LTRIM(RTRIM(CAST([\1] AS VARCHAR(8000))))", select_query)
169
171
 
170
- select_query = re.sub(r"TRIM\(([\w]+)\)", r"LTRIM(RTRIM(CAST(\1 AS VARCHAR(8000))))", select_query)
172
+ # select_query = re.sub(r"TRIM\(([\w]+)\)", r"LTRIM(RTRIM(CAST(\1 AS VARCHAR(8000))))", select_query)
171
173
 
172
174
  return f"{select_query} {result}"
173
175
 
@@ -206,8 +208,14 @@ class Dialect(BaseDialect):
206
208
  return tuple(name.split("."))
207
209
 
208
210
  def normalize_uuid(self, value, coltype):
211
+ s_temp = re.sub(r'["\[\]`]', "", value)
212
+ col_info = self.get_column_raw_info(s_temp)
213
+ ch_len = (col_info and col_info.character_maximum_length) or None
214
+ if not ch_len:
215
+ ch_len = 2500
216
+ ch_len = max(ch_len, 2500)
209
217
  if isinstance(coltype, String_UUID):
210
- return f"CAST({value} AS VARCHAR(MAX))"
218
+ return f"CAST({value} AS VARCHAR({ch_len}))"
211
219
  return f"CAST({value} AS VARCHAR(36))"
212
220
 
213
221
 
@@ -12,6 +12,7 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ import re
15
16
  import time
16
17
  from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type
17
18
 
@@ -140,17 +141,17 @@ class Dialect(BaseDialect):
140
141
  WHERE name = CURRENT_USER"""
141
142
 
142
143
  def to_string(self, s: str, coltype: str = None) -> str:
143
- if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
144
- # Sybase IQ or FreeTDS detected as IQ: Use VARCHAR(2500)
145
- return f"CAST({s} AS VARCHAR(2500))"
146
- # Sybase ASE or FreeTDS detected as ASE: Handle nvarchar
147
- import re
148
-
149
144
  s_temp = re.sub(r'["\[\]`]', "", s)
150
145
  raw_col_info = self.get_column_raw_info(s_temp)
146
+ ch_len = (raw_col_info and raw_col_info.character_maximum_length) or None
147
+ if not ch_len:
148
+ ch_len = 2500
149
+ ch_len = max(ch_len, 2500)
150
+ if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
151
+ return f"CAST({s} AS VARCHAR({ch_len}))"
151
152
  if raw_col_info and raw_col_info.data_type in ["nvarchar", "nchar", "ntext"]:
152
- return f"CAST({s} AS NVARCHAR(5000))" # ASE max for nvarchar
153
- return f"CAST({s} AS VARCHAR(2500))"
153
+ return f"CAST({s} AS NVARCHAR({ch_len}))"
154
+ return f"CAST({s} AS VARCHAR({ch_len}))"
154
155
 
155
156
  def type_repr(self, t) -> str:
156
157
  try:
@@ -173,17 +174,15 @@ class Dialect(BaseDialect):
173
174
  limit: Optional[int] = None,
174
175
  has_order_by: Optional[bool] = None,
175
176
  ) -> str:
176
- import re
177
+ # import re
177
178
 
178
- def safe_trim(match):
179
- column_name = match.group(1)
180
- if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
181
- return f"TRIM(CAST({column_name} AS VARCHAR(2500)))"
182
- return f"TRIM(CAST({column_name} AS NVARCHAR(5000)))"
183
-
184
- select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", safe_trim, select_query)
185
-
186
- select_query = re.sub(r"TRIM\(([\w]+)\)", safe_trim, select_query)
179
+ # def safe_trim(match):
180
+ # column_name = match.group(1)
181
+ # if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
182
+ # return f"TRIM(CAST({column_name} AS VARCHAR(2500)))"
183
+ # return f"TRIM(CAST({column_name} AS NVARCHAR(5000)))"
184
+ # select_query = re.sub(r"TRIM\(\[([\w]+)\]\)", safe_trim, select_query)
185
+ # select_query = re.sub(r"TRIM\(([\w]+)\)", safe_trim, select_query)
187
186
 
188
187
  if limit is not None:
189
188
  select_query = select_query.replace("SELECT", f"SELECT TOP {limit}", 1)
@@ -225,8 +224,8 @@ class Dialect(BaseDialect):
225
224
  f"END"
226
225
  )
227
226
  if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
228
- return f"CAST({value} AS VARCHAR(2500))"
229
- return f"CAST({value} AS NVARCHAR(5000))"
227
+ return f"CAST({value} AS VARCHAR(100))"
228
+ return f"CAST({value} AS NVARCHAR(100))"
230
229
 
231
230
  def normalize_number(self, value: str, coltype: FractionalType) -> str:
232
231
  return self.to_string(f"CAST({value} AS DECIMAL(38, {coltype.precision}))")
@@ -326,13 +325,19 @@ class Dialect(BaseDialect):
326
325
  return " || ".join(items)
327
326
 
328
327
  def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
328
+ s_temp = re.sub(r'["\[\]`]', "", value)
329
+ raw_col_info = self.get_column_raw_info(s_temp)
330
+ ch_len = (raw_col_info and raw_col_info.character_maximum_length) or None
331
+ if not ch_len:
332
+ ch_len = 2500
333
+ ch_len = max(ch_len, 2500)
329
334
  if isinstance(coltype, String_UUID):
330
335
  if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
331
- return f"CAST({value} AS VARCHAR(2500))" # IQ: Match column length
332
- return f"CAST({value} AS NVARCHAR(5000))" # ASE: Match column length
336
+ return f"CAST({value} AS VARCHAR({ch_len}))" # IQ: Match column length
337
+ return f"CAST({value} AS NVARCHAR({ch_len}))" # ASE: Match column length
333
338
  if self.sybase_driver_type.is_iq or self.query_config_for_free_tds["freetds_query_chosen"]:
334
- return f"CONVERT(VARCHAR(36), {value})"
335
- return f"CONVERT(NVARCHAR(36), {value})"
339
+ return f"CONVERT(VARCHAR({ch_len}), {value})"
340
+ return f"CONVERT(NVARCHAR({ch_len}), {value})"
336
341
 
337
342
  def parse_table_name(self, name: str) -> DbPath:
338
343
  "Parse the given table name into a DbPath"
@@ -522,7 +522,7 @@ class HashDiffer(TableDiffer):
522
522
 
523
523
  if count1 == 0 and count2 == 0:
524
524
  logger.debug(
525
- "Uneven distribution of keys detected in segment %s..%s (big gaps in the key column). "
525
+ "Uneven distribution of keys detected in segment {}..{} (big gaps in the key column). "
526
526
  "For better performance, we recommend to increase the bisection-threshold.",
527
527
  table1.min_key,
528
528
  table1.max_key,
@@ -14,6 +14,7 @@
14
14
 
15
15
  import logging
16
16
  import time
17
+ from decimal import Decimal
17
18
  from itertools import product
18
19
  from typing import Container, Dict, List, Optional, Sequence, Tuple
19
20
 
@@ -24,7 +25,18 @@ from typing_extensions import Self
24
25
 
25
26
  from data_diff.abcs.database_types import DbKey, DbPath, DbTime, IKey, NumericType
26
27
  from data_diff.databases.base import Database
27
- from data_diff.queries.api import SKIP, Code, Count, Expr, max_, min_, table, this
28
+ from data_diff.queries.api import (
29
+ SKIP,
30
+ Code,
31
+ Count,
32
+ Expr,
33
+ and_,
34
+ max_,
35
+ min_,
36
+ or_,
37
+ table,
38
+ this,
39
+ )
28
40
  from data_diff.queries.extras import (
29
41
  ApplyFuncAndNormalizeAsString,
30
42
  Checksum,
@@ -54,6 +66,10 @@ def split_key_space(min_key: DbKey, max_key: DbKey, count: int) -> List[DbKey]:
54
66
  assert type(min_key) is type(max_key)
55
67
  checkpoints = min_key.range(max_key, count)
56
68
  else:
69
+ if isinstance(min_key, Decimal):
70
+ min_key = float(min_key)
71
+ if isinstance(max_key, Decimal):
72
+ max_key = float(max_key)
57
73
  checkpoints = split_space(min_key, max_key, count)
58
74
 
59
75
  assert all(min_key < x < max_key for x in checkpoints)
@@ -288,17 +304,65 @@ class TableSegment:
288
304
 
289
305
  return result
290
306
 
291
- def get_sample_data(self, limit: int = 100) -> list:
292
- "Download all the relevant values of the segment from the database"
307
+ # def get_sample_data(self, limit: int = 100) -> list:
308
+ # "Download all the relevant values of the segment from the database"
309
+
310
+ # exprs = []
311
+ # for c in self.key_columns:
312
+ # quoted = self.database.dialect.quote(c)
313
+ # exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
314
+ # if self.where:
315
+ # select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
316
+ # self.key_columns
317
+ # else:
318
+ # select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
319
+
320
+ # start_time = time.monotonic()
321
+ # result = self.database.query(select, List[Tuple])
322
+ # query_time_ms = (time.monotonic() - start_time) * 1000
323
+ # self._update_stats("row_fetch_queries_stats", query_time_ms)
324
+
325
+ def get_sample_data(self, limit: int = 100, sample_keys: Optional[List[List[DbKey]]] = None) -> list:
326
+ """
327
+ Download relevant values of the segment from the database.
328
+ If `sample_keys` is provided, it filters rows matching those composite keys.
329
+
330
+ Parameters:
331
+ limit (int): Maximum number of rows to return (default: 100).
332
+ sample_keys (Optional[List[List[DbKey]]]): List of composite keys to filter rows.
333
+ Each inner list must match the number of key_columns.
334
+
335
+ Returns:
336
+ list: List of tuples containing the queried row data.
337
+ """
338
+ select = self.make_select().select(*self._relevant_columns_repr)
293
339
 
294
- exprs = []
295
- for c in self.key_columns:
296
- quoted = self.database.dialect.quote(c)
297
- exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
298
- if self.where:
299
- select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
340
+ filters = []
341
+
342
+ if sample_keys:
343
+ key_exprs = []
344
+ for key_values in sample_keys:
345
+ and_exprs = []
346
+ for col, val in safezip(self.key_columns, key_values):
347
+ quoted = self.database.dialect.quote(col)
348
+ schema = self._schema[col]
349
+ if val is None:
350
+ and_exprs.append(Code(quoted + " IS NULL"))
351
+ continue
352
+ mk_v = schema.make_value(val)
353
+ constant_val = self.database.dialect._constant_value(mk_v)
354
+ where_expr = f"{quoted} = {constant_val}"
355
+ and_exprs.append(Code(where_expr))
356
+ if and_exprs:
357
+ key_exprs.append(and_(*and_exprs))
358
+ if key_exprs:
359
+ filters.append(or_(*key_exprs))
360
+ if filters or self.where:
361
+ select = select.where(*filters)
300
362
  else:
301
- select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
363
+ logger.warning("No filters applied; fetching up to {} rows without key restrictions", limit)
364
+
365
+ select = select.limit(limit)
302
366
 
303
367
  start_time = time.monotonic()
304
368
  result = self.database.query(select, List[Tuple])
@@ -317,52 +381,9 @@ class TableSegment:
317
381
 
318
382
  return split_compound_key_space(self.min_key, self.max_key, count)
319
383
 
320
- def choose_checkpoints(self, count: int) -> List[Tuple[DbKey]]:
321
- """Returns count evenly spaced checkpoints (total segments ~= count), works for multi-key."""
322
- assert self.is_bounded, "Cannot split unbounded key space"
323
-
324
- if count < 1:
325
- return [self.min_key, self.max_key]
326
-
327
- # Check if all keys are ArithString (includes ArithAlphanumeric)
328
- if all(isinstance(k, (ArithString, ArithUnicodeString)) for k in self.min_key) and all(
329
- isinstance(k, (ArithString, ArithUnicodeString)) for k in self.max_key
330
- ):
331
- # Use split_key_space for each key dimension
332
- checkpoints_per_dim = [split_key_space(mn, mx, count) for mn, mx in safezip(self.min_key, self.max_key)]
333
- # Create a mesh of checkpoints using create_mesh_from_points
334
- return [tuple(start) for start, _ in create_mesh_from_points(*checkpoints_per_dim)]
335
- else:
336
- # Fallback to numeric interpolation for non-ArithString keys
337
- def interpolate_key(fraction: float) -> Tuple[DbKey, ...]:
338
- return tuple(int(mn + (mx - mn) * fraction) for mn, mx in zip(self.min_key, self.max_key))
339
-
340
- return [interpolate_key(i / count) for i in range(count + 1)]
341
-
342
- # def choose_checkpoints(
343
- # self, max_key_range_per_segment: int = 1_000_000, total_rows: Optional[int] = None
344
- # ) -> List[List[DbKey]]:
345
- # """Suggests checkpoints to split by, including start and end.
346
-
347
- # Uses linear interpolation across the entire compound key space to ensure segment
348
- # sizes remain under `max_segment_size`, even for multi-column primary keys.
349
- # """
350
- # key_range = self.max_key[0] - self.min_key[0]
351
- # segment_count = max(1, key_range // max_key_range_per_segment)
352
- # segment_count = min(segment_count, 500) # Cap it for safety
353
-
354
- # def interpolate_key(fraction: float) -> Tuple[DbKey, ...]:
355
- # return tuple(int(mn + (mx - mn) * fraction) for mn, mx in zip(self.min_key, self.max_key))
356
-
357
- # return [interpolate_key(i / segment_count) for i in range(segment_count + 1)]
358
-
359
384
  def segment_by_checkpoints(self, checkpoints: List[List[DbKey]]) -> List["TableSegment"]:
360
385
  "Split the current TableSegment to a bunch of smaller ones, separated by the given checkpoints"
361
-
362
- # return [self.new_key_bounds(min_key=s, max_key=e) for s, e in create_mesh_from_points(*checkpoints)]
363
- return [
364
- self.new_key_bounds(min_key=start, max_key=end) for start, end in zip(checkpoints[:-1], checkpoints[1:])
365
- ]
386
+ return [self.new_key_bounds(min_key=s, max_key=e) for s, e in create_mesh_from_points(*checkpoints)]
366
387
 
367
388
  def new(self, **kwargs) -> Self:
368
389
  """Creates a copy of the instance using 'replace()'"""
@@ -482,6 +482,9 @@ def number_to_human(n):
482
482
 
483
483
 
484
484
  def split_space(start, end, count) -> List[int]:
485
+ if isinstance(start, float) or isinstance(end, float):
486
+ step = (end - start) / (count + 1)
487
+ return [start + step * i for i in range(1, count + 1)]
485
488
  size = end - start
486
489
  assert count <= size, (count, size)
487
490
  return list(range(start, end, (size + 1) // (count + 1)))[1 : count + 1]
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.4.7"
15
+ __version__ = "1.4.9"
@@ -395,8 +395,10 @@ class DBTableDiffer:
395
395
  error_message = f"Target table '{target_dataset.get('table_name')}' is empty"
396
396
  is_table_empty = True
397
397
  if not is_table_empty and not self.config.schema_diff:
398
+ pks_len = len(self.table1.key_columns)
398
399
  table_1_sample_data = self.table1.with_schema().get_sample_data(limit=100)
399
- table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100)
400
+ sample_keys = [list(row[:pks_len]) for row in table_1_sample_data]
401
+ table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100, sample_keys=sample_keys)
400
402
  self.diff_iter = diff_tables(
401
403
  self.table1,
402
404
  self.table2,
@@ -598,9 +600,14 @@ class DBTableDiffer:
598
600
  columns_order_wise=columns_order_wise_target if columns_order_wise_target else [],
599
601
  )
600
602
 
601
- sample_value_column_names = list(self.table1.key_columns) + list(self.table1.extra_columns)
602
- sample_value_source_dicts = [dict(zip(sample_value_column_names, row)) for row in table_1_sample_data]
603
- sample_value_target_dicts = [dict(zip(sample_value_column_names, row)) for row in table_2_sample_data]
603
+ sample_value_column_names_src = list(self.table1.key_columns) + list(self.table1.extra_columns)
604
+ sample_value_column_names_tgt = list(self.table2.key_columns) + list(self.table2.extra_columns)
605
+ sample_value_source_dicts = [
606
+ dict(zip(sample_value_column_names_src, row)) for row in table_1_sample_data
607
+ ]
608
+ sample_value_target_dicts = [
609
+ dict(zip(sample_value_column_names_tgt, row)) for row in table_2_sample_data
610
+ ]
604
611
 
605
612
  def get_pk(row, key_columns):
606
613
  return tuple(row[k] for k in key_columns)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dcs-sdk"
3
- version = "1.4.7"
3
+ version = "1.4.9"
4
4
  description = "SDK for DataChecks"
5
5
  authors = ["Waterdip Labs <hello@waterdip.ai>"]
6
6
  readme = "README.md"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes