dcs-sdk 1.4.8__py3-none-any.whl → 1.4.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,6 +54,7 @@ from typing_extensions import Self
54
54
  from data_diff.abcs.compiler import AbstractCompiler, Compilable
55
55
  from data_diff.abcs.database_types import (
56
56
  JSON,
57
+ ArithAlphanumeric,
57
58
  Array,
58
59
  Boolean,
59
60
  ColType,
@@ -753,6 +754,8 @@ class BaseDialect(abc.ABC):
753
754
  return f"'{v.decode()}'"
754
755
  elif isinstance(v, Code):
755
756
  return v.code
757
+ elif isinstance(v, ArithAlphanumeric):
758
+ return f"'{v._str}'"
756
759
  return repr(v)
757
760
 
758
761
  def constant_values(self, rows) -> str:
@@ -522,7 +522,7 @@ class HashDiffer(TableDiffer):
522
522
 
523
523
  if count1 == 0 and count2 == 0:
524
524
  logger.debug(
525
- "Uneven distribution of keys detected in segment %s..%s (big gaps in the key column). "
525
+ "Uneven distribution of keys detected in segment {}..{} (big gaps in the key column). "
526
526
  "For better performance, we recommend to increase the bisection-threshold.",
527
527
  table1.min_key,
528
528
  table1.max_key,
@@ -14,6 +14,7 @@
14
14
 
15
15
  import logging
16
16
  import time
17
+ from decimal import Decimal
17
18
  from itertools import product
18
19
  from typing import Container, Dict, List, Optional, Sequence, Tuple
19
20
 
@@ -24,7 +25,18 @@ from typing_extensions import Self
24
25
 
25
26
  from data_diff.abcs.database_types import DbKey, DbPath, DbTime, IKey, NumericType
26
27
  from data_diff.databases.base import Database
27
- from data_diff.queries.api import SKIP, Code, Count, Expr, max_, min_, table, this
28
+ from data_diff.queries.api import (
29
+ SKIP,
30
+ Code,
31
+ Count,
32
+ Expr,
33
+ and_,
34
+ max_,
35
+ min_,
36
+ or_,
37
+ table,
38
+ this,
39
+ )
28
40
  from data_diff.queries.extras import (
29
41
  ApplyFuncAndNormalizeAsString,
30
42
  Checksum,
@@ -54,6 +66,10 @@ def split_key_space(min_key: DbKey, max_key: DbKey, count: int) -> List[DbKey]:
54
66
  assert type(min_key) is type(max_key)
55
67
  checkpoints = min_key.range(max_key, count)
56
68
  else:
69
+ if isinstance(min_key, Decimal):
70
+ min_key = float(min_key)
71
+ if isinstance(max_key, Decimal):
72
+ max_key = float(max_key)
57
73
  checkpoints = split_space(min_key, max_key, count)
58
74
 
59
75
  assert all(min_key < x < max_key for x in checkpoints)
@@ -288,17 +304,65 @@ class TableSegment:
288
304
 
289
305
  return result
290
306
 
291
- def get_sample_data(self, limit: int = 100) -> list:
292
- "Download all the relevant values of the segment from the database"
307
+ # def get_sample_data(self, limit: int = 100) -> list:
308
+ # "Download all the relevant values of the segment from the database"
309
+
310
+ # exprs = []
311
+ # for c in self.key_columns:
312
+ # quoted = self.database.dialect.quote(c)
313
+ # exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
314
+ # if self.where:
315
+ # select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
316
+ # self.key_columns
317
+ # else:
318
+ # select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
319
+
320
+ # start_time = time.monotonic()
321
+ # result = self.database.query(select, List[Tuple])
322
+ # query_time_ms = (time.monotonic() - start_time) * 1000
323
+ # self._update_stats("row_fetch_queries_stats", query_time_ms)
324
+
325
+ def get_sample_data(self, limit: int = 100, sample_keys: Optional[List[List[DbKey]]] = None) -> list:
326
+ """
327
+ Download relevant values of the segment from the database.
328
+ If `sample_keys` is provided, it filters rows matching those composite keys.
329
+
330
+ Parameters:
331
+ limit (int): Maximum number of rows to return (default: 100).
332
+ sample_keys (Optional[List[List[DbKey]]]): List of composite keys to filter rows.
333
+ Each inner list must match the number of key_columns.
334
+
335
+ Returns:
336
+ list: List of tuples containing the queried row data.
337
+ """
338
+ select = self.make_select().select(*self._relevant_columns_repr)
293
339
 
294
- exprs = []
295
- for c in self.key_columns:
296
- quoted = self.database.dialect.quote(c)
297
- exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
298
- if self.where:
299
- select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
340
+ filters = []
341
+
342
+ if sample_keys:
343
+ key_exprs = []
344
+ for key_values in sample_keys:
345
+ and_exprs = []
346
+ for col, val in safezip(self.key_columns, key_values):
347
+ quoted = self.database.dialect.quote(col)
348
+ schema = self._schema[col]
349
+ if val is None:
350
+ and_exprs.append(Code(quoted + " IS NULL"))
351
+ continue
352
+ mk_v = schema.make_value(val)
353
+ constant_val = self.database.dialect._constant_value(mk_v)
354
+ where_expr = f"{quoted} = {constant_val}"
355
+ and_exprs.append(Code(where_expr))
356
+ if and_exprs:
357
+ key_exprs.append(and_(*and_exprs))
358
+ if key_exprs:
359
+ filters.append(or_(*key_exprs))
360
+ if filters or self.where:
361
+ select = select.where(*filters)
300
362
  else:
301
- select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
363
+ logger.warning("No filters applied; fetching up to {} rows without key restrictions", limit)
364
+
365
+ select = select.limit(limit)
302
366
 
303
367
  start_time = time.monotonic()
304
368
  result = self.database.query(select, List[Tuple])
data_diff/utils.py CHANGED
@@ -482,6 +482,9 @@ def number_to_human(n):
482
482
 
483
483
 
484
484
  def split_space(start, end, count) -> List[int]:
485
+ if isinstance(start, float) or isinstance(end, float):
486
+ step = (end - start) / (count + 1)
487
+ return [start + step * i for i in range(1, count + 1)]
485
488
  size = end - start
486
489
  assert count <= size, (count, size)
487
490
  return list(range(start, end, (size + 1) // (count + 1)))[1 : count + 1]
dcs_sdk/__version__.py CHANGED
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.4.8"
15
+ __version__ = "1.4.9"
@@ -395,8 +395,10 @@ class DBTableDiffer:
395
395
  error_message = f"Target table '{target_dataset.get('table_name')}' is empty"
396
396
  is_table_empty = True
397
397
  if not is_table_empty and not self.config.schema_diff:
398
+ pks_len = len(self.table1.key_columns)
398
399
  table_1_sample_data = self.table1.with_schema().get_sample_data(limit=100)
399
- table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100)
400
+ sample_keys = [list(row[:pks_len]) for row in table_1_sample_data]
401
+ table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100, sample_keys=sample_keys)
400
402
  self.diff_iter = diff_tables(
401
403
  self.table1,
402
404
  self.table2,
@@ -598,9 +600,14 @@ class DBTableDiffer:
598
600
  columns_order_wise=columns_order_wise_target if columns_order_wise_target else [],
599
601
  )
600
602
 
601
- sample_value_column_names = list(self.table1.key_columns) + list(self.table1.extra_columns)
602
- sample_value_source_dicts = [dict(zip(sample_value_column_names, row)) for row in table_1_sample_data]
603
- sample_value_target_dicts = [dict(zip(sample_value_column_names, row)) for row in table_2_sample_data]
603
+ sample_value_column_names_src = list(self.table1.key_columns) + list(self.table1.extra_columns)
604
+ sample_value_column_names_tgt = list(self.table2.key_columns) + list(self.table2.extra_columns)
605
+ sample_value_source_dicts = [
606
+ dict(zip(sample_value_column_names_src, row)) for row in table_1_sample_data
607
+ ]
608
+ sample_value_target_dicts = [
609
+ dict(zip(sample_value_column_names_tgt, row)) for row in table_2_sample_data
610
+ ]
604
611
 
605
612
  def get_pk(row, key_columns):
606
613
  return tuple(row[k] for k in key_columns)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dcs-sdk
3
- Version: 1.4.8
3
+ Version: 1.4.9
4
4
  Summary: SDK for DataChecks
5
5
  Author: Waterdip Labs
6
6
  Author-email: hello@waterdip.ai
@@ -60,7 +60,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
60
60
  Description-Content-Type: text/markdown
61
61
 
62
62
  <h1 align="center">
63
- DCS SDK v1.4.8
63
+ DCS SDK v1.4.9
64
64
  </h1>
65
65
 
66
66
  > SDK for DataChecks
@@ -6,7 +6,7 @@ data_diff/abcs/database_types.py,sha256=dHE6K6UtqFwX6LTjOqJu1OOb_XJBXODeMcZQ0cds
6
6
  data_diff/config.py,sha256=uRcoVVhPjVZqgQNwr18v6sPq06cGXDLemTUyitU57zA,4998
7
7
  data_diff/databases/__init__.py,sha256=NrBm1Paj7jkHZ_hQCD-4-Q1eeDdh3v9_bz1DkPDOv9g,1680
8
8
  data_diff/databases/_connect.py,sha256=nGsmtzDSKN8CK8zMkdcGZz0iExzkJDYw-PGebIkmQgc,11151
9
- data_diff/databases/base.py,sha256=X3i59bJwZJSSAZl9HaL3NQtAmx04zSoA-E-YpBWWWng,51801
9
+ data_diff/databases/base.py,sha256=Fd_OzWOL97YiTXE_oYsBthXVVLvJGbU5Y4JgR77Sra0,51904
10
10
  data_diff/databases/bigquery.py,sha256=PDwSkmWRW26gUl2SMOyIsiYtgrqghptYqG8_SaaiXb4,14709
11
11
  data_diff/databases/clickhouse.py,sha256=5DsW8UpyYsWI8I3AlPUvHWYdWdWpqzRsvsVwgqEyaLw,7554
12
12
  data_diff/databases/databricks.py,sha256=6cdwfAspg1GIgWFlQByvFcW_Hz1mnJeI9_2kZhOP8b4,9334
@@ -24,7 +24,7 @@ data_diff/databases/vertica.py,sha256=2dSDZp6qOEvUVPldI5Tgn7Sm3dCpC3vNXJL3qb3FDv
24
24
  data_diff/diff_tables.py,sha256=Ey88gUr9Wh8UVsgRlBCY3CACIYfHL52PxJSrd821aqg,20060
25
25
  data_diff/errors.py,sha256=4Yru8yOwyuDuBlTABnGCvJMSpe6-rbLJpNnVHeTTyHU,745
26
26
  data_diff/format.py,sha256=QFDjdZaBVf_N-jfKiX4ppOUdpXTPZXmv1j0pc1RiOoc,10245
27
- data_diff/hashdiff_tables.py,sha256=1hIO6rsUHHAR5sgI-UaKf7xXbqLsXVTvnhPjRqSNeIE,27993
27
+ data_diff/hashdiff_tables.py,sha256=K-JoMimAwACB3mqQLIdv5P5joYWyRMSZKcbDJWz5dlk,27993
28
28
  data_diff/info_tree.py,sha256=yHtFSoXuu6oBafLYOYQjUSKlB-DnAAd08U9HOEAdTPI,2799
29
29
  data_diff/joindiff_tables.py,sha256=fyrEYjyh2BX1vGibwVZLYM1V6JJTOY-uGXY-KInvMkw,17612
30
30
  data_diff/lexicographic_space.py,sha256=bBoCbbH1Mla9jNOq1b5RuwjAxSVU7gWkra673tPBwXQ,8305
@@ -37,20 +37,20 @@ data_diff/queries/base.py,sha256=pT-iaII7Nlu-w-Cuq9fhoNKX7-GSxkQ3Fk8K-tMkk60,964
37
37
  data_diff/queries/extras.py,sha256=aUm-ifj3BMlz4o4bbuHtmnvHZuptYAKGS5yWTHmNpvc,1270
38
38
  data_diff/query_utils.py,sha256=R7ZfRwcvv9Zf4zWXNln4tr_OxLmDI7CPmmCahYfHxlo,2101
39
39
  data_diff/schema.py,sha256=QoYSSB3k-svLXz680uRgsI4qjii8BFKOOQvheqtgEbs,2413
40
- data_diff/table_segment.py,sha256=W6S9kebBoSfglbsbyiWprMy3AM9oFgH-_CsJrjIgEbc,19018
40
+ data_diff/table_segment.py,sha256=-zYdppv8lW8ZfDRzZvmARm1fYxGHhAYPurR0r0GmWh8,21350
41
41
  data_diff/thread_utils.py,sha256=_692ERjnWfHKaZsLdg7CNfkKiRd66y7_kpgDwzntp44,3831
42
- data_diff/utils.py,sha256=kOTSkS4L1TO3Crx5mclP9E4RgGlc7z2RwzSRmANwrzA,23163
42
+ data_diff/utils.py,sha256=IHKaFfyizOiA1By0sfnTra7CJvgEDwr5KPGjev0kx0A,23328
43
43
  data_diff/version.py,sha256=Wk0ovyBlLEF2UaWLWEcVBLFElREtIxi7TU1hD3CuTFI,634
44
44
  dcs_sdk/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
45
45
  dcs_sdk/__main__.py,sha256=Qn8stIaQGrdLjHQ-H7xO0T-brtq5RWZoWU9QvqoarV8,683
46
- dcs_sdk/__version__.py,sha256=mTRqCxtg196g8dlefiZhyJ3s3O3JtAAx9-0RLHgGFDQ,633
46
+ dcs_sdk/__version__.py,sha256=lS23jYuOvSiv7SJ1XHD4d1O93eassDRAcs2EuasDZ-0,633
47
47
  dcs_sdk/cli/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
48
48
  dcs_sdk/cli/cli.py,sha256=LyrRk972OL9pTqrvBeXWBu5rUDAN17lQ1g8FdSRW_8M,4299
49
49
  dcs_sdk/sdk/__init__.py,sha256=skrZcgWWJBL6NXTUERywJ3qRJRemgpDXyW7lPg1FJk8,2107
50
50
  dcs_sdk/sdk/config/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
51
51
  dcs_sdk/sdk/config/config_loader.py,sha256=Sg0gvleGrOcDaX41amgw6Uj6c_s4Zel6gQalIp9CltI,21416
52
52
  dcs_sdk/sdk/data_diff/__init__.py,sha256=RkfhRKLXEForLCs4rZkTf0qc_b0TokSggSAcKI4yfZg,610
53
- dcs_sdk/sdk/data_diff/data_differ.py,sha256=9Kl35REucew02V0WBWTTj6zM2TUlOrA7gJ4P8WGGb0Q,35212
53
+ dcs_sdk/sdk/data_diff/data_differ.py,sha256=8WBDz-q-Li5OyJcG_a6JRnDPCJBvIsMki-DrD4EgDzQ,35575
54
54
  dcs_sdk/sdk/rules/__init__.py,sha256=_BkKcE_jfdDQI_ECdOamJaefMKEXrKpYjPpnBQXl_Xs,657
55
55
  dcs_sdk/sdk/rules/rules_mappping.py,sha256=fxakVkf7B2cVkYSO946LTim_HmMsl6lBDBqZjTTsSPI,1292
56
56
  dcs_sdk/sdk/rules/rules_repository.py,sha256=x0Rli-wdnHAmXm5526go_qC3P-eFRt-4L7fs4hNqC-g,7564
@@ -65,7 +65,7 @@ dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py,sha256=puAWP
65
65
  dcs_sdk/sdk/utils/table.py,sha256=mXArkEYmxf7mz_DJvyHVHIN9RyxaF708I70n52JrptA,18268
66
66
  dcs_sdk/sdk/utils/themes.py,sha256=Meo2Yldv4uyPpEqI7qdA28Aa6sxtwUU1dLKKm4QavjM,1403
67
67
  dcs_sdk/sdk/utils/utils.py,sha256=vF2zAvgt__Y8limicWTEWRyn41SBVJN81ZCTBRy6hQg,11907
68
- dcs_sdk-1.4.8.dist-info/METADATA,sha256=csButDjidmwzDZrtJtyz0DT-KvisbwRxg2jUBuoMCv4,6221
69
- dcs_sdk-1.4.8.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
70
- dcs_sdk-1.4.8.dist-info/entry_points.txt,sha256=zQtrZL7YuaKtt6WPwihCTV1BRXnqBkaY6zUGdYJbBSg,49
71
- dcs_sdk-1.4.8.dist-info/RECORD,,
68
+ dcs_sdk-1.4.9.dist-info/METADATA,sha256=JUBldmhqJC8gu9G2eftTjkcf8rgWsPgsPQkk5tlkHWw,6221
69
+ dcs_sdk-1.4.9.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
70
+ dcs_sdk-1.4.9.dist-info/entry_points.txt,sha256=zQtrZL7YuaKtt6WPwihCTV1BRXnqBkaY6zUGdYJbBSg,49
71
+ dcs_sdk-1.4.9.dist-info/RECORD,,