dcs-sdk 1.4.8__tar.gz → 1.4.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/PKG-INFO +2 -2
  2. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/README.md +1 -1
  3. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/base.py +3 -0
  4. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/hashdiff_tables.py +1 -1
  5. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/table_segment.py +74 -10
  6. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/utils.py +3 -0
  7. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/__version__.py +1 -1
  8. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/data_diff/data_differ.py +11 -4
  9. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/pyproject.toml +1 -1
  10. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/__init__.py +0 -0
  11. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/__main__.py +0 -0
  12. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/abcs/__init__.py +0 -0
  13. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/abcs/compiler.py +0 -0
  14. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/abcs/database_types.py +0 -0
  15. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/config.py +0 -0
  16. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/__init__.py +0 -0
  17. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/_connect.py +0 -0
  18. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/bigquery.py +0 -0
  19. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/clickhouse.py +0 -0
  20. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/databricks.py +0 -0
  21. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/duckdb.py +0 -0
  22. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/mssql.py +0 -0
  23. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/mysql.py +0 -0
  24. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/oracle.py +0 -0
  25. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/postgresql.py +0 -0
  26. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/presto.py +0 -0
  27. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/redshift.py +0 -0
  28. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/snowflake.py +0 -0
  29. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/sybase.py +0 -0
  30. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/trino.py +0 -0
  31. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/databases/vertica.py +0 -0
  32. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/diff_tables.py +0 -0
  33. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/errors.py +0 -0
  34. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/format.py +0 -0
  35. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/info_tree.py +0 -0
  36. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/joindiff_tables.py +0 -0
  37. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/lexicographic_space.py +0 -0
  38. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/parse_time.py +0 -0
  39. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/py.typed +0 -0
  40. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/queries/__init__.py +0 -0
  41. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/queries/api.py +0 -0
  42. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/queries/ast_classes.py +0 -0
  43. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/queries/base.py +0 -0
  44. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/queries/extras.py +0 -0
  45. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/query_utils.py +0 -0
  46. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/schema.py +0 -0
  47. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/thread_utils.py +0 -0
  48. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/data_diff/version.py +0 -0
  49. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/__init__.py +0 -0
  50. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/__main__.py +0 -0
  51. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/cli/__init__.py +0 -0
  52. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/cli/cli.py +0 -0
  53. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/__init__.py +0 -0
  54. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/config/__init__.py +0 -0
  55. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/config/config_loader.py +0 -0
  56. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/data_diff/__init__.py +0 -0
  57. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/__init__.py +0 -0
  58. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/rules_mappping.py +0 -0
  59. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/rules_repository.py +0 -0
  60. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/rules/schema_rules.py +0 -0
  61. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/__init__.py +0 -0
  62. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/serializer.py +0 -0
  63. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/__init__.py +0 -0
  64. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/base_provider.py +0 -0
  65. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +0 -0
  66. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +0 -0
  67. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +0 -0
  68. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/table.py +0 -0
  69. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/themes.py +0 -0
  70. {dcs_sdk-1.4.8 → dcs_sdk-1.4.9}/dcs_sdk/sdk/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dcs-sdk
3
- Version: 1.4.8
3
+ Version: 1.4.9
4
4
  Summary: SDK for DataChecks
5
5
  Author: Waterdip Labs
6
6
  Author-email: hello@waterdip.ai
@@ -60,7 +60,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
60
60
  Description-Content-Type: text/markdown
61
61
 
62
62
  <h1 align="center">
63
- DCS SDK v1.4.8
63
+ DCS SDK v1.4.9
64
64
  </h1>
65
65
 
66
66
  > SDK for DataChecks
@@ -1,5 +1,5 @@
1
1
  <h1 align="center">
2
- DCS SDK v1.4.8
2
+ DCS SDK v1.4.9
3
3
  </h1>
4
4
 
5
5
  > SDK for DataChecks
@@ -54,6 +54,7 @@ from typing_extensions import Self
54
54
  from data_diff.abcs.compiler import AbstractCompiler, Compilable
55
55
  from data_diff.abcs.database_types import (
56
56
  JSON,
57
+ ArithAlphanumeric,
57
58
  Array,
58
59
  Boolean,
59
60
  ColType,
@@ -753,6 +754,8 @@ class BaseDialect(abc.ABC):
753
754
  return f"'{v.decode()}'"
754
755
  elif isinstance(v, Code):
755
756
  return v.code
757
+ elif isinstance(v, ArithAlphanumeric):
758
+ return f"'{v._str}'"
756
759
  return repr(v)
757
760
 
758
761
  def constant_values(self, rows) -> str:
@@ -522,7 +522,7 @@ class HashDiffer(TableDiffer):
522
522
 
523
523
  if count1 == 0 and count2 == 0:
524
524
  logger.debug(
525
- "Uneven distribution of keys detected in segment %s..%s (big gaps in the key column). "
525
+ "Uneven distribution of keys detected in segment {}..{} (big gaps in the key column). "
526
526
  "For better performance, we recommend to increase the bisection-threshold.",
527
527
  table1.min_key,
528
528
  table1.max_key,
@@ -14,6 +14,7 @@
14
14
 
15
15
  import logging
16
16
  import time
17
+ from decimal import Decimal
17
18
  from itertools import product
18
19
  from typing import Container, Dict, List, Optional, Sequence, Tuple
19
20
 
@@ -24,7 +25,18 @@ from typing_extensions import Self
24
25
 
25
26
  from data_diff.abcs.database_types import DbKey, DbPath, DbTime, IKey, NumericType
26
27
  from data_diff.databases.base import Database
27
- from data_diff.queries.api import SKIP, Code, Count, Expr, max_, min_, table, this
28
+ from data_diff.queries.api import (
29
+ SKIP,
30
+ Code,
31
+ Count,
32
+ Expr,
33
+ and_,
34
+ max_,
35
+ min_,
36
+ or_,
37
+ table,
38
+ this,
39
+ )
28
40
  from data_diff.queries.extras import (
29
41
  ApplyFuncAndNormalizeAsString,
30
42
  Checksum,
@@ -54,6 +66,10 @@ def split_key_space(min_key: DbKey, max_key: DbKey, count: int) -> List[DbKey]:
54
66
  assert type(min_key) is type(max_key)
55
67
  checkpoints = min_key.range(max_key, count)
56
68
  else:
69
+ if isinstance(min_key, Decimal):
70
+ min_key = float(min_key)
71
+ if isinstance(max_key, Decimal):
72
+ max_key = float(max_key)
57
73
  checkpoints = split_space(min_key, max_key, count)
58
74
 
59
75
  assert all(min_key < x < max_key for x in checkpoints)
@@ -288,17 +304,65 @@ class TableSegment:
288
304
 
289
305
  return result
290
306
 
291
- def get_sample_data(self, limit: int = 100) -> list:
292
- "Download all the relevant values of the segment from the database"
307
+ # def get_sample_data(self, limit: int = 100) -> list:
308
+ # "Download all the relevant values of the segment from the database"
309
+
310
+ # exprs = []
311
+ # for c in self.key_columns:
312
+ # quoted = self.database.dialect.quote(c)
313
+ # exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
314
+ # if self.where:
315
+ # select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
316
+ # self.key_columns
317
+ # else:
318
+ # select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
319
+
320
+ # start_time = time.monotonic()
321
+ # result = self.database.query(select, List[Tuple])
322
+ # query_time_ms = (time.monotonic() - start_time) * 1000
323
+ # self._update_stats("row_fetch_queries_stats", query_time_ms)
324
+
325
+ def get_sample_data(self, limit: int = 100, sample_keys: Optional[List[List[DbKey]]] = None) -> list:
326
+ """
327
+ Download relevant values of the segment from the database.
328
+ If `sample_keys` is provided, it filters rows matching those composite keys.
329
+
330
+ Parameters:
331
+ limit (int): Maximum number of rows to return (default: 100).
332
+ sample_keys (Optional[List[List[DbKey]]]): List of composite keys to filter rows.
333
+ Each inner list must match the number of key_columns.
334
+
335
+ Returns:
336
+ list: List of tuples containing the queried row data.
337
+ """
338
+ select = self.make_select().select(*self._relevant_columns_repr)
293
339
 
294
- exprs = []
295
- for c in self.key_columns:
296
- quoted = self.database.dialect.quote(c)
297
- exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
298
- if self.where:
299
- select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
340
+ filters = []
341
+
342
+ if sample_keys:
343
+ key_exprs = []
344
+ for key_values in sample_keys:
345
+ and_exprs = []
346
+ for col, val in safezip(self.key_columns, key_values):
347
+ quoted = self.database.dialect.quote(col)
348
+ schema = self._schema[col]
349
+ if val is None:
350
+ and_exprs.append(Code(quoted + " IS NULL"))
351
+ continue
352
+ mk_v = schema.make_value(val)
353
+ constant_val = self.database.dialect._constant_value(mk_v)
354
+ where_expr = f"{quoted} = {constant_val}"
355
+ and_exprs.append(Code(where_expr))
356
+ if and_exprs:
357
+ key_exprs.append(and_(*and_exprs))
358
+ if key_exprs:
359
+ filters.append(or_(*key_exprs))
360
+ if filters or self.where:
361
+ select = select.where(*filters)
300
362
  else:
301
- select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
363
+ logger.warning("No filters applied; fetching up to {} rows without key restrictions", limit)
364
+
365
+ select = select.limit(limit)
302
366
 
303
367
  start_time = time.monotonic()
304
368
  result = self.database.query(select, List[Tuple])
@@ -482,6 +482,9 @@ def number_to_human(n):
482
482
 
483
483
 
484
484
  def split_space(start, end, count) -> List[int]:
485
+ if isinstance(start, float) or isinstance(end, float):
486
+ step = (end - start) / (count + 1)
487
+ return [start + step * i for i in range(1, count + 1)]
485
488
  size = end - start
486
489
  assert count <= size, (count, size)
487
490
  return list(range(start, end, (size + 1) // (count + 1)))[1 : count + 1]
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.4.8"
15
+ __version__ = "1.4.9"
@@ -395,8 +395,10 @@ class DBTableDiffer:
395
395
  error_message = f"Target table '{target_dataset.get('table_name')}' is empty"
396
396
  is_table_empty = True
397
397
  if not is_table_empty and not self.config.schema_diff:
398
+ pks_len = len(self.table1.key_columns)
398
399
  table_1_sample_data = self.table1.with_schema().get_sample_data(limit=100)
399
- table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100)
400
+ sample_keys = [list(row[:pks_len]) for row in table_1_sample_data]
401
+ table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100, sample_keys=sample_keys)
400
402
  self.diff_iter = diff_tables(
401
403
  self.table1,
402
404
  self.table2,
@@ -598,9 +600,14 @@ class DBTableDiffer:
598
600
  columns_order_wise=columns_order_wise_target if columns_order_wise_target else [],
599
601
  )
600
602
 
601
- sample_value_column_names = list(self.table1.key_columns) + list(self.table1.extra_columns)
602
- sample_value_source_dicts = [dict(zip(sample_value_column_names, row)) for row in table_1_sample_data]
603
- sample_value_target_dicts = [dict(zip(sample_value_column_names, row)) for row in table_2_sample_data]
603
+ sample_value_column_names_src = list(self.table1.key_columns) + list(self.table1.extra_columns)
604
+ sample_value_column_names_tgt = list(self.table2.key_columns) + list(self.table2.extra_columns)
605
+ sample_value_source_dicts = [
606
+ dict(zip(sample_value_column_names_src, row)) for row in table_1_sample_data
607
+ ]
608
+ sample_value_target_dicts = [
609
+ dict(zip(sample_value_column_names_tgt, row)) for row in table_2_sample_data
610
+ ]
604
611
 
605
612
  def get_pk(row, key_columns):
606
613
  return tuple(row[k] for k in key_columns)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dcs-sdk"
3
- version = "1.4.8"
3
+ version = "1.4.9"
4
4
  description = "SDK for DataChecks"
5
5
  authors = ["Waterdip Labs <hello@waterdip.ai>"]
6
6
  readme = "README.md"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes