dcs-sdk 1.4.8__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/PKG-INFO +2 -2
  2. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/README.md +1 -1
  3. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/abcs/database_types.py +76 -28
  4. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/base.py +18 -0
  5. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/sybase.py +7 -1
  6. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/hashdiff_tables.py +1 -1
  7. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/table_segment.py +83 -11
  8. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/utils.py +245 -21
  9. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/__version__.py +1 -1
  10. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/data_diff/data_differ.py +13 -6
  11. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/pyproject.toml +1 -1
  12. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/__init__.py +0 -0
  13. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/__main__.py +0 -0
  14. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/abcs/__init__.py +0 -0
  15. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/abcs/compiler.py +0 -0
  16. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/config.py +0 -0
  17. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/__init__.py +0 -0
  18. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/_connect.py +0 -0
  19. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/bigquery.py +0 -0
  20. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/clickhouse.py +0 -0
  21. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/databricks.py +0 -0
  22. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/duckdb.py +0 -0
  23. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/mssql.py +0 -0
  24. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/mysql.py +0 -0
  25. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/oracle.py +0 -0
  26. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/postgresql.py +0 -0
  27. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/presto.py +0 -0
  28. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/redshift.py +0 -0
  29. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/snowflake.py +0 -0
  30. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/trino.py +0 -0
  31. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/databases/vertica.py +0 -0
  32. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/diff_tables.py +0 -0
  33. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/errors.py +0 -0
  34. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/format.py +0 -0
  35. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/info_tree.py +0 -0
  36. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/joindiff_tables.py +0 -0
  37. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/lexicographic_space.py +0 -0
  38. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/parse_time.py +0 -0
  39. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/py.typed +0 -0
  40. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/queries/__init__.py +0 -0
  41. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/queries/api.py +0 -0
  42. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/queries/ast_classes.py +0 -0
  43. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/queries/base.py +0 -0
  44. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/queries/extras.py +0 -0
  45. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/query_utils.py +0 -0
  46. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/schema.py +0 -0
  47. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/thread_utils.py +0 -0
  48. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/data_diff/version.py +0 -0
  49. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/__init__.py +0 -0
  50. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/__main__.py +0 -0
  51. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/cli/__init__.py +0 -0
  52. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/cli/cli.py +0 -0
  53. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/__init__.py +0 -0
  54. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/config/__init__.py +0 -0
  55. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/config/config_loader.py +0 -0
  56. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/data_diff/__init__.py +0 -0
  57. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/rules/__init__.py +0 -0
  58. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/rules/rules_mappping.py +0 -0
  59. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/rules/rules_repository.py +0 -0
  60. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/rules/schema_rules.py +0 -0
  61. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/utils/__init__.py +0 -0
  62. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/utils/serializer.py +0 -0
  63. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/utils/similarity_score/__init__.py +0 -0
  64. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/utils/similarity_score/base_provider.py +0 -0
  65. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +0 -0
  66. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +0 -0
  67. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +0 -0
  68. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/utils/table.py +0 -0
  69. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/utils/themes.py +0 -0
  70. {dcs_sdk-1.4.8 → dcs_sdk-1.5.0}/dcs_sdk/sdk/utils/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: dcs-sdk
3
- Version: 1.4.8
3
+ Version: 1.5.0
4
4
  Summary: SDK for DataChecks
5
5
  Author: Waterdip Labs
6
6
  Author-email: hello@waterdip.ai
@@ -60,7 +60,7 @@ Requires-Dist: vertica-python (>=1.4.0) ; extra == "vertica" or extra == "all-db
60
60
  Description-Content-Type: text/markdown
61
61
 
62
62
  <h1 align="center">
63
- DCS SDK v1.4.8
63
+ DCS SDK v1.5.0
64
64
  </h1>
65
65
 
66
66
  > SDK for DataChecks
@@ -1,5 +1,5 @@
1
1
  <h1 align="center">
2
- DCS SDK v1.4.8
2
+ DCS SDK v1.5.0
3
3
  </h1>
4
4
 
5
5
  > SDK for DataChecks
@@ -14,15 +14,35 @@
14
14
 
15
15
  import decimal
16
16
  from abc import ABC, abstractmethod
17
- from datetime import datetime
17
+ from datetime import date, datetime
18
18
  from typing import Collection, List, Optional, Tuple, Type, TypeVar, Union
19
19
 
20
20
  import attrs
21
21
 
22
- from data_diff.utils import ArithAlphanumeric, ArithUnicodeString, ArithUUID, Unknown
22
+ from data_diff.utils import (
23
+ ArithAlphanumeric,
24
+ ArithDate,
25
+ ArithDateTime,
26
+ ArithTimestamp,
27
+ ArithTimestampTZ,
28
+ ArithUnicodeString,
29
+ ArithUUID,
30
+ Unknown,
31
+ )
23
32
 
24
33
  DbPath = Tuple[str, ...]
25
- DbKey = Union[int, str, bytes, ArithUUID, ArithAlphanumeric, ArithUnicodeString]
34
+ DbKey = Union[
35
+ int,
36
+ str,
37
+ bytes,
38
+ ArithUUID,
39
+ ArithAlphanumeric,
40
+ ArithUnicodeString,
41
+ ArithDateTime,
42
+ ArithDate,
43
+ ArithTimestamp,
44
+ ArithTimestampTZ,
45
+ ]
26
46
  DbTime = datetime
27
47
 
28
48
  N = TypeVar("N")
@@ -150,23 +170,66 @@ class TemporalType(PrecisionType):
150
170
 
151
171
 
152
172
  @attrs.define(frozen=True)
153
- class Timestamp(TemporalType):
154
- pass
173
class IKey(ABC):
    """Interface for ColType, for using a column as a key in table."""

    @property
    @abstractmethod
    def python_type(self) -> type:
        """Return the equivalent Python type of the key."""

    def make_value(self, value):
        """Coerce ``value`` to ``python_type``.

        A value that is already an instance of ``python_type`` is returned
        unchanged; anything else is passed to the ``python_type`` constructor.
        """
        already_typed = isinstance(value, self.python_type)
        return value if already_typed else self.python_type(value)
155
185
 
156
186
 
157
187
  @attrs.define(frozen=True)
158
- class TimestampTZ(TemporalType):
159
- pass
188
class Timestamp(TemporalType, IKey):
    """Timestamp column type that can be used as a table key."""

    @property
    def python_type(self) -> type:
        """Python type used for key values of this column: ``ArithTimestamp``."""
        return ArithTimestamp

    def make_value(self, value):
        """Return ``value`` as an ``ArithTimestamp``, wrapping it only when it is not one already."""
        return value if isinstance(value, ArithTimestamp) else ArithTimestamp(value)
160
197
 
161
198
 
162
199
  @attrs.define(frozen=True)
163
- class Datetime(TemporalType):
164
- pass
200
class TimestampTZ(TemporalType, IKey):
    """Timezone-carrying timestamp column type that can be used as a table key."""

    @property
    def python_type(self) -> type:
        """Python type used for key values of this column: ``ArithTimestampTZ``."""
        return ArithTimestampTZ

    def make_value(self, value):
        """Return ``value`` as an ``ArithTimestampTZ``, wrapping it only when it is not one already."""
        return value if isinstance(value, ArithTimestampTZ) else ArithTimestampTZ(value)
165
209
 
166
210
 
167
211
  @attrs.define(frozen=True)
168
- class Date(TemporalType):
169
- pass
212
class Datetime(TemporalType, IKey):
    """Datetime column type that can be used as a table key."""

    @property
    def python_type(self) -> type:
        """Python type used for key values of this column: ``ArithDateTime``."""
        return ArithDateTime

    def make_value(self, value):
        """Return ``value`` as an ``ArithDateTime``, wrapping it only when it is not one already."""
        return value if isinstance(value, ArithDateTime) else ArithDateTime(value)
221
+
222
+
223
@attrs.define(frozen=True)
class Date(TemporalType, IKey):
    """Date column type that can be used as a table key."""

    @property
    def python_type(self) -> type:
        """Python type used for key values of this column: ``ArithDate``."""
        return ArithDate

    def make_value(self, value):
        """Return ``value`` as an ``ArithDate``, wrapping it only when it is not one already."""
        return value if isinstance(value, ArithDate) else ArithDate(value)
170
233
 
171
234
 
172
235
  @attrs.define(frozen=True)
@@ -190,21 +253,6 @@ class Float(FractionalType):
190
253
  python_type = float
191
254
 
192
255
 
193
- @attrs.define(frozen=True)
194
- class IKey(ABC):
195
- "Interface for ColType, for using a column as a key in table."
196
-
197
- @property
198
- @abstractmethod
199
- def python_type(self) -> type:
200
- "Return the equivalent Python type of the key"
201
-
202
- def make_value(self, value):
203
- if isinstance(value, self.python_type):
204
- return value
205
- return self.python_type(value)
206
-
207
-
208
256
  @attrs.define(frozen=True)
209
257
  class Decimal(FractionalType, IKey): # Snowflake may use Decimal as a key
210
258
  @property
@@ -243,7 +291,7 @@ class ColType_UUID(ColType, IKey):
243
291
 
244
292
  @attrs.define(frozen=True)
245
293
  class ColType_Alphanum(ColType, IKey):
246
- python_type = ArithAlphanumeric
294
+ python_type = ArithUnicodeString
247
295
 
248
296
 
249
297
  @attrs.define(frozen=True)
@@ -273,7 +321,7 @@ class String_Alphanum(ColType_Alphanum, StringType):
273
321
  @staticmethod
274
322
  def test_value(value: str) -> bool:
275
323
  try:
276
- ArithAlphanumeric(value)
324
+ ArithUnicodeString(value)
277
325
  return True
278
326
  except ValueError:
279
327
  return False
@@ -54,6 +54,8 @@ from typing_extensions import Self
54
54
  from data_diff.abcs.compiler import AbstractCompiler, Compilable
55
55
  from data_diff.abcs.database_types import (
56
56
  JSON,
57
+ ArithAlphanumeric,
58
+ ArithUnicodeString,
57
59
  Array,
58
60
  Boolean,
59
61
  ColType,
@@ -114,7 +116,11 @@ from data_diff.queries.extras import (
114
116
  )
115
117
  from data_diff.schema import RawColumnInfo
116
118
  from data_diff.utils import (
119
+ ArithDate,
120
+ ArithDateTime,
117
121
  ArithString,
122
+ ArithTimestamp,
123
+ ArithTimestampTZ,
118
124
  ArithUUID,
119
125
  SybaseDriverTypes,
120
126
  is_uuid,
@@ -753,6 +759,18 @@ class BaseDialect(abc.ABC):
753
759
  return f"'{v.decode()}'"
754
760
  elif isinstance(v, Code):
755
761
  return v.code
762
+ elif isinstance(v, ArithAlphanumeric):
763
+ return f"'{v._str}'"
764
+ elif isinstance(v, ArithUnicodeString):
765
+ return f"'{v._str}'"
766
+ elif isinstance(v, ArithDate):
767
+ return f"'{str(v)}'"
768
+ elif isinstance(v, ArithTimestamp):
769
+ return f"'{str(v)}'"
770
+ elif isinstance(v, ArithTimestampTZ):
771
+ return f"'{str(v)}'"
772
+ elif isinstance(v, ArithDateTime):
773
+ return f"'{str(v)}'"
756
774
  return repr(v)
757
775
 
758
776
  def constant_values(self, rows) -> str:
@@ -389,7 +389,7 @@ class Sybase(ThreadedDatabase):
389
389
  username = self._args.get("user", None)
390
390
  password = self._args.get("password", None)
391
391
  driver = self._args.get("driver", None)
392
- max_query_timeout = 60 * 60 # 3600 seconds
392
+ max_query_timeout = 60 * 5 # 300 seconds
393
393
 
394
394
  if self.dialect.sybase_driver_type.is_freetds:
395
395
  conn_dict = {
@@ -583,7 +583,13 @@ class Sybase(ThreadedDatabase):
583
583
  self.dialect.query_config_for_free_tds["ase_query_chosen"] = True
584
584
  return ase_query
585
585
  else:
586
+ max_temp_space_usage_query = "SET TEMPORARY OPTION MAX_TEMP_SPACE_PER_CONNECTION = 5120"
587
+ if self._query_cursor(self._conn.cursor(), max_temp_space_usage_query, test_query=True):
588
+ logger.info("Max temporary space usage set successfully.")
589
+ else:
590
+ logger.warning("Failed to set max temporary space usage, continuing with default settings.")
586
591
  logger.info("Sybase IQ Detected")
592
+
587
593
  self.dialect.query_config_for_free_tds["freetds_query_chosen"] = True
588
594
  return iq_query
589
595
  except Exception as e:
@@ -522,7 +522,7 @@ class HashDiffer(TableDiffer):
522
522
 
523
523
  if count1 == 0 and count2 == 0:
524
524
  logger.debug(
525
- "Uneven distribution of keys detected in segment %s..%s (big gaps in the key column). "
525
+ "Uneven distribution of keys detected in segment {}..{} (big gaps in the key column). "
526
526
  "For better performance, we recommend to increase the bisection-threshold.",
527
527
  table1.min_key,
528
528
  table1.max_key,
@@ -14,6 +14,7 @@
14
14
 
15
15
  import logging
16
16
  import time
17
+ from decimal import Decimal
17
18
  from itertools import product
18
19
  from typing import Container, Dict, List, Optional, Sequence, Tuple
19
20
 
@@ -24,7 +25,18 @@ from typing_extensions import Self
24
25
 
25
26
  from data_diff.abcs.database_types import DbKey, DbPath, DbTime, IKey, NumericType
26
27
  from data_diff.databases.base import Database
27
- from data_diff.queries.api import SKIP, Code, Count, Expr, max_, min_, table, this
28
+ from data_diff.queries.api import (
29
+ SKIP,
30
+ Code,
31
+ Count,
32
+ Expr,
33
+ and_,
34
+ max_,
35
+ min_,
36
+ or_,
37
+ table,
38
+ this,
39
+ )
28
40
  from data_diff.queries.extras import (
29
41
  ApplyFuncAndNormalizeAsString,
30
42
  Checksum,
@@ -32,7 +44,11 @@ from data_diff.queries.extras import (
32
44
  )
33
45
  from data_diff.schema import RawColumnInfo, Schema, create_schema
34
46
  from data_diff.utils import (
47
+ ArithDate,
48
+ ArithDateTime,
35
49
  ArithString,
50
+ ArithTimestamp,
51
+ ArithTimestampTZ,
36
52
  ArithUnicodeString,
37
53
  Vector,
38
54
  safezip,
@@ -50,10 +66,18 @@ def split_key_space(min_key: DbKey, max_key: DbKey, count: int) -> List[DbKey]:
50
66
  if max_key - min_key <= count:
51
67
  count = 1
52
68
 
53
- if isinstance(min_key, ArithString) or isinstance(min_key, ArithUnicodeString):
69
+ # Handle arithmetic string types (including temporal types)
70
+ if isinstance(
71
+ min_key, (ArithString, ArithUnicodeString, ArithDateTime, ArithDate, ArithTimestamp, ArithTimestampTZ)
72
+ ):
54
73
  assert type(min_key) is type(max_key)
55
74
  checkpoints = min_key.range(max_key, count)
56
75
  else:
76
+ # Handle numeric types
77
+ if isinstance(min_key, Decimal):
78
+ min_key = float(min_key)
79
+ if isinstance(max_key, Decimal):
80
+ max_key = float(max_key)
57
81
  checkpoints = split_space(min_key, max_key, count)
58
82
 
59
83
  assert all(min_key < x < max_key for x in checkpoints)
@@ -288,17 +312,65 @@ class TableSegment:
288
312
 
289
313
  return result
290
314
 
291
- def get_sample_data(self, limit: int = 100) -> list:
292
- "Download all the relevant values of the segment from the database"
315
+ # def get_sample_data(self, limit: int = 100) -> list:
316
+ # "Download all the relevant values of the segment from the database"
317
+
318
+ # exprs = []
319
+ # for c in self.key_columns:
320
+ # quoted = self.database.dialect.quote(c)
321
+ # exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
322
+ # if self.where:
323
+ # select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
324
+ # self.key_columns
325
+ # else:
326
+ # select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
327
+
328
+ # start_time = time.monotonic()
329
+ # result = self.database.query(select, List[Tuple])
330
+ # query_time_ms = (time.monotonic() - start_time) * 1000
331
+ # self._update_stats("row_fetch_queries_stats", query_time_ms)
332
+
333
+ def get_sample_data(self, limit: int = 100, sample_keys: Optional[List[List[DbKey]]] = None) -> list:
334
+ """
335
+ Download relevant values of the segment from the database.
336
+ If `sample_keys` is provided, it filters rows matching those composite keys.
337
+
338
+ Parameters:
339
+ limit (int): Maximum number of rows to return (default: 100).
340
+ sample_keys (Optional[List[List[DbKey]]]): List of composite keys to filter rows.
341
+ Each inner list must match the number of key_columns.
342
+
343
+ Returns:
344
+ list: List of tuples containing the queried row data.
345
+ """
346
+ select = self.make_select().select(*self._relevant_columns_repr)
293
347
 
294
- exprs = []
295
- for c in self.key_columns:
296
- quoted = self.database.dialect.quote(c)
297
- exprs.append(NormalizeAsString(Code(quoted), self._schema[c]))
298
- if self.where:
299
- select = self.source_table.select(*self._relevant_columns_repr).where(Code(self._where())).limit(limit)
348
+ filters = []
349
+
350
+ if sample_keys:
351
+ key_exprs = []
352
+ for key_values in sample_keys:
353
+ and_exprs = []
354
+ for col, val in safezip(self.key_columns, key_values):
355
+ quoted = self.database.dialect.quote(col)
356
+ schema = self._schema[col]
357
+ if val is None:
358
+ and_exprs.append(Code(quoted + " IS NULL"))
359
+ continue
360
+ mk_v = schema.make_value(val)
361
+ constant_val = self.database.dialect._constant_value(mk_v)
362
+ where_expr = f"{quoted} = {constant_val}"
363
+ and_exprs.append(Code(where_expr))
364
+ if and_exprs:
365
+ key_exprs.append(and_(*and_exprs))
366
+ if key_exprs:
367
+ filters.append(or_(*key_exprs))
368
+ if filters or self.where:
369
+ select = select.where(*filters)
300
370
  else:
301
- select = self.source_table.select(*self._relevant_columns_repr).limit(limit)
371
+ logger.warning("No filters applied; fetching up to {} rows without key restrictions", limit)
372
+
373
+ select = select.limit(limit)
302
374
 
303
375
  start_time = time.monotonic()
304
376
  result = self.database.query(select, List[Tuple])
@@ -21,7 +21,7 @@ import string
21
21
  import threading
22
22
  from abc import abstractmethod
23
23
  from dataclasses import dataclass
24
- from datetime import datetime
24
+ from datetime import date, datetime, time
25
25
  from typing import (
26
26
  Any,
27
27
  Dict,
@@ -179,11 +179,224 @@ def _any_to_uuid(v: Union[str, int, UUID, "ArithUUID"]) -> UUID:
179
179
  raise ValueError(f"Cannot convert a value to UUID: {v!r}")
180
180
 
181
181
 
182
def _any_to_datetime(v: Union[str, datetime, date, "ArithDateTime"]) -> datetime:
    """Convert a supported value into a ``datetime``.

    Accepted inputs:
      * ``ArithDateTime`` -- its wrapped ``_dt`` is returned.
      * ``datetime`` -- returned as is.
      * ``date`` -- combined with midnight (``time.min``).
      * ``str`` -- parsed as ISO-8601 (a ``Z`` is rewritten to ``+00:00``
        first), then as ``%Y-%m-%d %H:%M:%S``, then as ``%Y-%m-%d``.

    Raises:
        ValueError: if the string matches none of the formats, or the value
            is of an unsupported type.
    """
    if isinstance(v, ArithDateTime):
        return v._dt
    if isinstance(v, datetime):
        return v
    if isinstance(v, date):
        return datetime.combine(v, time.min)
    if not isinstance(v, str):
        raise ValueError(f"Cannot convert value to datetime: {v!r}")

    # ISO-8601 first; the explicit strptime patterns are fallbacks.
    try:
        return datetime.fromisoformat(v.replace("Z", "+00:00"))
    except ValueError:
        pass

    for pattern in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
        try:
            return datetime.strptime(v, pattern)
        except ValueError:
            continue

    raise ValueError(f"Cannot parse datetime string: {v!r}")
205
+
206
+
207
+ def _any_to_date(v: Union[str, datetime, date, "ArithDate"]) -> date:
208
+ """Convert various types to date object."""
209
+ if isinstance(v, ArithDate):
210
+ return v._date
211
+ elif isinstance(v, date):
212
+ return v
213
+ elif isinstance(v, datetime):
214
+ return v.date()
215
+ elif isinstance(v, str):
216
+ try:
217
+ return datetime.fromisoformat(v.replace("Z", "+00:00")).date()
218
+ except ValueError:
219
+ try:
220
+ return datetime.strptime(v, "%Y-%m-%d").date()
221
+ except ValueError:
222
+ raise ValueError(f"Cannot parse date string: {v!r}")
223
+ else:
224
+ raise ValueError(f"Cannot convert value to date: {v!r}")
225
+
226
+
227
@attrs.define(frozen=True, eq=False, order=False)
class ArithDateTime(ArithString):
    """A datetime that supports basic arithmetic and range operations for database diffing.

    Arithmetic is done on POSIX timestamps (seconds): adding or subtracting a
    number shifts the value by that many seconds, and subtracting another
    ``ArithDateTime`` yields the difference in seconds. Comparisons and
    hashing delegate to the wrapped ``datetime``.
    """

    # Wrapped value; the converter accepts str / date / datetime / ArithDateTime.
    _dt: datetime = attrs.field(converter=_any_to_datetime)

    def _from_timestamp(self, ts: float) -> Self:
        """Build a new instance of the same class from POSIX seconds.

        The tzinfo of the wrapped datetime is passed through. Without it,
        ``datetime.fromtimestamp`` returns a naive local-time value, which
        cannot be ordered against a timezone-aware original. For a naive
        ``_dt`` the tzinfo is ``None`` and the result is naive local time.
        """
        return self.new(datetime.fromtimestamp(ts, tz=self._dt.tzinfo))

    def range(self, other: "ArithDateTime", count: int) -> List[Self]:
        """Generate evenly spaced datetime checkpoints between self and other."""
        assert isinstance(other, ArithDateTime)

        start_ts = self._dt.timestamp()
        end_ts = other._dt.timestamp()

        checkpoints = split_space(start_ts, end_ts, count)
        return [self._from_timestamp(ts) for ts in checkpoints]

    def __int__(self) -> int:
        """Convert to timestamp (whole seconds) for arithmetic operations."""
        return int(self._dt.timestamp())

    def __add__(self, other: Union[int, float]) -> Self:
        """Add seconds to the datetime."""
        if isinstance(other, (int, float)):
            return self._from_timestamp(self._dt.timestamp() + other)
        return NotImplemented

    def __sub__(self, other: Union["ArithDateTime", int, float]):
        """Subtract seconds (returns a new instance) or another datetime (returns seconds as float)."""
        if isinstance(other, (int, float)):
            return self._from_timestamp(self._dt.timestamp() - other)
        elif isinstance(other, ArithDateTime):
            return self._dt.timestamp() - other._dt.timestamp()
        return NotImplemented

    def __eq__(self, other: object) -> bool:
        if isinstance(other, ArithDateTime):
            return self._dt == other._dt
        return NotImplemented

    def __ne__(self, other: object) -> bool:
        if isinstance(other, ArithDateTime):
            return self._dt != other._dt
        return NotImplemented

    def __hash__(self) -> int:
        # Defining __eq__ in the class body would otherwise set __hash__ to
        # None; hash the same field that equality compares.
        return hash(self._dt)

    def __gt__(self, other: object) -> bool:
        if isinstance(other, ArithDateTime):
            return self._dt > other._dt
        return NotImplemented

    def __lt__(self, other: object) -> bool:
        if isinstance(other, ArithDateTime):
            return self._dt < other._dt
        return NotImplemented

    def __ge__(self, other: object) -> bool:
        if isinstance(other, ArithDateTime):
            return self._dt >= other._dt
        return NotImplemented

    def __le__(self, other: object) -> bool:
        if isinstance(other, ArithDateTime):
            return self._dt <= other._dt
        return NotImplemented

    def __str__(self) -> str:
        """Return ISO format string."""
        return self._dt.isoformat()

    def __repr__(self) -> str:
        return f"ArithDateTime({self._dt!r})"
299
+
300
+
301
@attrs.define(frozen=True, eq=False, order=False)
class ArithDate(ArithString):
    """A date that supports basic arithmetic and range operations for database diffing.

    Arithmetic is done on proleptic Gregorian ordinals (days): adding or
    subtracting an ``int`` shifts the value by that many days, and
    subtracting another ``ArithDate`` yields the difference in days.
    Comparisons and hashing delegate to the wrapped ``date``.
    """

    # Wrapped value; the converter accepts str / date / datetime / ArithDate.
    _date: date = attrs.field(converter=_any_to_date)

    def range(self, other: "ArithDate", count: int) -> List[Self]:
        """Generate evenly spaced date checkpoints between self and other."""
        assert isinstance(other, ArithDate)

        start_ordinal = self._date.toordinal()
        end_ordinal = other._date.toordinal()

        checkpoints = split_space(start_ordinal, end_ordinal, count)
        return [self.new(date.fromordinal(int(ordinal))) for ordinal in checkpoints]

    def __int__(self) -> int:
        """Convert to ordinal for arithmetic operations."""
        return self._date.toordinal()

    def __add__(self, other: int) -> Self:
        """Add days to the date."""
        if isinstance(other, int):
            new_ordinal = self._date.toordinal() + other
            return self.new(date.fromordinal(new_ordinal))
        return NotImplemented

    def __sub__(self, other: Union["ArithDate", int]):
        """Subtract days (returns a new instance) or another date (returns days as int)."""
        if isinstance(other, int):
            new_ordinal = self._date.toordinal() - other
            return self.new(date.fromordinal(new_ordinal))
        elif isinstance(other, ArithDate):
            return self._date.toordinal() - other._date.toordinal()
        return NotImplemented

    def __eq__(self, other: object) -> bool:
        if isinstance(other, ArithDate):
            return self._date == other._date
        return NotImplemented

    def __ne__(self, other: object) -> bool:
        if isinstance(other, ArithDate):
            return self._date != other._date
        return NotImplemented

    def __hash__(self) -> int:
        # Defining __eq__ in the class body would otherwise set __hash__ to
        # None; hash the same field that equality compares.
        return hash(self._date)

    def __gt__(self, other: object) -> bool:
        if isinstance(other, ArithDate):
            return self._date > other._date
        return NotImplemented

    def __lt__(self, other: object) -> bool:
        if isinstance(other, ArithDate):
            return self._date < other._date
        return NotImplemented

    def __ge__(self, other: object) -> bool:
        if isinstance(other, ArithDate):
            return self._date >= other._date
        return NotImplemented

    def __le__(self, other: object) -> bool:
        if isinstance(other, ArithDate):
            return self._date <= other._date
        return NotImplemented

    def __str__(self) -> str:
        """Return ISO format date string."""
        return self._date.isoformat()

    def __repr__(self) -> str:
        return f"ArithDate({self._date!r})"
373
+
374
+
375
@attrs.define(frozen=True, eq=False, order=False)
class ArithTimestamp(ArithDateTime):
    """ArithDateTime variant used for timestamp keys; only the repr differs from the parent."""

    def __repr__(self) -> str:
        wrapped = self._dt
        return f"ArithTimestamp({wrapped!r})"
381
+
382
+
383
@attrs.define(frozen=True, eq=False, order=False)
class ArithTimestampTZ(ArithDateTime):
    """ArithDateTime variant used for timezone-aware timestamp keys."""

    def __repr__(self) -> str:
        wrapped = self._dt
        return f"ArithTimestampTZ({wrapped!r})"

    def __str__(self) -> str:
        """Return the ISO format string of the wrapped datetime (offset included when it is aware)."""
        iso_text = self._dt.isoformat()
        return iso_text
393
+
394
+
182
395
  @attrs.define(frozen=True, eq=False, order=False)
183
396
  class ArithUnicodeString(ArithString):
184
397
  """A Unicode string for arbitrary text keys, supporting lexicographical ordering and checkpoint generation across databases."""
185
398
 
186
- string: str = attrs.field(converter=str)
399
+ _str: str = attrs.field(converter=str)
187
400
 
188
401
  @staticmethod
189
402
  def split_space(start: int, end: int, count: int) -> List[int]:
@@ -197,10 +410,10 @@ class ArithUnicodeString(ArithString):
197
410
 
198
411
  def _str_to_int(self) -> int:
199
412
  """Convert string to an integer for interpolation, handling empty strings and Unicode."""
200
- if not self.string:
413
+ if not self._str:
201
414
  return 0 # Handle empty string
202
415
  result = 0
203
- for char in self.string:
416
+ for char in self._str:
204
417
  result = result * 256 + ord(char)
205
418
  return result
206
419
 
@@ -220,9 +433,9 @@ class ArithUnicodeString(ArithString):
220
433
  assert isinstance(other, ArithUnicodeString), "Other must be an ArithUnicodeString"
221
434
 
222
435
  # Handle edge case: same or empty strings
223
- if self.string == other.string or count <= 0:
436
+ if self._str == other._str or count <= 0:
224
437
  return []
225
- if not self.string or not other.string:
438
+ if not self._str or not other._str:
226
439
  return [self.new("a") for _ in range(count)] if count > 0 else []
227
440
 
228
441
  # Ensure min_key < max_key
@@ -233,17 +446,25 @@ class ArithUnicodeString(ArithString):
233
446
  start_int = min_key._str_to_int()
234
447
  end_int = max_key._str_to_int()
235
448
 
449
+ # If the range is too small, return empty list
450
+ if end_int - start_int <= count:
451
+ return []
452
+
236
453
  # Generate checkpoints
237
454
  checkpoints_int = self.split_space(start_int, end_int, count)
238
455
 
239
456
  # Convert back to strings and create instances
240
- checkpoints = [self.new(self._int_to_str(i)) for i in checkpoints_int]
241
-
242
- # Filter to ensure min_key < x < max_key
243
- filtered_checkpoints = [cp for cp in checkpoints if min_key < cp < max_key]
244
-
245
- # Fallback to ensure non-empty list for close keys
246
- return filtered_checkpoints or checkpoints[:count] or [self.new(self._int_to_str(start_int + 1))]
457
+ checkpoints = []
458
+ for i in checkpoints_int:
459
+ # Ensure checkpoint is valid and within bounds
460
+ if start_int < i < end_int:
461
+ checkpoint_str = self._int_to_str(i)
462
+ checkpoint = self.new(checkpoint_str)
463
+ # Double-check the string comparison bounds
464
+ if min_key < checkpoint < max_key:
465
+ checkpoints.append(checkpoint)
466
+
467
+ return checkpoints
247
468
 
248
469
  def __int__(self) -> int:
249
470
  """Convert to integer representation for arithmetic."""
@@ -268,46 +489,46 @@ class ArithUnicodeString(ArithString):
268
489
  def __eq__(self, other: object) -> bool:
269
490
  """Check equality with another ArithUnicodeString."""
270
491
  if isinstance(other, ArithUnicodeString):
271
- return self.string == other.string
492
+ return self._str == other._str
272
493
  return NotImplemented
273
494
 
274
495
  def __ne__(self, other: object) -> bool:
275
496
  """Check inequality with another ArithUnicodeString."""
276
497
  if isinstance(other, ArithUnicodeString):
277
- return self.string != other.string
498
+ return self._str != other._str
278
499
  return NotImplemented
279
500
 
280
501
  def __gt__(self, other: object) -> bool:
281
502
  """Check if greater than another ArithUnicodeString."""
282
503
  if isinstance(other, ArithUnicodeString):
283
- return self.string > other.string
504
+ return self._str > other._str
284
505
  return NotImplemented
285
506
 
286
507
  def __lt__(self, other: object) -> bool:
287
508
  """Check if less than another ArithUnicodeString."""
288
509
  if isinstance(other, ArithUnicodeString):
289
- return self.string < other.string
510
+ return self._str < other._str
290
511
  return NotImplemented
291
512
 
292
513
  def __ge__(self, other: object) -> bool:
293
514
  """Check if greater than or equal to another ArithUnicodeString."""
294
515
  if isinstance(other, ArithUnicodeString):
295
- return self.string >= other.string
516
+ return self._str >= other._str
296
517
  return NotImplemented
297
518
 
298
519
  def __le__(self, other: object) -> bool:
299
520
  """Check if less than or equal to another ArithUnicodeString."""
300
521
  if isinstance(other, ArithUnicodeString):
301
- return self.string <= other.string
522
+ return self._str <= other._str
302
523
  return NotImplemented
303
524
 
304
525
  def __str__(self) -> str:
305
526
  """Return the string representation, escaped for SQL."""
306
- return self.string.replace("'", "''")
527
+ return self._str.replace("'", "''")
307
528
 
308
529
  def __repr__(self) -> str:
309
530
  """Return a detailed representation."""
310
- return f"ArithUnicodeString(string={self.string!r})"
531
+ return f"ArithUnicodeString(string={self._str!r})"
311
532
 
312
533
 
313
534
  @attrs.define(frozen=True, eq=False, order=False)
@@ -482,6 +703,9 @@ def number_to_human(n):
482
703
 
483
704
 
484
705
def split_space(start, end, count) -> List[int]:
    """Return up to ``count`` checkpoints strictly inside ``start``..``end``.

    When either bound is a float the interval is cut into ``count + 1`` equal
    parts and the ``count`` interior cut points are returned (as floats).
    Otherwise the bounds are treated as integers: an integer stride is
    computed and the points are taken from ``range(start, end, stride)``,
    skipping ``start`` itself.

    Raises:
        AssertionError: in the integer case, if ``count`` exceeds ``end - start``.
    """
    if any(isinstance(bound, float) for bound in (start, end)):
        width = (end - start) / (count + 1)
        return [start + width * k for k in range(1, count + 1)]

    span = end - start
    assert count <= span, (count, span)
    stride = (span + 1) // (count + 1)
    candidates = list(range(start, end, stride))
    return candidates[1 : count + 1]
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.4.8"
15
+ __version__ = "1.5.0"
@@ -395,8 +395,10 @@ class DBTableDiffer:
395
395
  error_message = f"Target table '{target_dataset.get('table_name')}' is empty"
396
396
  is_table_empty = True
397
397
  if not is_table_empty and not self.config.schema_diff:
398
+ pks_len = len(self.table1.key_columns)
398
399
  table_1_sample_data = self.table1.with_schema().get_sample_data(limit=100)
399
- table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100)
400
+ sample_keys = [list(row[:pks_len]) for row in table_1_sample_data]
401
+ table_2_sample_data = self.table2.with_schema().get_sample_data(limit=100, sample_keys=sample_keys)
400
402
  self.diff_iter = diff_tables(
401
403
  self.table1,
402
404
  self.table2,
@@ -598,9 +600,14 @@ class DBTableDiffer:
598
600
  columns_order_wise=columns_order_wise_target if columns_order_wise_target else [],
599
601
  )
600
602
 
601
- sample_value_column_names = list(self.table1.key_columns) + list(self.table1.extra_columns)
602
- sample_value_source_dicts = [dict(zip(sample_value_column_names, row)) for row in table_1_sample_data]
603
- sample_value_target_dicts = [dict(zip(sample_value_column_names, row)) for row in table_2_sample_data]
603
+ sample_value_column_names_src = list(self.table1.key_columns) + list(self.table1.extra_columns)
604
+ sample_value_column_names_tgt = list(self.table2.key_columns) + list(self.table2.extra_columns)
605
+ sample_value_source_dicts = [
606
+ dict(zip(sample_value_column_names_src, row)) for row in table_1_sample_data
607
+ ]
608
+ sample_value_target_dicts = [
609
+ dict(zip(sample_value_column_names_tgt, row)) for row in table_2_sample_data
610
+ ]
604
611
 
605
612
  def get_pk(row, key_columns):
606
613
  return tuple(row[k] for k in key_columns)
@@ -621,9 +628,9 @@ class DBTableDiffer:
621
628
  key = []
622
629
  for val in tup:
623
630
  if isinstance(val, str) and val.isdigit():
624
- key.append(int(val))
631
+ key.append((0, int(val)))
625
632
  else:
626
- key.append(val)
633
+ key.append((1, str(val)))
627
634
  return tuple(key)
628
635
 
629
636
  return sorted(keys, key=sort_key)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "dcs-sdk"
3
- version = "1.4.8"
3
+ version = "1.5.0"
4
4
  description = "SDK for DataChecks"
5
5
  authors = ["Waterdip Labs <hello@waterdip.ai>"]
6
6
  readme = "README.md"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes