dcs-sdk 1.6.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. data_diff/__init__.py +221 -0
  2. data_diff/__main__.py +517 -0
  3. data_diff/abcs/__init__.py +13 -0
  4. data_diff/abcs/compiler.py +27 -0
  5. data_diff/abcs/database_types.py +402 -0
  6. data_diff/config.py +141 -0
  7. data_diff/databases/__init__.py +38 -0
  8. data_diff/databases/_connect.py +323 -0
  9. data_diff/databases/base.py +1417 -0
  10. data_diff/databases/bigquery.py +376 -0
  11. data_diff/databases/clickhouse.py +217 -0
  12. data_diff/databases/databricks.py +262 -0
  13. data_diff/databases/duckdb.py +207 -0
  14. data_diff/databases/mssql.py +343 -0
  15. data_diff/databases/mysql.py +189 -0
  16. data_diff/databases/oracle.py +238 -0
  17. data_diff/databases/postgresql.py +293 -0
  18. data_diff/databases/presto.py +222 -0
  19. data_diff/databases/redis.py +93 -0
  20. data_diff/databases/redshift.py +233 -0
  21. data_diff/databases/snowflake.py +222 -0
  22. data_diff/databases/sybase.py +720 -0
  23. data_diff/databases/trino.py +73 -0
  24. data_diff/databases/vertica.py +174 -0
  25. data_diff/diff_tables.py +489 -0
  26. data_diff/errors.py +17 -0
  27. data_diff/format.py +369 -0
  28. data_diff/hashdiff_tables.py +1026 -0
  29. data_diff/info_tree.py +76 -0
  30. data_diff/joindiff_tables.py +434 -0
  31. data_diff/lexicographic_space.py +253 -0
  32. data_diff/parse_time.py +88 -0
  33. data_diff/py.typed +0 -0
  34. data_diff/queries/__init__.py +13 -0
  35. data_diff/queries/api.py +213 -0
  36. data_diff/queries/ast_classes.py +811 -0
  37. data_diff/queries/base.py +38 -0
  38. data_diff/queries/extras.py +43 -0
  39. data_diff/query_utils.py +70 -0
  40. data_diff/schema.py +67 -0
  41. data_diff/table_segment.py +583 -0
  42. data_diff/thread_utils.py +112 -0
  43. data_diff/utils.py +1022 -0
  44. data_diff/version.py +15 -0
  45. dcs_core/__init__.py +13 -0
  46. dcs_core/__main__.py +17 -0
  47. dcs_core/__version__.py +15 -0
  48. dcs_core/cli/__init__.py +13 -0
  49. dcs_core/cli/cli.py +165 -0
  50. dcs_core/core/__init__.py +19 -0
  51. dcs_core/core/common/__init__.py +13 -0
  52. dcs_core/core/common/errors.py +50 -0
  53. dcs_core/core/common/models/__init__.py +13 -0
  54. dcs_core/core/common/models/configuration.py +284 -0
  55. dcs_core/core/common/models/dashboard.py +24 -0
  56. dcs_core/core/common/models/data_source_resource.py +75 -0
  57. dcs_core/core/common/models/metric.py +160 -0
  58. dcs_core/core/common/models/profile.py +75 -0
  59. dcs_core/core/common/models/validation.py +216 -0
  60. dcs_core/core/common/models/widget.py +44 -0
  61. dcs_core/core/configuration/__init__.py +13 -0
  62. dcs_core/core/configuration/config_loader.py +139 -0
  63. dcs_core/core/configuration/configuration_parser.py +262 -0
  64. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  65. dcs_core/core/datasource/__init__.py +13 -0
  66. dcs_core/core/datasource/base.py +62 -0
  67. dcs_core/core/datasource/manager.py +112 -0
  68. dcs_core/core/datasource/search_datasource.py +421 -0
  69. dcs_core/core/datasource/sql_datasource.py +1094 -0
  70. dcs_core/core/inspect.py +163 -0
  71. dcs_core/core/logger/__init__.py +13 -0
  72. dcs_core/core/logger/base.py +32 -0
  73. dcs_core/core/logger/default_logger.py +94 -0
  74. dcs_core/core/metric/__init__.py +13 -0
  75. dcs_core/core/metric/base.py +220 -0
  76. dcs_core/core/metric/combined_metric.py +98 -0
  77. dcs_core/core/metric/custom_metric.py +34 -0
  78. dcs_core/core/metric/manager.py +137 -0
  79. dcs_core/core/metric/numeric_metric.py +403 -0
  80. dcs_core/core/metric/reliability_metric.py +90 -0
  81. dcs_core/core/profiling/__init__.py +13 -0
  82. dcs_core/core/profiling/datasource_profiling.py +136 -0
  83. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  84. dcs_core/core/profiling/text_field_profiling.py +67 -0
  85. dcs_core/core/repository/__init__.py +13 -0
  86. dcs_core/core/repository/metric_repository.py +77 -0
  87. dcs_core/core/utils/__init__.py +13 -0
  88. dcs_core/core/utils/log.py +29 -0
  89. dcs_core/core/utils/tracking.py +105 -0
  90. dcs_core/core/utils/utils.py +44 -0
  91. dcs_core/core/validation/__init__.py +13 -0
  92. dcs_core/core/validation/base.py +230 -0
  93. dcs_core/core/validation/completeness_validation.py +153 -0
  94. dcs_core/core/validation/custom_query_validation.py +24 -0
  95. dcs_core/core/validation/manager.py +282 -0
  96. dcs_core/core/validation/numeric_validation.py +276 -0
  97. dcs_core/core/validation/reliability_validation.py +91 -0
  98. dcs_core/core/validation/uniqueness_validation.py +61 -0
  99. dcs_core/core/validation/validity_validation.py +738 -0
  100. dcs_core/integrations/__init__.py +13 -0
  101. dcs_core/integrations/databases/__init__.py +13 -0
  102. dcs_core/integrations/databases/bigquery.py +187 -0
  103. dcs_core/integrations/databases/databricks.py +51 -0
  104. dcs_core/integrations/databases/db2.py +652 -0
  105. dcs_core/integrations/databases/elasticsearch.py +61 -0
  106. dcs_core/integrations/databases/mssql.py +829 -0
  107. dcs_core/integrations/databases/mysql.py +409 -0
  108. dcs_core/integrations/databases/opensearch.py +64 -0
  109. dcs_core/integrations/databases/oracle.py +719 -0
  110. dcs_core/integrations/databases/postgres.py +482 -0
  111. dcs_core/integrations/databases/redshift.py +53 -0
  112. dcs_core/integrations/databases/snowflake.py +48 -0
  113. dcs_core/integrations/databases/spark_df.py +111 -0
  114. dcs_core/integrations/databases/sybase.py +1069 -0
  115. dcs_core/integrations/storage/__init__.py +13 -0
  116. dcs_core/integrations/storage/local_file.py +149 -0
  117. dcs_core/integrations/utils/__init__.py +13 -0
  118. dcs_core/integrations/utils/utils.py +36 -0
  119. dcs_core/report/__init__.py +13 -0
  120. dcs_core/report/dashboard.py +211 -0
  121. dcs_core/report/models.py +88 -0
  122. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  123. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  124. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  125. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  126. dcs_core/report/static/assets/images/docs.svg +6 -0
  127. dcs_core/report/static/assets/images/github.svg +4 -0
  128. dcs_core/report/static/assets/images/logo.svg +7 -0
  129. dcs_core/report/static/assets/images/slack.svg +13 -0
  130. dcs_core/report/static/index.js +2 -0
  131. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  132. dcs_sdk/__init__.py +13 -0
  133. dcs_sdk/__main__.py +18 -0
  134. dcs_sdk/__version__.py +15 -0
  135. dcs_sdk/cli/__init__.py +13 -0
  136. dcs_sdk/cli/cli.py +163 -0
  137. dcs_sdk/sdk/__init__.py +58 -0
  138. dcs_sdk/sdk/config/__init__.py +13 -0
  139. dcs_sdk/sdk/config/config_loader.py +491 -0
  140. dcs_sdk/sdk/data_diff/__init__.py +13 -0
  141. dcs_sdk/sdk/data_diff/data_differ.py +821 -0
  142. dcs_sdk/sdk/rules/__init__.py +15 -0
  143. dcs_sdk/sdk/rules/rules_mappping.py +31 -0
  144. dcs_sdk/sdk/rules/rules_repository.py +214 -0
  145. dcs_sdk/sdk/rules/schema_rules.py +65 -0
  146. dcs_sdk/sdk/utils/__init__.py +13 -0
  147. dcs_sdk/sdk/utils/serializer.py +25 -0
  148. dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
  149. dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
  150. dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
  151. dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
  152. dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
  153. dcs_sdk/sdk/utils/table.py +475 -0
  154. dcs_sdk/sdk/utils/themes.py +40 -0
  155. dcs_sdk/sdk/utils/utils.py +349 -0
  156. dcs_sdk-1.6.5.dist-info/METADATA +150 -0
  157. dcs_sdk-1.6.5.dist-info/RECORD +159 -0
  158. dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
  159. dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
@@ -0,0 +1,402 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import decimal
16
+ from abc import ABC, abstractmethod
17
+ from datetime import date, datetime
18
+ from typing import Collection, List, Optional, Tuple, Type, TypeVar, Union
19
+
20
+ import attrs
21
+
22
+ from data_diff.utils import (
23
+ ArithAlphanumeric,
24
+ ArithDate,
25
+ ArithDateTime,
26
+ ArithTimestamp,
27
+ ArithTimestampTZ,
28
+ ArithUnicodeString,
29
+ ArithUUID,
30
+ Unknown,
31
+ )
32
+
33
# Tuple of string name components.
# NOTE(review): presumably the parts of a qualified database object name — confirm against callers.
DbPath = Tuple[str, ...]
# Every value type accepted as a key: plain Python primitives plus the
# Arith* wrapper types imported from data_diff.utils.
DbKey = Union[
    int,
    str,
    bytes,
    ArithUUID,
    ArithAlphanumeric,
    ArithUnicodeString,
    ArithDateTime,
    ArithDate,
    ArithTimestamp,
    ArithTimestampTZ,
]
# Alias: time values are represented with the standard-library ``datetime``.
DbTime = datetime

# Generic type variable; used in this module to type the notes attached to a ColType.
N = TypeVar("N")
49
+
50
+
51
@attrs.frozen(kw_only=True, eq=False, order=False, unsafe_hash=True)
class Collation:
    """
    A pre-parsed or pre-known record about db collation, per column.

    The "greater" collation should be used as a target collation for textual PKs
    on both sides of the diff — by converting the "lesser" collation to self.

    Snowflake easily absorbs the performance losses, so it has a boost to always
    be greater than any other collation in non-Snowflake databases.
    Other databases need to negotiate which side absorbs the performance impact.

    NOTE(review): ``__gt__`` below returns False for the side whose
    ``absorbs_damage`` flag is set and True for the other side; confirm that this
    direction matches the paragraph above.
    NOTE(review): ``unsafe_hash=True`` requests a generated ``__hash__`` while
    ``__eq__`` is hand-written with looser rules; confirm nothing relies on equal
    collations hashing equally.
    """

    # A boost for special databases that are known to absorb the performance damage well.
    absorbs_damage: bool = False

    # Ordinal sorting by ASCII/UTF8 (True), or alphabetic as per locale/country/etc (False).
    ordinal: Optional[bool] = None

    # Lowercase first (aAbBcC or abcABC). Otherwise, uppercase first (AaBbCc or ABCabc).
    lower_first: Optional[bool] = None

    # 2-letter lower-case locale and upper-case country codes, e.g. en_US. Ignored for ordinals.
    language: Optional[str] = None
    country: Optional[str] = None

    # There are also space-, punctuation-, width-, kana-(in)sensitivity, so on.
    # Ignore everything not related to xdb alignment. Only case- & accent-sensitivity are common.
    case_sensitive: Optional[bool] = None
    accent_sensitive: Optional[bool] = None

    # Purely informational, for debugging:
    _source: Union[None, str, Collection[str]] = None

    def __eq__(self, other: object) -> bool:
        """Compare two collations; any two ordinal collations are considered equal."""
        if not isinstance(other, Collation):
            return NotImplemented
        if self.ordinal and other.ordinal:
            # TODO: does it depend on language? what does Albanic_BIN mean in MS SQL?
            return True

        # A missing country on either side acts as a wildcard.
        country_matches = self.country is None or other.country is None or self.country == other.country
        if not country_matches:
            return False

        mine = (self.language, self.case_sensitive, self.accent_sensitive, self.lower_first)
        theirs = (other.language, other.case_sensitive, other.accent_sensitive, other.lower_first)
        return mine == theirs

    def __ne__(self, other: object) -> bool:
        """Negation of ``__eq__`` for Collation operands."""
        if not isinstance(other, Collation):
            return NotImplemented
        return not self.__eq__(other)

    def __gt__(self, other: object) -> bool:
        """Decide whether this collation ranks above ``other``."""
        if not isinstance(other, Collation):
            return NotImplemented
        if self == other:
            return False

        # Exactly one side absorbs damage: the side that cannot absorb it is preferred.
        self_absorbs, other_absorbs = bool(self.absorbs_damage), bool(other.absorbs_damage)
        if self_absorbs != other_absorbs:
            return other_absorbs

        # Exactly one side is ordinal: the ordinal side ranks higher.
        self_ordinal, other_ordinal = bool(self.ordinal), bool(other.ordinal)
        if self_ordinal != other_ordinal:
            return self_ordinal

        # TODO: try to align the languages & countries?
        return False

    def __ge__(self, other: object) -> bool:
        """True when equal to ``other`` or ranked above it."""
        if not isinstance(other, Collation):
            return NotImplemented
        if self == other:
            return True
        return self.__gt__(other)

    def __lt__(self, other: object) -> bool:
        """True when different from ``other`` and not ranked above it."""
        if not isinstance(other, Collation):
            return NotImplemented
        if not (self != other):
            return False
        return not self.__gt__(other)

    def __le__(self, other: object) -> bool:
        """True when equal to ``other`` or not ranked above it."""
        if not isinstance(other, Collation):
            return NotImplemented
        if self == other:
            return True
        return not self.__gt__(other)
134
+
135
+
136
@attrs.define(frozen=True, kw_only=True)
class ColType:
    """Base class of the column types declared in this module."""

    # Arbitrary metadata added and fetched at runtime.
    _notes: List[N] = attrs.field(factory=list, init=False, hash=False, eq=False)

    def add_note(self, note: N) -> None:
        """Attach ``note`` to this column type (appended to the end of the notes list)."""
        self._notes.append(note)

    def get_note(self, cls: Type[N]) -> Optional[N]:
        """Get the latest added note of type ``cls`` or its descendants."""
        # Walk from the newest note backwards and stop at the first match.
        newest_first = (candidate for candidate in reversed(self._notes) if isinstance(candidate, cls))
        return next(newest_first, None)

    @property
    def supported(self) -> bool:
        """Whether the column type is supported; the base implementation always says yes."""
        return True
154
+
155
+
156
@attrs.define(frozen=True)
class PrecisionType(ColType):
    """Column type that carries a ``precision`` value and a ``rounds`` flag."""

    precision: int
    # Defaults to the ``Unknown`` sentinel imported from data_diff.utils.
    rounds: Union[bool, Unknown] = Unknown
160
+
161
+
162
@attrs.define(frozen=True)
class Boolean(ColType):
    """Boolean column type."""

    # Plain class-level constant (no annotation).
    precision = 0
165
+
166
+
167
@attrs.define(frozen=True)
class TemporalType(PrecisionType):
    """Common parent of the temporal column types; adds nothing to PrecisionType."""
170
+
171
+
172
@attrs.define(frozen=True)
class IKey(ABC):
    "Interface for ColType, for using a column as a key in table."

    @property
    @abstractmethod
    def python_type(self) -> type:
        "Return the equivalent Python type of the key"

    def make_value(self, value):
        "Return ``value`` unchanged if it already is a ``python_type``, else convert it."
        key_type = self.python_type
        return value if isinstance(value, key_type) else key_type(value)
185
+
186
+
187
@attrs.define(frozen=True)
class Timestamp(TemporalType, IKey):
    """Timestamp column type whose key values are ``ArithTimestamp`` objects."""

    @property
    def python_type(self) -> type:
        """Key values of this column are ``ArithTimestamp``."""
        return ArithTimestamp

    def make_value(self, value):
        """Pass through an existing ``ArithTimestamp``; wrap anything else in one."""
        return value if isinstance(value, ArithTimestamp) else ArithTimestamp(value)
197
+
198
+
199
@attrs.define(frozen=True)
class TimestampTZ(TemporalType, IKey):
    """Column type whose key values are ``ArithTimestampTZ`` objects."""

    @property
    def python_type(self) -> type:
        """Key values of this column are ``ArithTimestampTZ``."""
        return ArithTimestampTZ

    def make_value(self, value):
        """Pass through an existing ``ArithTimestampTZ``; wrap anything else in one."""
        return value if isinstance(value, ArithTimestampTZ) else ArithTimestampTZ(value)
209
+
210
+
211
@attrs.define(frozen=True)
class Datetime(TemporalType, IKey):
    """Datetime column type whose key values are ``ArithDateTime`` objects."""

    @property
    def python_type(self) -> type:
        """Key values of this column are ``ArithDateTime``."""
        return ArithDateTime

    def make_value(self, value):
        """Pass through an existing ``ArithDateTime``; wrap anything else in one."""
        return value if isinstance(value, ArithDateTime) else ArithDateTime(value)
221
+
222
+
223
@attrs.define(frozen=True)
class Date(TemporalType, IKey):
    """Date column type whose key values are ``ArithDate`` objects."""

    @property
    def python_type(self) -> type:
        """Key values of this column are ``ArithDate``."""
        return ArithDate

    def make_value(self, value):
        """Pass through an existing ``ArithDate``; wrap anything else in one."""
        return value if isinstance(value, ArithDate) else ArithDate(value)
233
+
234
+
235
@attrs.define(frozen=True)
class Time(TemporalType):
    """Time column type; a temporal type that does not implement ``IKey``."""
238
+
239
+
240
@attrs.define(frozen=True)
class NumericType(ColType):
    """Base class of the numeric column types."""

    # 'precision' signifies how many fractional digits (after the dot) we want to compare
    precision: int
244
+
245
+
246
@attrs.define(frozen=True)
class FractionalType(NumericType):
    """Common parent of the fractional numeric column types; adds nothing to NumericType."""
249
+
250
+
251
# NOTE(review): a second ``Float`` class is defined further down in this module
# (deriving from FractionalType and IKey) and rebinds the name, so this
# definition appears to be shadowed at import time. Confirm which of the two
# is intended and remove the other.
@attrs.define(frozen=True)
class Float(FractionalType):
    python_type = float
254
+
255
+
256
@attrs.define(frozen=True)
class Decimal(FractionalType, IKey):  # Snowflake may use Decimal as a key
    """Decimal column type that can also serve as a key."""

    @property
    def python_type(self) -> type:
        """``int`` when no fractional digits are compared, ``decimal.Decimal`` otherwise."""
        return int if self.precision == 0 else decimal.Decimal
263
+
264
+
265
@attrs.define(frozen=True)
class Float(FractionalType, IKey):
    """
    Float column type that can also serve as a key.

    NOTE(review): this rebinds the ``Float`` name declared earlier in this module
    (the earlier class sets ``python_type = float``); confirm the duplicate is
    intentional.
    """

    @property
    def python_type(self) -> type:
        """``int`` when no fractional digits are compared, ``decimal.Decimal`` otherwise."""
        return int if self.precision == 0 else decimal.Decimal
272
+
273
+
274
@attrs.define(frozen=True)
class Numeric(FractionalType, IKey):
    """Numeric column type that can serve as a key; key values are plain ``int``."""

    @property
    def python_type(self) -> type:
        """Key values of this column are always ``int``, regardless of precision."""
        return int
279
+
280
+
281
@attrs.define(frozen=True)
class StringType(ColType):
    """Base class of the string column types; optionally records the column's collation."""

    python_type = str
    # Keyword-only; ``None`` when no collation information is available.
    collation: Optional[Collation] = attrs.field(default=None, kw_only=True)
285
+
286
+
287
@attrs.define(frozen=True)
class ColType_UUID(ColType, IKey):
    """Key-capable column type whose values are ``ArithUUID`` objects."""

    python_type = ArithUUID
290
+
291
+
292
@attrs.define(frozen=True)
class ColType_Alphanum(ColType, IKey):
    """Key-capable column type whose values are ``ArithAlphanumeric`` objects."""

    python_type = ArithAlphanumeric
295
+
296
+
297
@attrs.define(frozen=True)
class ColType_Unicode(ColType, IKey):
    """Key-capable column type whose values are ``ArithUnicodeString`` objects."""

    python_type = ArithUnicodeString
300
+
301
+
302
@attrs.define(frozen=True)
class Native_UUID(ColType_UUID):
    """UUID column type for native UUID columns; adds nothing to ColType_UUID."""
305
+
306
+
307
@attrs.define(frozen=True)
class String_UUID(ColType_UUID, StringType):
    """UUID stored in a regular string column, with optional knowledge of its letter case."""

    # Case is important for UUIDs stored as regular string, not native UUIDs stored as numbers.
    # We slice them internally as numbers, but render them back to SQL as lower/upper case.
    # None means we do not know for sure, behave as with False, but it might be unreliable.
    lowercase: Optional[bool] = None
    uppercase: Optional[bool] = None

    def make_value(self, v: str) -> ArithUUID:
        """Build the key value from ``v``, forwarding this column's case flags."""
        case_flags = {"lowercase": self.lowercase, "uppercase": self.uppercase}
        return self.python_type(v, **case_flags)
317
+
318
+
319
@attrs.define(frozen=True)
class String_Alphanum(ColType_Alphanum, StringType):
    """Alphanumeric key stored in a string column."""

    @staticmethod
    def test_value(value: str) -> bool:
        """Report whether ``ArithAlphanumeric`` accepts ``value`` without raising ``ValueError``."""
        try:
            ArithAlphanumeric(value)
        except ValueError:
            return False
        return True

    def make_value(self, value) -> ArithAlphanumeric:
        """Return ``value`` as an ``ArithAlphanumeric``, converting when necessary."""
        # Coerce non-string primitives (e.g., integers) to string for alphanumeric representation
        return value if isinstance(value, ArithAlphanumeric) else ArithAlphanumeric(str(value))
334
+
335
+
336
@attrs.define(frozen=True)
class String_VaryingAlphanum(String_Alphanum):
    """Alphanumeric string key without a declared length; adds nothing to String_Alphanum."""
339
+
340
+
341
@attrs.define(frozen=True)
class String_VaryingUnicode(ColType_Unicode, StringType):
    """Unicode string key column; combines ColType_Unicode and StringType with no additions."""
344
+
345
+
346
@attrs.define(frozen=True)
class String_FixedAlphanum(String_Alphanum):
    """Alphanumeric string key whose values must all have exactly ``length`` characters."""

    length: int

    def make_value(self, value):
        """
        Convert ``value`` to this column's key type.

        Raises ``ValueError`` when ``len(value)`` differs from ``self.length``.
        """
        alphanum_type = self.python_type
        if isinstance(value, alphanum_type):
            return value

        expected_len = self.length
        if len(value) != expected_len:
            raise ValueError(f"Expected alphanumeric value of length {self.length}, but got '{value}'.")
        return alphanum_type(value, max_len=expected_len)
356
+
357
+
358
@attrs.define(frozen=True)
class Text(ColType_Unicode, StringType):
    """Text column type; always reports itself as unsupported."""

    @property
    def supported(self) -> bool:
        """Overrides the base default: this column type is never supported."""
        return False
364
+
365
+
366
# In majority of DBMSes, it is called JSON/JSONB. Only in Snowflake, it is OBJECT.
@attrs.define(frozen=True)
class JSON(ColType):
    """JSON column type; carries no fields beyond those of ColType."""
370
+
371
+
372
@attrs.define(frozen=True)
class Array(ColType):
    """Array column type, parameterised by the column type of its items."""

    item_type: ColType
375
+
376
+
377
# Unlike JSON, structs are not free-form and have a very specific set of fields and their types.
# We do not parse & use those fields now, but we can do this later.
# For example, in BigQuery:
# - https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
# - https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#struct_literals
@attrs.define(frozen=True)
class Struct(ColType):
    """Struct column type; its member fields are not modelled here."""
385
+
386
+
387
@attrs.define(frozen=True)
class Integer(NumericType, IKey):
    """Integer column type usable as a key; its precision must be zero."""

    precision: int = 0
    python_type: type = int

    def __attrs_post_init__(self) -> None:
        """Sanity-check after construction that no fractional precision was requested."""
        assert self.precision == 0
394
+
395
+
396
@attrs.define(frozen=True)
class UnknownColType(ColType):
    """Column type that is not recognised; always reports itself as unsupported."""

    # NOTE(review): presumably the type name as reported by the database — confirm against callers.
    text: str

    @property
    def supported(self) -> bool:
        """Overrides the base default: an unknown column type is never supported."""
        return False
data_diff/config.py ADDED
@@ -0,0 +1,141 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+ from typing import Any, Dict
18
+
19
+ import toml
20
+
21
# Run options whose TOML array (list) values are converted to tuples by _apply_config.
_ARRAY_FIELDS = (
    "key_columns",
    "columns",
)
25
+
26
+
27
class ConfigParseError(Exception):
    """Raised when the configuration contains unknown, missing or inconsistent entries."""
29
+
30
+
31
def is_uri(s: str) -> bool:
    """Return True when ``s`` contains a ``://`` scheme separator."""
    scheme_separator = "://"
    return scheme_separator in s
33
+
34
+
35
def _apply_config(config: Dict[str, Any], run_name: str, kw: Dict[str, Any]):
    """
    Merge a parsed configuration with command-line keywords.

    Args:
        config: Parsed configuration. Only the top-level keys ``database`` and
            ``run`` are accepted. The mapping is consumed (mutated) by this call.
        run_name: Name of the run under ``run`` to apply on top of ``run.default``.
            A falsy value selects ``default`` only.
        kw: Keyword arguments collected elsewhere (e.g. the command line). When
            ``database1`` is given, ``table1``, ``database2`` and ``table2`` must
            be given as well; those four entries are popped from ``kw``.

    Returns:
        A new dict with ``kw`` as the base, overlaid by the resolved run
        arguments, overlaid again by the truthy entries of ``kw``. The resolved
        run arguments are also stored under the ``"__conf__"`` key.

    Raises:
        ConfigParseError: unknown top-level options, unknown run name, missing
            or unexpected connection attributes, unknown database name, or a
            database entry without a ``driver``.
        ValueError: ``database1`` was supplied without the other three
            positional connection arguments.
    """
    _resolve_env(config)

    # Load config
    databases = config.pop("database", {})
    runs = config.pop("run", {})
    if config:
        raise ConfigParseError(f"Unknown option(s): {config}")

    # Init run_args
    run_args = runs.get("default") or {}
    if run_name:
        if run_name not in runs:
            raise ConfigParseError(f"Cannot find run '{run_name}' in configuration.")
        run_args.update(runs[run_name])
    else:
        run_name = "default"

    if kw.get("database1") is not None:
        for attr in ("table1", "database2", "table2"):
            # .get(): an absent key is reported the same way as an explicit None.
            if kw.get(attr) is None:
                raise ValueError(f"Specified database1 but not {attr}. Must specify all 4 arguments, or neither.")

        for index in "12":
            run_args[index] = {attr: kw.pop(f"{attr}{index}") for attr in ("database", "table")}

    # Make sure array fields are converted to tuple: array fields in toml are
    # decoded as list, but TableSegment object requires tuple type.
    for field in _ARRAY_FIELDS:
        if isinstance(run_args.get(field), list):
            run_args[field] = tuple(run_args[field])

    # Process databases + tables
    for index in "12":
        try:
            args = run_args.pop(index)
        except KeyError:
            # The KeyError carries no extra information; keep the traceback focused.
            raise ConfigParseError(
                f"Could not find source #{index}: Expecting a key of '{index}' containing '.database' and '.table'."
            ) from None
        for attr in ("database", "table"):
            if attr not in args:
                raise ConfigParseError(f"Running 'run.{run_name}': Connection #{index} is missing attribute '{attr}'.")

        database = args.pop("database")
        table = args.pop("table")
        threads = args.pop("threads", None)
        if args:
            raise ConfigParseError(f"Unexpected attributes for connection #{index}: {args}")

        if not is_uri(database):
            # Not a URI: treat the value as the name of an entry under [database].
            database_name = database
            if database_name not in databases:
                raise ConfigParseError(
                    f"Database '{database_name}' not found in list of databases. Available: {list(databases)}."
                )
            database = dict(databases[database_name])
            assert isinstance(database, dict)
            if "driver" not in database:
                # Report the entry's name, not its contents: the connection
                # settings may hold credentials and must not end up in the message.
                raise ConfigParseError(f"Database '{database_name}' did not specify a driver.")

        run_args[f"database{index}"] = database
        run_args[f"table{index}"] = table
        if threads is not None:
            run_args[f"threads{index}"] = int(threads)

    # Update keywords
    new_kw = dict(kw)  # Set defaults
    new_kw.update(run_args)  # Apply config
    new_kw.update({k: v for k, v in kw.items() if v})  # Truthy keyword values win over the config

    new_kw["__conf__"] = run_args

    return new_kw
107
+
108
+
109
# There are no strict requirements for the environment variable name format.
# But most shells only allow alphanumeric characters and underscores.
# https://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html
# "Environment variable names (...) consist solely of uppercase letters, digits, and the '_' (underscore)"
# The pattern below is slightly more permissive: it also accepts lowercase letters.
# Group 1 captures the variable name found between "${" and "}".
_ENV_VAR_PATTERN = r"\$\{([A-Za-z0-9_]+)\}"
114
+
115
+
116
def _resolve_env(config: Dict[str, Any]) -> None:
    """
    Expand ``${ENV_VAR_NAME}`` references in place.

    String values are rewritten directly and nested dicts are processed
    recursively; values of any other type are left untouched.
    A reference to an unset environment variable becomes an empty string.
    """
    for option, current in config.items():
        if isinstance(current, str):
            config[option] = re.sub(_ENV_VAR_PATTERN, _replace_match, current)
        elif isinstance(current, dict):
            _resolve_env(current)
126
+
127
+
128
def _replace_match(match: re.Match) -> str:
    """Return the value of the environment variable named by group 1, or "" if it is unset."""
    # Index 0 would be the whole matched text; index 1 is the captured name.
    variable_name = match[1]
    return os.getenv(variable_name, "")
133
+
134
+
135
def apply_config_from_file(path: str, run_name: str, kw: Dict[str, Any]):
    """
    Load the TOML file at ``path`` and merge it with ``kw`` via ``_apply_config``.

    Args:
        path: Location of the TOML configuration file.
        run_name: Name of the run to apply (passed through unchanged).
        kw: Keyword arguments to merge with the configuration.

    Returns:
        Whatever ``_apply_config`` returns for the parsed file.
    """
    # TOML documents are UTF-8 by specification; do not depend on the platform's
    # default text encoding.
    with open(path, encoding="utf-8") as f:
        return _apply_config(toml.load(f), run_name, kw)
138
+
139
+
140
def apply_config_from_string(toml_config: str, run_name: str, kw: Dict[str, Any]):
    """Parse ``toml_config`` as TOML text and merge it with ``kw`` via ``_apply_config``."""
    parsed_config = toml.loads(toml_config)
    return _apply_config(parsed_config, run_name, kw)
@@ -0,0 +1,38 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from data_diff.databases._connect import Connect as Connect
16
+ from data_diff.databases._connect import connect as connect
17
+ from data_diff.databases.base import (
18
+ CHECKSUM_HEXDIGITS,
19
+ CHECKSUM_OFFSET,
20
+ MD5_HEXDIGITS,
21
+ BaseDialect,
22
+ ConnectError,
23
+ Database,
24
+ QueryError,
25
+ )
26
+ from data_diff.databases.bigquery import BigQuery as BigQuery
27
+ from data_diff.databases.clickhouse import Clickhouse as Clickhouse
28
+ from data_diff.databases.databricks import Databricks as Databricks
29
+ from data_diff.databases.duckdb import DuckDB as DuckDB
30
+ from data_diff.databases.mssql import MsSQL as MsSQL
31
+ from data_diff.databases.mysql import MySQL as MySQL
32
+ from data_diff.databases.oracle import Oracle as Oracle
33
+ from data_diff.databases.postgresql import PostgreSQL as PostgreSQL
34
+ from data_diff.databases.presto import Presto as Presto
35
+ from data_diff.databases.redshift import Redshift as Redshift
36
+ from data_diff.databases.snowflake import Snowflake as Snowflake
37
+ from data_diff.databases.trino import Trino as Trino
38
+ from data_diff.databases.vertica import Vertica as Vertica