dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import decimal
|
|
16
|
+
from abc import ABC, abstractmethod
|
|
17
|
+
from datetime import date, datetime
|
|
18
|
+
from typing import Collection, List, Optional, Tuple, Type, TypeVar, Union
|
|
19
|
+
|
|
20
|
+
import attrs
|
|
21
|
+
|
|
22
|
+
from data_diff.utils import (
|
|
23
|
+
ArithAlphanumeric,
|
|
24
|
+
ArithDate,
|
|
25
|
+
ArithDateTime,
|
|
26
|
+
ArithTimestamp,
|
|
27
|
+
ArithTimestampTZ,
|
|
28
|
+
ArithUnicodeString,
|
|
29
|
+
ArithUUID,
|
|
30
|
+
Unknown,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# A table/schema path, as a tuple of name components.
DbPath = Tuple[str, ...]

# Every Python type accepted as a key value.
DbKey = Union[
    int,
    str,
    bytes,
    ArithUUID,
    ArithAlphanumeric,
    ArithUnicodeString,
    ArithDateTime,
    ArithDate,
    ArithTimestamp,
    ArithTimestampTZ,
]

# Alias for time values.
DbTime = datetime

# Generic type variable; used below for the notes attached to a ColType.
N = TypeVar("N")
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@attrs.frozen(kw_only=True, eq=False, order=False, unsafe_hash=True)
class Collation:
    """
    A pre-parsed or pre-known record about db collation, per column.

    Collations are compared to pick the target collation for textual PKs on
    both sides of the diff: the "lesser" collation is converted to the
    "greater" one.

    As implemented in ``__gt__``: a collation flagged ``absorbs_damage`` is
    never greater than one that is not flagged; otherwise an ordinal collation
    is greater than a non-ordinal one; in every other case neither is greater.
    """

    # Set for special databases that are known to absorb the performance damage well.
    absorbs_damage: bool = False

    # Ordinal sorting by ASCII/UTF8 (True), or alphabetic as per locale/country/etc (False).
    ordinal: Optional[bool] = None

    # Lowercase first (aAbBcC or abcABC). Otherwise, uppercase first (AaBbCc or ABCabc).
    lower_first: Optional[bool] = None

    # 2-letter lower-case locale and upper-case country codes, e.g. en_US. Ignored for ordinals.
    language: Optional[str] = None
    country: Optional[str] = None

    # There are also space-, punctuation-, width-, kana-(in)sensitivity, and so on.
    # Ignore everything not related to xdb alignment. Only case- & accent-sensitivity are common.
    case_sensitive: Optional[bool] = None
    accent_sensitive: Optional[bool] = None

    # Purely informational, for debugging:
    _source: Union[None, str, Collection[str]] = None

    def __eq__(self, other: object) -> bool:
        """Two ordinal collations are always equal; otherwise compare the locale fields."""
        if not isinstance(other, Collation):
            return NotImplemented
        # TODO: does it depend on language? what does Albanic_BIN mean in MS SQL?
        if self.ordinal and other.ordinal:
            return True
        if self.language != other.language:
            return False
        # A missing country on either side acts as a wildcard.
        both_countries_known = self.country is not None and other.country is not None
        if both_countries_known and self.country != other.country:
            return False
        return (
            self.case_sensitive == other.case_sensitive
            and self.accent_sensitive == other.accent_sensitive
            and self.lower_first == other.lower_first
        )

    def __ne__(self, other: object) -> bool:
        """Negation of ``__eq__`` for collations; NotImplemented for anything else."""
        if not isinstance(other, Collation):
            return NotImplemented
        return not (self == other)

    def __gt__(self, other: object) -> bool:
        """Tell whether ``self`` is the preferred (target) collation over ``other``."""
        if not isinstance(other, Collation):
            return NotImplemented
        if self == other:
            return False
        mine_absorbs = bool(self.absorbs_damage)
        theirs_absorbs = bool(other.absorbs_damage)
        if mine_absorbs != theirs_absorbs:
            # The side that cannot absorb the damage is preferred over the one that can.
            return theirs_absorbs
        mine_ordinal = bool(self.ordinal)
        theirs_ordinal = bool(other.ordinal)
        if mine_ordinal != theirs_ordinal:
            # An ordinal collation wins over a non-ordinal one.
            return mine_ordinal
        # TODO: try to align the languages & countries?
        return False

    def __ge__(self, other: object) -> bool:
        """Equal to ``other`` or greater than it."""
        if not isinstance(other, Collation):
            return NotImplemented
        if self == other:
            return True
        return self.__gt__(other)

    def __lt__(self, other: object) -> bool:
        """Different from ``other`` and not greater than it."""
        if not isinstance(other, Collation):
            return NotImplemented
        if self == other:
            return False
        return not self.__gt__(other)

    def __le__(self, other: object) -> bool:
        """Equal to ``other`` or not greater than it."""
        if not isinstance(other, Collation):
            return NotImplemented
        if self == other:
            return True
        return not self.__gt__(other)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@attrs.define(frozen=True, kw_only=True)
class ColType:
    """Base class of the column-type descriptors defined in this module."""

    # Arbitrary metadata added and fetched at runtime. Not an init argument,
    # and excluded from equality and hashing.
    _notes: List[N] = attrs.field(factory=list, init=False, hash=False, eq=False)

    def add_note(self, note: N) -> None:
        """Attach ``note`` to this type, after any notes added earlier."""
        self._notes.append(note)

    def get_note(self, cls: Type[N]) -> Optional[N]:
        """Get the latest added note of type ``cls`` or its descendants, or None."""
        newest_first = reversed(self._notes)
        return next((candidate for candidate in newest_first if isinstance(candidate, cls)), None)

    @property
    def supported(self) -> bool:
        """True here; subclasses may override to report themselves unsupported."""
        return True
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@attrs.define(frozen=True)
class PrecisionType(ColType):
    """Column type that carries a precision and a rounding flag."""

    precision: int
    # Defaults to the ``Unknown`` sentinel when the rounding behaviour is not known.
    rounds: Union[bool, Unknown] = Unknown
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
@attrs.define(frozen=True)
class Boolean(ColType):
    """Boolean column type."""

    # Plain class attribute (not an attrs field): always zero.
    precision = 0
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@attrs.define(frozen=True)
class TemporalType(PrecisionType):
    """Common base of the date/time column types; adds nothing to ``PrecisionType``."""
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@attrs.define(frozen=True)
class IKey(ABC):
    "Interface for ColType, for using a column as a key in table."

    @property
    @abstractmethod
    def python_type(self) -> type:
        "Return the equivalent Python type of the key"

    def make_value(self, value):
        """Return ``value`` as an instance of ``python_type``, converting only when needed."""
        key_type = self.python_type
        return value if isinstance(value, key_type) else key_type(value)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
@attrs.define(frozen=True)
class Timestamp(TemporalType, IKey):
    """Timestamp column type, usable as a key through ``ArithTimestamp`` values."""

    @property
    def python_type(self) -> type:
        """Python class used for key values of this column type."""
        return ArithTimestamp

    def make_value(self, value):
        """Pass an ``ArithTimestamp`` through unchanged; wrap anything else in one."""
        return value if isinstance(value, ArithTimestamp) else ArithTimestamp(value)
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
@attrs.define(frozen=True)
class TimestampTZ(TemporalType, IKey):
    """Timezone-aware timestamp column type, keyed through ``ArithTimestampTZ`` values."""

    @property
    def python_type(self) -> type:
        """Python class used for key values of this column type."""
        return ArithTimestampTZ

    def make_value(self, value):
        """Pass an ``ArithTimestampTZ`` through unchanged; wrap anything else in one."""
        return value if isinstance(value, ArithTimestampTZ) else ArithTimestampTZ(value)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@attrs.define(frozen=True)
class Datetime(TemporalType, IKey):
    """Datetime column type, usable as a key through ``ArithDateTime`` values."""

    @property
    def python_type(self) -> type:
        """Python class used for key values of this column type."""
        return ArithDateTime

    def make_value(self, value):
        """Pass an ``ArithDateTime`` through unchanged; wrap anything else in one."""
        return value if isinstance(value, ArithDateTime) else ArithDateTime(value)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
@attrs.define(frozen=True)
class Date(TemporalType, IKey):
    """Date column type, usable as a key through ``ArithDate`` values."""

    @property
    def python_type(self) -> type:
        """Python class used for key values of this column type."""
        return ArithDate

    def make_value(self, value):
        """Pass an ``ArithDate`` through unchanged; wrap anything else in one."""
        return value if isinstance(value, ArithDate) else ArithDate(value)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
@attrs.define(frozen=True)
class Time(TemporalType):
    """Time-of-day column type; unlike its temporal siblings it does not implement ``IKey``."""
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
@attrs.define(frozen=True)
class NumericType(ColType):
    """Base class of the numeric column types."""

    # 'precision' signifies how many fractional digits (after the dot) we want to compare
    precision: int
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
@attrs.define(frozen=True)
class FractionalType(NumericType):
    """Base class of the numeric types that can hold fractional digits."""
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
@attrs.define(frozen=True)
class Decimal(FractionalType, IKey):  # Snowflake may use Decimal as a key
    """Decimal column type that can also serve as a key."""

    @property
    def python_type(self) -> type:
        """``int`` when no fractional digits are compared, else ``decimal.Decimal``."""
        if self.precision == 0:
            return int
        return decimal.Decimal


# This module used to define ``Float`` twice. The earlier definition
# (``python_type = float``, no IKey) was rebound by this one before anything
# could use it, so only this effective definition is kept.
@attrs.define(frozen=True)
class Float(FractionalType, IKey):
    """Floating-point column type that can also serve as a key."""

    @property
    def python_type(self) -> type:
        """``int`` when no fractional digits are compared, else ``decimal.Decimal``."""
        # NOTE(review): this mirrors ``Decimal.python_type`` and never returns
        # ``float`` -- confirm that keying floats through Decimal is intended.
        if self.precision == 0:
            return int
        return decimal.Decimal
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
@attrs.define(frozen=True)
class Numeric(FractionalType, IKey):
    """Numeric column type that can serve as a key; key values are always ``int``."""

    @property
    def python_type(self) -> type:
        """Python class used for key values of this column type."""
        return int
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
@attrs.define(frozen=True)
class StringType(ColType):
    """Base class of the textual column types."""

    # Plain class attribute (not an attrs field).
    python_type = str

    # Collation of the column, when known; keyword-only.
    collation: Optional[Collation] = attrs.field(default=None, kw_only=True)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
@attrs.define(frozen=True)
class ColType_UUID(ColType, IKey):
    """Key column type whose values are ``ArithUUID`` instances."""

    python_type = ArithUUID
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
@attrs.define(frozen=True)
class ColType_Alphanum(ColType, IKey):
    """Key column type whose values are ``ArithAlphanumeric`` instances."""

    python_type = ArithAlphanumeric
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
@attrs.define(frozen=True)
class ColType_Unicode(ColType, IKey):
    """Key column type whose values are ``ArithUnicodeString`` instances."""

    python_type = ArithUnicodeString
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
@attrs.define(frozen=True)
class Native_UUID(ColType_UUID):
    """UUID column type; adds nothing to ``ColType_UUID``."""
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
@attrs.define(frozen=True)
class String_UUID(ColType_UUID, StringType):
    """UUID stored in a string column."""

    # Case is important for UUIDs stored as regular string, not native UUIDs stored as numbers.
    # We slice them internally as numbers, but render them back to SQL as lower/upper case.
    # None means we do not know for sure, behave as with False, but it might be unreliable.
    lowercase: Optional[bool] = None
    uppercase: Optional[bool] = None

    def make_value(self, v: str) -> ArithUUID:
        """Build the key value for ``v``, forwarding this column's case flags."""
        case_flags = {"lowercase": self.lowercase, "uppercase": self.uppercase}
        return self.python_type(v, **case_flags)
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
@attrs.define(frozen=True)
class String_Alphanum(ColType_Alphanum, StringType):
    """Alphanumeric key stored in a string column."""

    @staticmethod
    def test_value(value: str) -> bool:
        """Tell whether ``ArithAlphanumeric`` accepts ``value`` (i.e. raises no ValueError)."""
        try:
            ArithAlphanumeric(value)
        except ValueError:
            return False
        else:
            return True

    def make_value(self, value) -> ArithAlphanumeric:
        """Return ``value`` as an ``ArithAlphanumeric``, converting only when needed."""
        if not isinstance(value, ArithAlphanumeric):
            # Coerce non-string primitives (e.g., integers) to string for alphanumeric representation
            value = ArithAlphanumeric(str(value))
        return value
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
@attrs.define(frozen=True)
class String_VaryingAlphanum(String_Alphanum):
    """Alphanumeric string key; adds nothing to ``String_Alphanum`` (no ``length`` field)."""
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
@attrs.define(frozen=True)
class String_VaryingUnicode(ColType_Unicode, StringType):
    """Unicode string key; combines ``ColType_Unicode`` and ``StringType`` without additions."""
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
@attrs.define(frozen=True)
class String_FixedAlphanum(String_Alphanum):
    """Alphanumeric string key whose values must all have exactly ``length`` characters."""

    length: int

    def make_value(self, value):
        """
        Return ``value`` as a key value of this column.

        Instances of ``python_type`` pass through unchanged. Anything else must
        have ``len(value) == self.length``, otherwise ValueError is raised.
        """
        key_type = self.python_type
        if isinstance(value, key_type):
            return value
        # NOTE(review): unlike the parent's make_value, no str() coercion happens
        # here, so a value without len() (e.g. an int) raises TypeError -- confirm
        # whether that is intended.
        expected = self.length
        if len(value) != expected:
            raise ValueError(f"Expected alphanumeric value of length {self.length}, but got '{value}'.")
        return key_type(value, max_len=expected)
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
@attrs.define(frozen=True)
class Text(ColType_Unicode, StringType):
    """Text column type; always reports itself as unsupported."""

    @property
    def supported(self) -> bool:
        """Always False for this type."""
        return False
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
# Most DBMSes call this type JSON/JSONB; only Snowflake calls it OBJECT.
@attrs.define(frozen=True)
class JSON(ColType):
    """JSON column type; adds nothing to ``ColType``."""
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
@attrs.define(frozen=True)
class Array(ColType):
    """Array column type."""

    # Column type of the array's elements.
    item_type: ColType
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
# Unlike JSON, structs are not free-form: they have a very specific set of fields and types.
# Those fields are not parsed or used for now, but that can be added later.
# For example, in BigQuery:
# - https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
# - https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#struct_literals
@attrs.define(frozen=True)
class Struct(ColType):
    """Struct column type; adds nothing to ``ColType``."""
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
@attrs.define(frozen=True)
class Integer(NumericType, IKey):
    """Integer column type that can serve as a key."""

    # Both are attrs fields with defaults; precision must stay 0 (checked below).
    precision: int = 0
    python_type: type = int

    def __attrs_post_init__(self) -> None:
        """Reject any precision other than zero."""
        # NOTE: an ``assert`` is skipped when Python runs with -O.
        assert self.precision == 0
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
@attrs.define(frozen=True)
class UnknownColType(ColType):
    """Placeholder for a column type this module has no class for; always unsupported."""

    # Text describing the unrecognised type (presumably the type name reported
    # by the database -- verify against callers).
    text: str

    @property
    def supported(self) -> bool:
        """Always False for this type."""
        return False
|
data_diff/config.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
from typing import Any, Dict
|
|
18
|
+
|
|
19
|
+
import toml
|
|
20
|
+
|
|
21
|
+
_ARRAY_FIELDS = (
|
|
22
|
+
"key_columns",
|
|
23
|
+
"columns",
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ConfigParseError(Exception):
    """Raised when the configuration is malformed or incomplete."""
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def is_uri(s: str) -> bool:
    """Tell whether ``s`` contains a ``://`` scheme separator anywhere in it."""
    return s.find("://") != -1
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _apply_config(config: Dict[str, Any], run_name: str, kw: Dict[str, Any]):
    """
    Merge a parsed configuration with the caller-supplied keyword arguments.

    Args:
        config: Parsed configuration. It is consumed: its ``database`` and
            ``run`` sections are popped and nested dicts are modified in place.
        run_name: Name of the run section to apply on top of ``run.default``.
            When empty, only ``run.default`` is used.
        kw: Keyword arguments supplied by the caller. If ``database1`` is
            given, ``database1/2`` and ``table1/2`` are popped from it and
            override the sources in the configuration.

    Returns:
        A new dict: ``kw``, overridden by the run's settings, overridden again
        by the truthy values of ``kw``; the resolved run settings are also
        stored under ``"__conf__"``.

    Raises:
        ConfigParseError: unknown top-level options, unknown run, missing or
            malformed source ``1``/``2``, unknown database name, or a database
            entry without a driver.
        ValueError: ``database1`` was given without the other three arguments.
    """
    _resolve_env(config)

    # Load config
    databases = config.pop("database", {})
    runs = config.pop("run", {})
    if config:
        raise ConfigParseError(f"Unknown option(s): {config}")

    # Init run_args
    run_args = runs.get("default") or {}
    if run_name:
        if run_name not in runs:
            raise ConfigParseError(f"Cannot find run '{run_name}' in configuration.")
        run_args.update(runs[run_name])
    else:
        run_name = "default"

    if kw.get("database1") is not None:
        for attr in ("table1", "database2", "table2"):
            if kw[attr] is None:
                raise ValueError(f"Specified database1 but not {attr}. Must specify all 4 arguments, or neither.")

        # Explicit arguments replace whatever sources the configuration declared.
        for index in "12":
            run_args[index] = {attr: kw.pop(f"{attr}{index}") for attr in ("database", "table")}

    # Make sure array fields are decoded as tuple: array fields in toml are
    # decoded as list, but the TableSegment object requires tuple type.
    for field in _ARRAY_FIELDS:
        if isinstance(run_args.get(field), list):
            run_args[field] = tuple(run_args[field])

    # Process databases + tables
    for index in "12":
        try:
            args = run_args.pop(index)
        except KeyError:
            raise ConfigParseError(
                f"Could not find source #{index}: Expecting a key of '{index}' containing '.database' and '.table'."
            ) from None
        for attr in ("database", "table"):
            if attr not in args:
                raise ConfigParseError(f"Running 'run.{run_name}': Connection #{index} is missing attribute '{attr}'.")

        database = args.pop("database")
        table = args.pop("table")
        threads = args.pop("threads", None)
        if args:
            raise ConfigParseError(f"Unexpected attributes for connection #{index}: {args}")

        if not is_uri(database):
            # Keep the name: error messages must show it, not the settings dict,
            # which may contain credentials.
            database_name = database
            if database_name not in databases:
                raise ConfigParseError(
                    f"Database '{database_name}' not found in list of databases. Available: {list(databases)}."
                )
            database = dict(databases[database_name])
            if "driver" not in database:
                raise ConfigParseError(f"Database '{database_name}' did not specify a driver.")

        run_args[f"database{index}"] = database
        run_args[f"table{index}"] = table
        if threads is not None:
            run_args[f"threads{index}"] = int(threads)

    # Update keywords
    new_kw = dict(kw)  # Set defaults
    new_kw.update(run_args)  # Apply config
    new_kw.update({k: v for k, v in kw.items() if v})  # Apply non-empty defaults

    new_kw["__conf__"] = run_args

    return new_kw
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# There are no strict requirements for the environment variable name format.
|
|
110
|
+
# But most shells only allow alphanumeric characters and underscores.
|
|
111
|
+
# https://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html
|
|
112
|
+
# "Environment variable names (...) consist solely of uppercase letters, digits, and the '_' (underscore)"
|
|
113
|
+
_ENV_VAR_PATTERN = r"\$\{([A-Za-z0-9_]+)\}"
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _resolve_env(config: Dict[str, Any]) -> None:
    """
    Expand ``${ENV_VAR_NAME}`` references in the string values of ``config``, in place.

    Nested dicts are processed recursively; values of any other type (lists
    included) are left untouched. References to missing environment variables
    are replaced with an empty string.
    """
    for name in config:
        entry = config[name]
        if isinstance(entry, str):
            # Re-assigning an existing key while iterating is safe: the dict's size does not change.
            config[name] = re.sub(_ENV_VAR_PATTERN, _replace_match, entry)
        elif isinstance(entry, dict):
            _resolve_env(entry)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _replace_match(match: re.Match) -> str:
|
|
129
|
+
# Lookup referenced variable in environment.
|
|
130
|
+
# Replace with empty string if not found
|
|
131
|
+
referenced_var = match.group(1) # group(0) is the whole string
|
|
132
|
+
return os.environ.get(referenced_var, "")
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def apply_config_from_file(path: str, run_name: str, kw: Dict[str, Any]):
    """
    Load the TOML configuration at ``path`` and merge it with ``kw``.

    Args:
        path: Location of the TOML file.
        run_name: Run section to apply; see ``_apply_config``.
        kw: Caller-supplied keyword arguments; see ``_apply_config``.

    Returns:
        The merged keyword dict produced by ``_apply_config``.
    """
    # TOML documents are UTF-8 by specification; do not depend on the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        return _apply_config(toml.load(f), run_name, kw)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def apply_config_from_string(toml_config: str, run_name: str, kw: Dict[str, Any]):
    """Parse ``toml_config`` as TOML text and merge it with ``kw`` through ``_apply_config``."""
    parsed = toml.loads(toml_config)
    return _apply_config(parsed, run_name, kw)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from data_diff.databases._connect import Connect as Connect
|
|
16
|
+
from data_diff.databases._connect import connect as connect
|
|
17
|
+
from data_diff.databases.base import (
|
|
18
|
+
CHECKSUM_HEXDIGITS,
|
|
19
|
+
CHECKSUM_OFFSET,
|
|
20
|
+
MD5_HEXDIGITS,
|
|
21
|
+
BaseDialect,
|
|
22
|
+
ConnectError,
|
|
23
|
+
Database,
|
|
24
|
+
QueryError,
|
|
25
|
+
)
|
|
26
|
+
from data_diff.databases.bigquery import BigQuery as BigQuery
|
|
27
|
+
from data_diff.databases.clickhouse import Clickhouse as Clickhouse
|
|
28
|
+
from data_diff.databases.databricks import Databricks as Databricks
|
|
29
|
+
from data_diff.databases.duckdb import DuckDB as DuckDB
|
|
30
|
+
from data_diff.databases.mssql import MsSQL as MsSQL
|
|
31
|
+
from data_diff.databases.mysql import MySQL as MySQL
|
|
32
|
+
from data_diff.databases.oracle import Oracle as Oracle
|
|
33
|
+
from data_diff.databases.postgresql import PostgreSQL as PostgreSQL
|
|
34
|
+
from data_diff.databases.presto import Presto as Presto
|
|
35
|
+
from data_diff.databases.redshift import Redshift as Redshift
|
|
36
|
+
from data_diff.databases.snowflake import Snowflake as Snowflake
|
|
37
|
+
from data_diff.databases.trino import Trino as Trino
|
|
38
|
+
from data_diff.databases.vertica import Vertica as Vertica
|