dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import base64
|
|
16
|
+
import logging
|
|
17
|
+
from typing import Any, ClassVar, List, Optional, Type, Union
|
|
18
|
+
|
|
19
|
+
import attrs
|
|
20
|
+
|
|
21
|
+
from data_diff.abcs.database_types import (
|
|
22
|
+
Boolean,
|
|
23
|
+
Date,
|
|
24
|
+
DbPath,
|
|
25
|
+
Decimal,
|
|
26
|
+
Float,
|
|
27
|
+
FractionalType,
|
|
28
|
+
TemporalType,
|
|
29
|
+
Text,
|
|
30
|
+
Time,
|
|
31
|
+
Timestamp,
|
|
32
|
+
TimestampTZ,
|
|
33
|
+
)
|
|
34
|
+
from data_diff.databases.base import (
|
|
35
|
+
CHECKSUM_MASK,
|
|
36
|
+
CHECKSUM_OFFSET,
|
|
37
|
+
BaseDialect,
|
|
38
|
+
ConnectError,
|
|
39
|
+
Database,
|
|
40
|
+
ThreadLocalInterpreter,
|
|
41
|
+
import_helper,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@import_helper("snowflake")
def import_snowflake():
    """Import the Snowflake connector and the private-key helpers on demand.

    The imports are performed inside the function body, so they are only
    attempted when this function is actually called.

    Returns:
        A ``(snowflake, serialization, default_backend)`` tuple: the top-level
        ``snowflake`` package (with ``snowflake.connector`` loaded), the
        ``cryptography`` serialization module and the ``default_backend``
        callable.
    """
    import snowflake.connector
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization

    loaded = (snowflake, serialization, default_backend)
    return loaded
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Dialect(BaseDialect):
    """SQL-fragment generator for Snowflake.

    Every method returns a piece of SQL text (or raises); nothing here talks
    to the database.
    """

    name = "Snowflake"
    ROUNDS_ON_PREC_LOSS = False
    # Maps the type names reported by Snowflake to the column-type classes
    # used by the rest of the package.
    TYPE_CLASSES = {
        # Timestamps
        "TIMESTAMP_NTZ": Timestamp,
        "TIMESTAMP_LTZ": Timestamp,
        "TIMESTAMP_TZ": TimestampTZ,
        "DATE": Date,
        "TIME": Time,
        # Numbers
        "NUMBER": Decimal,
        "FLOAT": Float,
        # Text
        "TEXT": Text,
        # Boolean
        "BOOLEAN": Boolean,
    }

    def explain_as_text(self, query: str) -> str:
        """Return ``query`` prefixed with ``EXPLAIN USING TEXT``."""
        return f"EXPLAIN USING TEXT {query}"

    def quote(self, s: str, is_table: bool = False) -> str:
        """Wrap ``s`` in double quotes; ``is_table`` is accepted but unused.

        NOTE(review): embedded double quotes in ``s`` are not escaped.
        """
        return f'"{s}"'

    def to_string(self, s: str) -> str:
        """Return a SQL expression casting ``s`` to ``string``."""
        return f"cast({s} as string)"

    def set_timezone_to_utc(self) -> str:
        """Return the statement that switches the session time zone to UTC."""
        return "ALTER SESSION SET TIMEZONE = 'UTC'"

    def optimizer_hints(self, hints: str) -> str:
        """Not supported for Snowflake.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Optimizer hints not yet implemented in snowflake")

    def type_repr(self, t) -> str:
        """Return the SQL type name for ``t``.

        ``TimestampTZ`` is rendered as ``timestamp_tz(<precision>)``; every
        other type is delegated to the base dialect.
        """
        if isinstance(t, TimestampTZ):
            return f"timestamp_tz({t.precision})"
        return super().type_repr(t)

    def md5_as_int(self, s: str) -> str:
        """Return a SQL expression hashing ``s`` to an integer checksum.

        The lower 64 bits of the MD5 are masked with ``CHECKSUM_MASK`` and
        shifted down by ``CHECKSUM_OFFSET``.
        """
        return f"BITAND(md5_number_lower64({s}), {CHECKSUM_MASK}) - {CHECKSUM_OFFSET}"

    def md5_as_hex(self, s: str) -> str:
        """Return a SQL expression computing ``md5`` of ``s``."""
        return f"md5({s})"

    def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
        """Return a SQL expression rendering a temporal column as text.

        * Date columns are cast to ``varchar``.
        * Time columns are rounded to ``coltype.precision`` fractional digits
          and formatted as ``HH24:MI:SS.FF6``.
        * Anything else is treated as a timestamp: converted to UTC, rounded
          (when ``coltype.rounds``) or cast to ``timestamp(precision)``, and
          formatted as ``YYYY-MM-DD HH24:MI:SS.FF6``.

        Args:
            value: SQL expression producing the column value.
            coltype: Column type; ``is_date`` / ``is_time`` flags are honoured
                when the type provides them.
        """
        # Not every column type exposes the is_date / is_time flags. Only a
        # missing attribute means "no flag"; any other error should surface
        # instead of being silently swallowed.
        try:
            is_date = coltype.is_date
            is_time = coltype.is_time
        except AttributeError:
            is_date = False
            is_time = False

        if isinstance(coltype, Date) or is_date:
            return f"({value}::varchar)"

        if isinstance(coltype, Time) or is_time:
            # Work in microseconds since midnight so that rounding to the
            # column precision is a plain numeric round().
            microseconds = f"TIMEDIFF(microsecond, cast('00:00:00' as time), {value})"
            rounded = f"round({microseconds}, -6 + {coltype.precision})"
            time_value = f"TIMEADD(microsecond, {rounded}, cast('00:00:00' as time))"
            return f"TO_VARCHAR({time_value}, 'HH24:MI:SS.FF6')"

        if coltype.rounds:
            timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, convert_timezone('UTC', {value})::timestamp(9))/1000000000, {coltype.precision}))"
        else:
            timestamp = f"cast(convert_timezone('UTC', {value}) as timestamp({coltype.precision}))"

        return f"to_char({timestamp}, 'YYYY-MM-DD HH24:MI:SS.FF6')"

    def normalize_number(self, value: str, coltype: FractionalType) -> str:
        """Return ``value`` cast to ``decimal(38, precision)`` and then to string."""
        return self.to_string(f"cast({value} as decimal(38, {coltype.precision}))")

    def normalize_boolean(self, value: str, _coltype: Boolean) -> str:
        """Return ``value`` cast to ``int`` and then to string."""
        return self.to_string(f"{value}::int")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@attrs.define(frozen=False, init=False, kw_only=True)
class Snowflake(Database):
    """Database implementation backed by ``snowflake.connector``."""

    DIALECT_CLASS: ClassVar[Type[BaseDialect]] = Dialect
    CONNECT_URI_HELP = "snowflake://<user>:<password>@<account>/<database>/<SCHEMA>?warehouse=<WAREHOUSE>"
    CONNECT_URI_PARAMS = ["database", "schema"]
    CONNECT_URI_KWPARAMS = ["warehouse"]

    # Connection object returned by snowflake.connector.connect().
    _conn: Any

    def __init__(self, *, schema: str, key: Optional[str] = None, key_content: Optional[str] = None, **kw) -> None:
        """Open a Snowflake connection.

        Args:
            schema: Default schema; must not contain a double quote because it
                is passed to the connector wrapped in double quotes.
            key: Path to a PEM private-key file (mutually exclusive with
                ``key_content``).
            key_content: Base64-encoded PEM private key (mutually exclusive
                with ``key``).
            **kw: Forwarded to ``snowflake.connector.connect``. When a key is
                supplied, ``private_key_passphrase`` (if truthy) is used to
                decrypt it and ``password`` must not be present.

        Raises:
            ConnectError: if ``schema`` contains a double quote, if both
                ``key`` and ``key_content`` are given, or if a key is combined
                with ``password``.
        """
        super().__init__()
        snowflake, serialization, default_backend = import_snowflake()
        logging.getLogger("snowflake.connector").setLevel(logging.WARNING)

        # Ignore the error: snowflake.connector.network.RetryRequest: could not find io module state
        # It's a known issue: https://github.com/snowflakedb/snowflake-connector-python/issues/145
        logging.getLogger("snowflake.connector.network").disabled = True

        # Explicit raise rather than `assert`: asserts are stripped under
        # `python -O`, and the schema is interpolated into a quoted identifier.
        if '"' in schema:
            raise ConnectError("Schema name should not contain quotes!")
        if key_content and key:
            raise ConnectError("Only key value or key file path can be specified, not both")

        key_bytes = None
        if key:
            with open(key, "rb") as key_file:
                key_bytes = key_file.read()
        if key_content:
            key_bytes = base64.b64decode(key_content)

        # When a private key was supplied (by path or inline), decrypt it if
        # needed and hand it to the connector as unencrypted DER/PKCS8 bytes
        # under "private_key".
        if key_bytes:
            if "password" in kw:
                raise ConnectError("Cannot use password and key at the same time")

            passphrase = kw.get("private_key_passphrase")
            encoded_passphrase = passphrase.encode() if passphrase else None

            p_key = serialization.load_pem_private_key(
                key_bytes,
                password=encoded_passphrase,
                backend=default_backend(),
            )

            kw["private_key"] = p_key.private_bytes(
                encoding=serialization.Encoding.DER,
                format=serialization.PrivateFormat.PKCS8,
                encryption_algorithm=serialization.NoEncryption(),
            )

        self._conn = snowflake.connector.connect(schema=f'"{schema}"', **kw)

        self.default_schema = schema

    def close(self):
        """Run the base-class close logic, then close the connection."""
        super().close()
        self._conn.close()

    def _query(self, sql_code: Union[str, ThreadLocalInterpreter]):
        """Uses the standard SQL cursor interface"""
        return self._query_conn(self._conn, sql_code)

    @staticmethod
    def _escape_string_literal(value: str) -> str:
        """Escape ``value`` for use inside a single-quoted SQL string constant.

        Backslashes are doubled first so the quote doubling that follows is
        not itself re-interpreted as an escape sequence.
        """
        return value.replace("\\", "\\\\").replace("'", "''")

    def select_table_schema(self, path: DbPath) -> str:
        """Provide SQL for selecting the table's column metadata.

        The query returns, per column: name, data type, datetime precision,
        numeric precision, numeric scale, collation name (``'utf8'`` when
        NULL) and character maximum length.

        Args:
            path: ``(table,)``, ``(schema, table)`` or
                ``(database, schema, table)``.

        Raises:
            ValueError: if ``path`` has an unsupported number of parts.
        """
        database, schema, name = self._normalize_table_path(path)
        info_schema_path = ["information_schema", "columns"]
        if database:
            info_schema_path.insert(0, database)

        # Names are embedded as string constants, so escape them; for names
        # without quotes or backslashes the generated SQL is unchanged.
        name_literal = self._escape_string_literal(name)
        schema_literal = self._escape_string_literal(schema)

        return (
            "SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale, "
            " coalesce(collation_name, 'utf8') as collation_name, "
            " coalesce(character_maximum_length, NULL) as character_maximum_length "
            f"FROM {'.'.join(info_schema_path)} "
            f"WHERE table_name = '{name_literal}' AND table_schema = '{schema_literal}'"
        )

    def _normalize_table_path(self, path: DbPath) -> DbPath:
        """Expand ``path`` to a ``(database, schema, table)`` triple.

        A one-part path uses ``self.default_schema``; one- and two-part paths
        get ``None`` as the database; a three-part path is returned as-is.

        Raises:
            ValueError: if ``path`` does not have 1, 2 or 3 parts.
        """
        if len(path) == 1:
            return None, self.default_schema, path[0]
        elif len(path) == 2:
            return None, path[0], path[1]
        elif len(path) == 3:
            return path

        raise ValueError(
            f"{self.name}: Bad table path for {self}: '{'.'.join(path)}'. Expected format: table, schema.table, or database.schema.table"
        )

    @property
    def is_autocommit(self) -> bool:
        """Always ``True`` for this driver."""
        return True

    def query_table_unique_columns(self, path: DbPath) -> List[str]:
        """Return an empty list; no unique columns are reported for ``path``."""
        return []
|