dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import base64
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
from typing import Any, Dict, List, Optional
|
|
19
|
+
|
|
20
|
+
from loguru import logger
|
|
21
|
+
from sqlalchemy import create_engine
|
|
22
|
+
|
|
23
|
+
from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
|
|
24
|
+
from dcs_core.core.common.models.data_source_resource import RawColumnInfo
|
|
25
|
+
from dcs_core.core.datasource.sql_datasource import SQLDataSource
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class BigQueryDataSource(SQLDataSource):
    """SQL data source backed by Google BigQuery.

    Connection parameters are read from ``data_connection``:
    ``project`` and ``dataset``, plus credentials supplied either directly
    as ``credentials_base64`` or via ``keyfile`` (a path to a service-account
    JSON file, an inline base64 string, or an inline JSON document).
    """

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)
        self.project_id = self.data_connection.get("project")
        self.dataset_id = self.data_connection.get("dataset")
        # The BigQuery "dataset" plays the role of a schema elsewhere in the SDK.
        self.schema_name = self.dataset_id
        self.keyfile = self.data_connection.get("keyfile")
        self.credentials_base64 = self.data_connection.get("credentials_base64")

    def connect(self) -> Any:
        """
        Connect to the data source.

        Credentials are resolved in order of precedence:
        1. ``credentials_base64`` (already base64-encoded service-account JSON)
        2. ``keyfile`` as a filesystem path to a JSON key file
        3. ``keyfile`` as an inline base64 string, base64-decodable text,
           or a raw JSON document

        :raises DataChecksDataSourcesConnectionError: if no credentials can be
            resolved or the connection attempt fails.
        """
        try:
            credentials = self._resolve_credentials()
            if not credentials:
                # BUG FIX: was a bare `raise` with no active exception, which
                # itself raised an uninformative RuntimeError. Raise a
                # descriptive error instead; it is wrapped by the handler below.
                raise ValueError("No BigQuery credentials provided (expected 'keyfile' or 'credentials_base64')")
            url = f"bigquery://{self.project_id}/{self.dataset_id}"
            engine = create_engine(url, credentials_base64=credentials)
            self.connection = engine.connect()
            return self.connection
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(message=f"Failed to connect to BigQuery data source: [{str(e)}]")

    def _resolve_credentials(self) -> Optional[str]:
        """Return base64-encoded service-account credentials, or None if unavailable.

        Preserves the original resolution order: explicit ``credentials_base64``
        first, then ``keyfile`` interpreted as a path, a base64 string, a
        base64-decodable document, or finally raw JSON.
        """
        if self.credentials_base64:
            return self.credentials_base64
        if not self.keyfile:
            return None
        if os.path.exists(self.keyfile):
            # keyfile is a path: read the JSON key and re-encode it as base64.
            with open(self.keyfile, "rb") as f:
                raw = f.read()
            parsed = json.loads(raw)
            return base64.b64encode(json.dumps(parsed).encode("utf-8")).decode("utf-8")
        try:
            if self._is_base64(self.keyfile):
                return self.keyfile
            # NOTE(review): this returns the *decoded* text, not base64 —
            # kept as-is to preserve original behavior; confirm intent.
            return base64.b64decode(self.keyfile).decode("utf-8")
        except Exception as e:
            logger.error(f"Failed to decode keyfile: {e}")
            # Last resort: treat keyfile as an inline JSON document.
            parsed = json.loads(self.keyfile)
            return base64.b64encode(json.dumps(parsed).encode("utf-8")).decode("utf-8")

    def _is_base64(self, s: str) -> bool:
        """Return True if *s* is a syntactically valid base64 string."""
        try:
            if len(s) % 4 != 0:
                return False
            base64.b64decode(s, validate=True)
            return True
        except Exception:
            return False

    def quote_column(self, column: str) -> str:
        """
        Quote the column name
        :param column: name of the column
        :return: quoted column name
        """
        return f"`{column}`"

    def qualified_table_name(self, table_name: str) -> str:
        """
        Get the qualified table name
        :param table_name: name of the table
        :return: qualified table name
        """
        if self.project_id and self.dataset_id:
            return f"`{self.project_id}`.`{self.dataset_id}`.`{table_name}`"
        elif self.dataset_id:
            return f"`{self.dataset_id}`.`{table_name}`"
        elif self.project_id:
            return f"`{self.project_id}`.`{table_name}`"

        return f"`{table_name}`"

    def query_get_table_names(self, schema: str | None = None) -> List[str]:
        """
        Get the list of BigQuery tables (excluding views) in a dataset.
        :param schema: optional dataset name
        :return: list of table names
        """
        schema = schema or self.schema_name
        project = self.project_id
        query = (
            f"SELECT table_name FROM `{project}.{schema}.INFORMATION_SCHEMA.TABLES` "
            "WHERE table_type = 'BASE TABLE' "
            "ORDER BY table_name"
        )
        rows = self.fetchall(query)
        return [row[0] for row in rows] if rows else []

    def query_get_table_columns(
        self,
        table: str,
        schema: str | None = None,
    ) -> RawColumnInfo:
        """
        Get column metadata for a table from INFORMATION_SCHEMA.COLUMNS.

        :param table: name of the table
        :param schema: optional dataset name (defaults to the configured dataset)
        :return: mapping of column name to RawColumnInfo
        :raises RuntimeError: if the table does not exist or has no columns
        """
        schema = schema or self.schema_name
        # SECURITY NOTE(review): `table` is interpolated directly into SQL;
        # a parameterized query would be safer if callers pass untrusted input.
        query = (
            "SELECT column_name, data_type, "
            "NULL AS datetime_precision, "
            "NULL AS numeric_precision, "
            "NULL AS numeric_scale, "
            "NULL AS collation_name, "
            "NULL AS character_maximum_length "
            f"FROM `{self.project_id}.{schema}.INFORMATION_SCHEMA.COLUMNS` "
            f"WHERE table_name = '{table}'"
        )

        rows = self.fetchall(query)
        if not rows:
            raise RuntimeError(f"{table}: Table, {schema}: Schema, does not exist, or has no columns")
        column_info = {
            r[0]: RawColumnInfo(
                column_name=self.safe_get(r, 0),
                data_type=self.safe_get(r, 1),
                datetime_precision=self.safe_get(r, 2),
                numeric_precision=self.safe_get(r, 3),
                numeric_scale=self.safe_get(r, 4),
                collation_name=self.safe_get(r, 5),
                character_maximum_length=self.safe_get(r, 6),
            )
            for r in rows
        }
        return column_info

    def create_view(
        self,
        query: Optional[str] = None,
        dataset: Optional[str] = None,
        view_name: Optional[str] = None,
    ) -> str | None:
        """Create a view; returns its fully qualified name, or None on failure.

        With no *query*, creates an empty one-column placeholder view.
        """
        view_name = self.generate_view_name(view_name=view_name)
        # BUG FIX: was `self.project`, which is never set (AttributeError);
        # __init__ stores the project as `self.project_id`.
        full_name = f"`{self.project_id}`.`{dataset}`.`{view_name}`" if dataset else f"`{view_name}`"
        try:
            if query is None:
                create_view_query = f"CREATE VIEW {full_name} AS SELECT 1 AS dummy_column WHERE FALSE"
            else:
                create_view_query = f"CREATE VIEW {full_name} AS {query}"
            self.connection.execute(create_view_query)
            return full_name
        except Exception as e:
            logger.error(f"Error creating view: {e}")
            return None

    def drop_view(self, view_name: str, dataset: Optional[str] = None) -> bool:
        """Drop a view; returns True on success, False on failure."""
        # BUG FIX: was `self.project` (attribute does not exist) — see create_view.
        full_name = f"`{self.project_id}`.`{dataset}`.`{view_name}`" if dataset else f"`{view_name}`"
        try:
            drop_view_query = f"DROP VIEW {full_name}"
            self.connection.execute(drop_view_query)
            return True
        except Exception as e:
            logger.error(f"Error dropping view: {e}")
            return False
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import Any, Dict
|
|
16
|
+
|
|
17
|
+
from sqlalchemy import create_engine
|
|
18
|
+
from sqlalchemy.engine import URL
|
|
19
|
+
|
|
20
|
+
from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
|
|
21
|
+
from dcs_core.core.datasource.sql_datasource import SQLDataSource
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DatabricksDataSource(SQLDataSource):
    """SQL data source for a Databricks SQL warehouse (token-authenticated)."""

    def __init__(self, data_source_name: str, data_connection: Dict):
        super().__init__(data_source_name, data_connection)

    def connect(self) -> Any:
        """
        Connect to the data source
        """
        try:
            conn = self.data_connection
            url = URL.create(
                "databricks",
                username="token",
                password=conn.get("token"),
                host=conn.get("host"),
                port=conn.get("port", 443),
                database=conn.get("schema"),
                query={
                    "http_path": conn.get("http_path"),
                    "catalog": conn.get("catalog"),
                },
            )
            # NOTE(review): echo=True makes SQLAlchemy log every statement —
            # presumably a debug leftover; confirm before removing.
            self.connection = create_engine(url, echo=True).connect()
            return self.connection
        except Exception as e:
            raise DataChecksDataSourcesConnectionError(
                message=f"Failed to connect to Databricks data source: [{str(e)}]"
            )