dcs-sdk 1.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import glob
|
|
15
|
+
from abc import ABC
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Dict, List, TypeVar, Union
|
|
18
|
+
|
|
19
|
+
from pyparsing import Combine, Group, Literal
|
|
20
|
+
from pyparsing import Optional as OptionalParsing
|
|
21
|
+
from pyparsing import Word, delimitedList, nums, oneOf
|
|
22
|
+
|
|
23
|
+
from dcs_core.core.common.errors import DataChecksConfigurationError
|
|
24
|
+
from dcs_core.core.common.models.configuration import (
|
|
25
|
+
Configuration,
|
|
26
|
+
DataSourceConfiguration,
|
|
27
|
+
DataSourceConnectionConfiguration,
|
|
28
|
+
DataSourceType,
|
|
29
|
+
LocalFileStorageParameters,
|
|
30
|
+
MetricConfiguration,
|
|
31
|
+
MetricsFilterConfiguration,
|
|
32
|
+
MetricStorageConfiguration,
|
|
33
|
+
MetricStorageType,
|
|
34
|
+
)
|
|
35
|
+
from dcs_core.core.common.models.data_source_resource import Field, Index, Table
|
|
36
|
+
from dcs_core.core.common.models.metric import MetricsType
|
|
37
|
+
from dcs_core.core.common.models.validation import ConditionType, Threshold, Validation
|
|
38
|
+
from dcs_core.core.configuration.config_loader import parse_config
|
|
39
|
+
|
|
40
|
+
# Maps the textual comparison operators accepted in threshold strings
# (e.g. ">= 10 & < 20") to their ConditionType members. Consumed by
# MetricsConfigParser._parse_threshold_str via a pyparsing parse action.
CONDITION_TYPE_MAPPING = {
    ">=": ConditionType.GTE,
    "<=": ConditionType.LTE,
    "=": ConditionType.EQ,
    "<": ConditionType.LT,
    ">": ConditionType.GT,
}


# Type variables for the ConfigParser ABC below: INPUT is constrained to the
# raw YAML-derived containers (Dict or List); OUTPUT is the parsed result type.
OUTPUT = TypeVar("OUTPUT")
INPUT = TypeVar("INPUT", Dict, List)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class ConfigParser(ABC):
    """Base class for configuration-section parsers.

    Subclasses turn a raw YAML-derived container (a dict, or a list of
    dicts) into typed configuration objects.
    """

    def parse(self, config: INPUT) -> OUTPUT:
        """Parse *config* into a typed configuration object."""
        raise NotImplementedError
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class DataSourceConfigParser(ConfigParser):
    """Parses the ``data_sources`` section of the configuration."""

    @staticmethod
    def _data_source_connection_config_parser(
        config: Dict,
    ) -> DataSourceConnectionConfiguration:
        """Build a connection configuration from one datasource entry.

        Every connection field is optional; missing keys become ``None``.
        """
        # Hoist the repeated config["connection"] subscript out of the
        # sixteen .get(...) calls.
        connection = config["connection"]
        return DataSourceConnectionConfiguration(
            host=connection.get("host"),
            port=connection.get("port"),
            username=connection.get("username"),
            password=connection.get("password"),
            database=connection.get("database"),
            schema=connection.get("schema"),
            project=connection.get("project"),
            dataset=connection.get("dataset"),
            credentials_base64=connection.get("credentials_base64"),
            keyfile=connection.get("keyfile"),
            token=connection.get("token"),
            catalog=connection.get("catalog"),
            http_path=connection.get("http_path"),
            account=connection.get("account"),
            warehouse=connection.get("warehouse"),
            role=connection.get("role"),
        )

    @staticmethod
    def _check_for_duplicate_names(config_list: List):
        """Raise if two datasource entries share the same ``name``.

        :raises DataChecksConfigurationError: on the first duplicate found.
        """
        # Set membership is O(1); the original scanned a list per entry (O(n^2)).
        seen = set()
        for config in config_list:
            name = config["name"]
            if name in seen:
                raise DataChecksConfigurationError(f"Duplicate datasource names found: {name}")
            seen.add(name)

    def parse(self, config_list: List[Dict]) -> Dict[str, DataSourceConfiguration]:
        """Parse the datasource list into configurations keyed by name.

        :raises DataChecksConfigurationError: if two entries share a name.
        """
        self._check_for_duplicate_names(config_list=config_list)
        data_source_configurations: Dict[str, DataSourceConfiguration] = {}

        for config in config_list:
            name_ = config["name"]
            data_source_configurations[name_] = DataSourceConfiguration(
                name=name_,
                type=DataSourceType(config["type"].lower()),
                connection_config=self._data_source_connection_config_parser(config=config),
            )

        return data_source_configurations
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class StorageConfigParser(ConfigParser):
    """Parses the optional ``storage`` section of the configuration."""

    @staticmethod
    def _local_file_storage_config_parser(config: Dict) -> LocalFileStorageParameters:
        """Validate and build the parameters for local-file metric storage.

        :raises DataChecksConfigurationError: when ``params`` or
            ``params.path`` is missing.
        """
        if "params" not in config:
            raise DataChecksConfigurationError("storage params should be provided for local file storage configuration")
        if "path" not in config["params"]:
            raise DataChecksConfigurationError("path should be provided for local file storage configuration")
        return LocalFileStorageParameters(path=config["params"]["path"])

    def parse(self, config: Dict) -> Union[MetricStorageConfiguration, None]:
        """Return a storage configuration, or ``None`` for unsupported types."""
        # Only local-file storage is supported; anything else parses to None.
        if config["type"] != "local_file":
            return None
        return MetricStorageConfiguration(
            type=MetricStorageType.LOCAL_FILE,
            params=self._local_file_storage_config_parser(config=config),
        )
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
class MetricsConfigParser(ConfigParser):
    """Parses the ``metrics`` section of the configuration.

    Requires the already-parsed datasource configurations so it can resolve
    which kind of resource (table, index, or field) each metric refers to.
    """

    def __init__(self, data_source_configurations: Dict[str, DataSourceConfiguration]):
        self.data_source_configurations = data_source_configurations

    @staticmethod
    def _duplicate_metric_names_check(config: List[Dict]):
        """Raise if two metric entries share the same ``name``."""
        # Set membership is O(1); the original scanned a list per entry (O(n^2)).
        seen = set()
        for metric_yaml_configuration in config:
            name = metric_yaml_configuration["name"]
            if name in seen:
                raise DataChecksConfigurationError(f"Duplicate metric names found: {name}")
            seen.add(name)

    @staticmethod
    def _parse_combined_metric_config(configuration: Dict) -> MetricConfiguration:
        """Parse a COMBINED metric: an expression over other metrics."""
        return MetricConfiguration(
            name=configuration["name"],
            metric_type=MetricsType(configuration["metric_type"].lower()),
            expression=configuration["expression"],
        )

    @staticmethod
    def _parse_resource_table(resource_str: str) -> Table:
        """Parse ``<datasource>.<table>`` into a Table resource."""
        splits = resource_str.split(".")
        if len(splits) != 2:
            raise ValueError(f"Invalid resource string {resource_str}")
        return Table(data_source=splits[0], name=splits[1])

    @staticmethod
    def _parse_resource_index(resource_str: str) -> Index:
        """Parse ``<datasource>.<index>`` into an Index resource."""
        splits = resource_str.split(".")
        if len(splits) != 2:
            raise ValueError(f"Invalid resource string {resource_str}")
        return Index(data_source=splits[0], name=splits[1])

    @staticmethod
    def _parse_resource_field(resource_str: str, belongs_to: str) -> Field:
        """Parse ``<datasource>.<parent>.<field>`` into a Field resource.

        :param belongs_to: either ``"table"`` or ``"index"``.
        :raises ValueError: on a malformed resource string, or on an unknown
            ``belongs_to`` (the original fell through and returned None).
        """
        splits = resource_str.split(".")
        if len(splits) != 3:
            raise ValueError(f"Invalid resource string {resource_str}")
        if belongs_to == "table":
            return Field(belongs_to=Table(data_source=splits[0], name=splits[1]), name=splits[2])
        if belongs_to == "index":
            return Field(belongs_to=Index(data_source=splits[0], name=splits[1]), name=splits[2])
        raise ValueError(f"Invalid resource string {resource_str}")

    def _metric_resource_parser(
        self,
        resource_str: str,
        data_source_type: DataSourceType,
        metric_type: MetricsType,
    ) -> Union[Table, Index, Field]:
        """Resolve a resource string into a Table, Index, or Field.

        Search datasources (OpenSearch/Elasticsearch) use index-based
        resources; everything else uses table-based resources.
        """
        if data_source_type in (DataSourceType.OPENSEARCH, DataSourceType.ELASTICSEARCH):
            if metric_type == MetricsType.DOCUMENT_COUNT:
                return self._parse_resource_index(resource_str)
            return self._parse_resource_field(resource_str, "index")
        if metric_type in (MetricsType.ROW_COUNT, MetricsType.CUSTOM_SQL):
            return self._parse_resource_table(resource_str)
        return self._parse_resource_field(resource_str, "table")

    def _parse_generic_metric_configuration(self, configuration: Dict, metric_type: MetricsType) -> MetricConfiguration:
        """Parse a non-combined metric entry.

        :raises DataChecksConfigurationError: when the metric references a
            datasource name that was never declared.
        """
        resource_str = configuration["resource"]
        data_source_name = resource_str.split(".")[0]

        try:
            data_source_configuration: DataSourceConfiguration = self.data_source_configurations[data_source_name]
        except KeyError as e:
            # Fail with a clear message instead of a bare KeyError when the
            # metric points at an undeclared datasource.
            raise DataChecksConfigurationError(
                f"Unknown data source {data_source_name} in resource {resource_str}"
            ) from e

        metric_configuration = MetricConfiguration(
            name=configuration["name"],
            metric_type=metric_type,
            resource=self._metric_resource_parser(
                resource_str=resource_str,
                data_source_type=data_source_configuration.type,
                metric_type=metric_type,
            ),
            filters=configuration.get("filters"),
        )
        # NOTE(review): the constructor receives ``filters`` while this
        # assignment targets ``filter`` — both kept exactly as in the
        # original; confirm which attribute MetricConfiguration actually uses.
        if "filters" in configuration:
            metric_configuration.filter = MetricsFilterConfiguration(where=configuration["filters"]["where"])
        if "query" in configuration:
            metric_configuration.query = configuration["query"]

        return metric_configuration

    @staticmethod
    def _parse_threshold_str(threshold: str) -> Threshold:
        """Parse a threshold string such as ``">= 10 & <= 20"``.

        :raises DataChecksConfigurationError: wrapping any parse failure.
        """
        try:
            operator = oneOf(">= <= = < >").setParseAction(lambda t: CONDITION_TYPE_MAPPING[t[0]])
            number = Combine(
                OptionalParsing(Literal("-")) + Word(nums) + OptionalParsing(Literal(".") + Word(nums))
            ).setParseAction(lambda t: float(t[0]))

            condition = operator + number
            # delimitedList already consumes the "&" separators. The
            # original's second alternative
            # Group(condition + Literal("&") + condition) was unreachable:
            # Group(condition) in a MatchFirst always matched first.
            conditions = delimitedList(Group(condition), delim="&")
            result = conditions.parseString(threshold)
            # Renamed from ``operator`` to avoid shadowing the pyparsing
            # element above (harmless in Py3 comprehension scope, but confusing).
            return Threshold(**{op: value for op, value in result})

        except Exception as e:
            raise DataChecksConfigurationError(f"Invalid threshold configuration {threshold}: {str(e)}")

    def _parse_validation_configuration(self, validation_config: Dict) -> Validation:
        """Parse a metric's validation block (threshold-only for now)."""
        if "threshold" not in validation_config:
            raise DataChecksConfigurationError(f"Invalid validation configuration {validation_config}")
        threshold = self._parse_threshold_str(threshold=validation_config["threshold"])
        return Validation(threshold=threshold)

    def parse(self, config_list: List[Dict]) -> Dict[str, MetricConfiguration]:
        """Parse the metric list into configurations keyed by metric name."""
        self._duplicate_metric_names_check(config=config_list)
        metric_configurations: Dict[str, MetricConfiguration] = {}

        for config in config_list:
            metric_type = MetricsType(config["metric_type"].lower())
            if metric_type == MetricsType.COMBINED:
                metric_configuration = self._parse_combined_metric_config(configuration=config)
            else:
                metric_configuration = self._parse_generic_metric_configuration(
                    configuration=config, metric_type=metric_type
                )
            if config.get("validation") is not None:
                metric_configuration.validation = self._parse_validation_configuration(config["validation"])
            metric_configurations[metric_configuration.name] = metric_configuration
        return metric_configurations
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def _parse_configuration_from_dict(config_dict: Dict) -> Configuration:
    """Build a Configuration from a merged, YAML-derived dict.

    Expects ``data_sources`` and ``metrics`` keys; ``storage`` is optional.

    :raises DataChecksConfigurationError: wrapping any parsing failure.
    """
    try:
        data_source_configurations = DataSourceConfigParser().parse(config_list=config_dict["data_sources"])
        metric_configurations = MetricsConfigParser(data_source_configurations=data_source_configurations).parse(
            config_list=config_dict["metrics"]
        )

        configuration = Configuration(data_sources=data_source_configurations, metrics=metric_configurations)

        if "storage" in config_dict and config_dict["storage"] is not None:
            configuration.storage = StorageConfigParser().parse(config=config_dict["storage"])
        return configuration
    except Exception as ex:
        # Chain the cause so the original traceback is not lost in the wrap.
        raise DataChecksConfigurationError(message=f"Failed to parse configuration: {str(ex)}") from ex
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def load_configuration_from_yaml_str(yaml_string: str) -> Configuration:
    """Parse a YAML configuration string into a Configuration object."""
    try:
        raw_config: Dict = parse_config(data=yaml_string)
    except Exception as ex:
        raise DataChecksConfigurationError(message=f"Failed to parse configuration: {str(ex)}")
    else:
        return _parse_configuration_from_dict(config_dict=raw_config)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def load_configuration(configuration_path: str) -> Configuration:
    """
    Load configuration from a YAML file or a directory of YAML files.

    :param configuration_path: path to a YAML file, or to a directory whose
        ``*.yaml``/``*.yml`` files are merged into one configuration.
    :return: the parsed Configuration.
    :raises DataChecksConfigurationError: if the path does not exist, the
        directory contains no configuration files, or parsing fails.
    """
    path = Path(configuration_path)
    if not path.exists():
        raise DataChecksConfigurationError(message=f"Configuration file {configuration_path} does not exist")

    if path.is_file():
        with open(configuration_path) as config_yaml_file:
            yaml_string = config_yaml_file.read()
        return load_configuration_from_yaml_str(yaml_string)

    # Directory mode: accept both .yaml and .yml (the original matched only
    # .yaml), and sort so the merge order is deterministic — raw glob order
    # is filesystem-dependent.
    config_files = sorted(
        glob.glob(f"{configuration_path}/*.yaml") + glob.glob(f"{configuration_path}/*.yml")
    )
    if len(config_files) == 0:
        raise DataChecksConfigurationError(message=f"No configuration files found in {configuration_path}")

    config_dict_list: List[Dict] = []
    for config_file in config_files:
        with open(config_file) as config_yaml_file:
            config_dict_list.append(parse_config(data=config_yaml_file.read()))

    final_config_dict = {
        "data_sources": [],
        "metrics": [],
        "storage": None,
    }
    for config_dict in config_dict_list:
        final_config_dict["data_sources"].extend(config_dict.get("data_sources", []))
        final_config_dict["metrics"].extend(config_dict.get("metrics", []))
        if "storage" in config_dict:
            # Last file wins, matching the original behavior.
            final_config_dict["storage"] = config_dict["storage"]

    return _parse_configuration_from_dict(final_config_dict)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from abc import ABC
|
|
16
|
+
from typing import Any, Dict, Optional
|
|
17
|
+
|
|
18
|
+
from dcs_core.core.common.models.configuration import DataSourceLanguageSupport
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DataSource(ABC):
    """Abstract base for every data-source implementation.

    Concrete subclasses provide connection management; the profiling type
    lists below classify field values by their Python type name.
    """

    # Python type names treated as numeric / text during field profiling.
    NUMERIC_PYTHON_TYPES_FOR_PROFILING = ["int", "float"]
    TEXT_PYTHON_TYPES_FOR_PROFILING = ["str"]

    def __init__(
        self,
        data_source_name: str,
        data_connection: Dict,
        language_support: Optional[DataSourceLanguageSupport] = DataSourceLanguageSupport.SQL,
    ):
        self._data_source_name: str = data_source_name
        self.data_connection: Dict = data_connection
        self.language_support = language_support

    @property
    def data_source_name(self) -> str:
        """The configured name of this data source."""
        return self._data_source_name

    def connect(self) -> Any:
        """Open a connection to the underlying data source."""
        raise NotImplementedError("connect method is not implemented")

    def is_connected(self) -> bool:
        """Report whether a live connection currently exists."""
        raise NotImplementedError("is_connected method is not implemented")

    def close(self):
        """Release any connection held by this data source."""
        raise NotImplementedError("close_connection method is not implemented")
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
import importlib
|
|
15
|
+
from dataclasses import asdict
|
|
16
|
+
from typing import Dict, List
|
|
17
|
+
|
|
18
|
+
from dcs_core.core.common.errors import DataChecksDataSourcesConnectionError
|
|
19
|
+
from dcs_core.core.common.models.configuration import (
|
|
20
|
+
Configuration,
|
|
21
|
+
DataSourceConfiguration,
|
|
22
|
+
)
|
|
23
|
+
from dcs_core.core.datasource.base import DataSource
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DataSourceManager:
    """
    Data source manager.

    This class is responsible for managing the data sources: it instantiates
    one DataSource per configured entry and opens their connections.
    """

    # Maps a datasource type to the class name exported by
    # dcs_core.integrations.databases.<type>.
    DATA_SOURCE_CLASS_NAME_MAPPER = {
        "opensearch": "OpenSearchDataSource",
        "elasticsearch": "ElasticSearchDataSource",
        "postgres": "PostgresDataSource",
        "mysql": "MysqlDataSource",
        "bigquery": "BigQueryDataSource",
        "databricks": "DatabricksDataSource",
        "redshift": "RedShiftDataSource",
        "snowflake": "SnowFlakeDataSource",
        "mssql": "MssqlDataSource",
        "oracle": "OracleDataSource",
        "db2": "DB2DataSource",
        "sybase": "SybaseDataSource",
    }

    def __init__(self, config: Configuration):
        self._config = config
        self._data_sources: Dict[str, DataSource] = {}

    def connect(self):
        """Instantiate every configured data source, then connect them all.

        :raises DataChecksDataSourcesConnectionError: if any connect fails.
        """
        # The dict key is unused here; entries are re-keyed by config.name.
        for data_source_config in self._config.data_sources.values():
            self._data_sources[data_source_config.name] = self._create_data_source(
                data_source_config=data_source_config
            )
        for data_source in self._data_sources.values():
            try:
                data_source.connect()
            except Exception as e:
                raise DataChecksDataSourcesConnectionError(
                    f"Failed to connect to data source {data_source.data_source_name} [{str(e)}]"
                ) from e

    @property
    def get_data_sources(self) -> Dict[str, DataSource]:
        """
        Get the data sources
        :return:
        """
        return self._data_sources

    def _create_data_source(self, data_source_config: DataSourceConfiguration) -> DataSource:
        """
        Create a data source
        :param data_source_config: data source configuration
        :return: data source
        :raises DataChecksDataSourcesConnectionError: if the datasource type
            has no module, no registered class name, or the module does not
            export the expected class.
        """
        data_source_name = data_source_config.name
        data_source_type = data_source_config.type
        if data_source_type == "spark_df":
            # Imported lazily so pyspark is only required when actually used.
            from dcs_core.integrations.databases.spark_df import SparkDFDataSource

            return SparkDFDataSource(
                data_source_name,
                {"spark_session": data_source_config.connection_config.spark_session},
            )
        try:
            module_name = f"dcs_core.integrations.databases.{data_source_config.type.value}"
            module = importlib.import_module(module_name)
            class_name = self.DATA_SOURCE_CLASS_NAME_MAPPER[data_source_config.type]
            data_source_class = getattr(module, class_name)
            return data_source_class(data_source_name, asdict(data_source_config.connection_config))
        except (ModuleNotFoundError, KeyError, AttributeError) as e:
            # KeyError (type not in the mapper) and AttributeError (module
            # lacks the class) previously escaped as raw exceptions instead
            # of this wrapped connection error.
            raise DataChecksDataSourcesConnectionError(
                f'Failed to initiate data source type "{data_source_type}" [{str(e)}]'
            ) from e

    def get_data_source(self, data_source_name: str) -> DataSource:
        """
        Get a data source
        :param data_source_name:
        :return:
        """
        return self._data_sources[data_source_name]

    def get_data_source_names(self) -> List[str]:
        """
        Get the data source names
        :return:
        """
        return list(self._data_sources.keys())
|