dcs_sdk-1.6.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_diff/__init__.py +221 -0
- data_diff/__main__.py +517 -0
- data_diff/abcs/__init__.py +13 -0
- data_diff/abcs/compiler.py +27 -0
- data_diff/abcs/database_types.py +402 -0
- data_diff/config.py +141 -0
- data_diff/databases/__init__.py +38 -0
- data_diff/databases/_connect.py +323 -0
- data_diff/databases/base.py +1417 -0
- data_diff/databases/bigquery.py +376 -0
- data_diff/databases/clickhouse.py +217 -0
- data_diff/databases/databricks.py +262 -0
- data_diff/databases/duckdb.py +207 -0
- data_diff/databases/mssql.py +343 -0
- data_diff/databases/mysql.py +189 -0
- data_diff/databases/oracle.py +238 -0
- data_diff/databases/postgresql.py +293 -0
- data_diff/databases/presto.py +222 -0
- data_diff/databases/redis.py +93 -0
- data_diff/databases/redshift.py +233 -0
- data_diff/databases/snowflake.py +222 -0
- data_diff/databases/sybase.py +720 -0
- data_diff/databases/trino.py +73 -0
- data_diff/databases/vertica.py +174 -0
- data_diff/diff_tables.py +489 -0
- data_diff/errors.py +17 -0
- data_diff/format.py +369 -0
- data_diff/hashdiff_tables.py +1026 -0
- data_diff/info_tree.py +76 -0
- data_diff/joindiff_tables.py +434 -0
- data_diff/lexicographic_space.py +253 -0
- data_diff/parse_time.py +88 -0
- data_diff/py.typed +0 -0
- data_diff/queries/__init__.py +13 -0
- data_diff/queries/api.py +213 -0
- data_diff/queries/ast_classes.py +811 -0
- data_diff/queries/base.py +38 -0
- data_diff/queries/extras.py +43 -0
- data_diff/query_utils.py +70 -0
- data_diff/schema.py +67 -0
- data_diff/table_segment.py +583 -0
- data_diff/thread_utils.py +112 -0
- data_diff/utils.py +1022 -0
- data_diff/version.py +15 -0
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +829 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +482 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__init__.py +13 -0
- dcs_sdk/__main__.py +18 -0
- dcs_sdk/__version__.py +15 -0
- dcs_sdk/cli/__init__.py +13 -0
- dcs_sdk/cli/cli.py +163 -0
- dcs_sdk/sdk/__init__.py +58 -0
- dcs_sdk/sdk/config/__init__.py +13 -0
- dcs_sdk/sdk/config/config_loader.py +491 -0
- dcs_sdk/sdk/data_diff/__init__.py +13 -0
- dcs_sdk/sdk/data_diff/data_differ.py +821 -0
- dcs_sdk/sdk/rules/__init__.py +15 -0
- dcs_sdk/sdk/rules/rules_mappping.py +31 -0
- dcs_sdk/sdk/rules/rules_repository.py +214 -0
- dcs_sdk/sdk/rules/schema_rules.py +65 -0
- dcs_sdk/sdk/utils/__init__.py +13 -0
- dcs_sdk/sdk/utils/serializer.py +25 -0
- dcs_sdk/sdk/utils/similarity_score/__init__.py +13 -0
- dcs_sdk/sdk/utils/similarity_score/base_provider.py +153 -0
- dcs_sdk/sdk/utils/similarity_score/cosine_similarity_provider.py +39 -0
- dcs_sdk/sdk/utils/similarity_score/jaccard_provider.py +24 -0
- dcs_sdk/sdk/utils/similarity_score/levenshtein_distance_provider.py +31 -0
- dcs_sdk/sdk/utils/table.py +475 -0
- dcs_sdk/sdk/utils/themes.py +40 -0
- dcs_sdk/sdk/utils/utils.py +349 -0
- dcs_sdk-1.6.5.dist-info/METADATA +150 -0
- dcs_sdk-1.6.5.dist-info/RECORD +159 -0
- dcs_sdk-1.6.5.dist-info/WHEEL +4 -0
- dcs_sdk-1.6.5.dist-info/entry_points.txt +4 -0

dcs_core/core/configuration/config_loader.py

```diff
@@ -0,0 +1,139 @@
+# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+
+import yaml
+
+
+def parse_config(
+    path=None,
+    data=None,
+    tag="!ENV",
+    default_sep=":",
+    default_value="N/A",
+    raise_if_na=False,
+    loader=yaml.SafeLoader,
+    encoding="utf-8",
+):
+    """
+    Load yaml configuration from path or from the contents of a file (data)
+    and resolve any environment variables. The environment variables
+    must have the tag e.g. !ENV *before* them and be in this format to be
+    parsed: ${VAR_NAME}
+    E.g.:
+    databse:
+        name: test_db
+        username: !ENV ${DB_USER:paws}
+        password: !ENV ${DB_PASS:meaw2}
+        url: !ENV 'http://${DB_BASE_URL:straight_to_production}:${DB_PORT:12345}'
+
+    :param str path: the path to the yaml file
+    :param str data: the yaml data itself as a stream
+    :param str tag: the tag to look for, if None, all env variables will be
+        resolved.
+    :param str default_sep: if any default values are set, use this field
+        to separate them from the enironment variable name. E.g. ':' can be
+        used.
+    :param str default_value: the tag to look for
+    :param bool raise_if_na: raise an exception if there is no default
+        value set for the env variable.
+    :param Type[yaml.loader] loader: Specify which loader to use. Defaults to
+        yaml.SafeLoader
+    :param str encoding: the encoding of the data if a path is specified,
+        defaults to utf-8
+    :return: the dict configuration
+    :rtype: dict[str, T]
+    """
+    default_sep = default_sep or ""
+    default_value = default_value or ""
+    default_sep_pattern = r"(" + default_sep + "[^}]+)?" if default_sep else ""
+    pattern = re.compile(r".*?\$\{([^}{" + default_sep + r"]+)" + default_sep_pattern + r"\}.*?")
+    loader = loader or yaml.SafeLoader
+
+    # the tag will be used to mark where to start searching for the pattern
+    # e.g. a_key: !ENV somestring${ENV_VAR}other_stuff_follows
+    loader.add_implicit_resolver(tag, pattern, first=[tag])
+
+    # For inner type conversions because double tags do not work, e.g. !ENV !!float
+    type_tag = "tag:yaml.org,2002:"
+    type_tag_pattern = re.compile(f"({type_tag}\w+\s)")
+
+    def constructor_env_variables(loader, node):
+        """
+        Extracts the environment variable from the yaml node's value
+        :param yaml.Loader loader: the yaml loader (as defined above)
+        :param node: the current node (key-value) in the yaml
+        :return: the parsed string that contains the value of the environment
+        variable or the default value if defined for the variable. If no value
+        for the variable can be found, then the value is replaced by
+        default_value='N/A'
+        """
+        value = loader.construct_scalar(node)
+        match = pattern.findall(value)  # to find all env variables in line
+        dt = "".join(type_tag_pattern.findall(value)) or ""
+        value = value.replace(dt, "")
+        if match:
+            full_value = value
+            for g in match:
+                curr_default_value = default_value
+                env_var_name = g
+                env_var_name_with_default = g
+                if default_sep and isinstance(g, tuple) and len(g) > 1:
+                    env_var_name = g[0]
+                    env_var_name_with_default = "".join(g)
+                    found = False
+                    for each in g:
+                        if default_sep in each:
+                            _, curr_default_value = each.split(default_sep, 1)
+                            found = True
+                            break
+                    if not found and raise_if_na:
+                        raise ValueError(f"Could not find default value for {env_var_name}")
+                full_value = full_value.replace(
+                    f"${{{env_var_name_with_default}}}",
+                    os.environ.get(env_var_name, curr_default_value),
+                )
+            if dt:
+                # do one more roundtrip with the dt constructor:
+                node.value = full_value
+                node.tag = dt.strip()
+                return loader.yaml_constructors[node.tag](loader, node)
+            return full_value
+
+        return value
+
+    loader.add_constructor(tag, constructor_env_variables)
+
+    if path:
+        with open(path, encoding=encoding) as conf_data:
+            yaml_data_str = conf_data.read()
+    elif data:
+        yaml_data_str = data
+    else:
+        raise ValueError("Either a path or data should be defined as input")
+
+    # yaml_data_str = yaml_data_str.replace("on:", "for_temp:")
+    yaml_data_str_replaces = re.sub(r"\bon\b:", "for_temp:", yaml_data_str)
+    conf_dict = yaml.load(yaml_data_str_replaces, Loader=loader)
+
+    for key, value in conf_dict.items():
+        if key != "data_sources" and key.startswith("validations"):
+            if isinstance(value, list):
+                for validation in value:
+                    for k, v in validation.items():
+                        if "for_temp" in v:
+                            v["on"] = v.pop("for_temp")
+    return conf_dict
```
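The `config_loader.py` module above lets any value in a dcs YAML file reference an environment variable via the `!ENV` tag, with an optional `:`-separated default. A minimal usage sketch of `parse_config` (the variable names and YAML keys below are illustrative, not taken from the package):

```python
import os

from dcs_core.core.configuration.config_loader import parse_config

# Hypothetical environment for the sketch; only DB_PASS is set here,
# so the other placeholders fall back to their ':' defaults.
os.environ["DB_PASS"] = "s3cret"

yaml_text = """
data_sources:
  - name: pg_warehouse
    type: postgres
    connection:
      host: !ENV ${DB_HOST:localhost}
      username: !ENV ${DB_USER:analytics}
      password: !ENV ${DB_PASS}
"""

config = parse_config(data=yaml_text)
connection = config["data_sources"][0]["connection"]
# host -> "localhost" (default), username -> "analytics" (default),
# password -> "s3cret" (resolved from the environment)
print(connection)
```

Because `raise_if_na` defaults to `False`, a placeholder with no default and no matching environment variable silently resolves to `default_value` ("N/A") rather than raising.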
dcs_core/core/configuration/configuration_parser.py

```diff
@@ -0,0 +1,262 @@
+# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+import re
+from abc import ABC
+from pathlib import Path
+from typing import Dict, List, Optional, TypeVar, Union
+
+from pyparsing import Combine, Group, Literal
+from pyparsing import Optional as OptionalParsing
+from pyparsing import Word, delimitedList, nums, oneOf
+
+from dcs_core.core.common.errors import DataChecksConfigurationError
+from dcs_core.core.common.models.configuration import (
+    Configuration,
+    DataSourceConfiguration,
+    DataSourceConnectionConfiguration,
+    DataSourceLanguageSupport,
+    DataSourceType,
+    ValidationConfig,
+    ValidationConfigByDataset,
+)
+from dcs_core.core.common.models.data_source_resource import Field, Index, Table
+from dcs_core.core.common.models.metric import MetricsType
+from dcs_core.core.common.models.validation import ConditionType, Threshold, Validation
+from dcs_core.core.configuration.config_loader import parse_config
+
+CONDITION_TYPE_MAPPING = {
+    ">=": ConditionType.GTE,
+    "<=": ConditionType.LTE,
+    "=": ConditionType.EQ,
+    "<": ConditionType.LT,
+    ">": ConditionType.GT,
+}
+
+
+OUTPUT = TypeVar("OUTPUT")
+INPUT = TypeVar("INPUT", Dict, List)
+
+
+class ConfigParser(ABC):
+    def parse(self, config: INPUT) -> OUTPUT:
+        raise NotImplementedError
+
+
+class DataSourceConfigParser(ConfigParser):
+    @staticmethod
+    def _data_source_connection_config_parser(
+        config: Dict,
+    ) -> DataSourceConnectionConfiguration:
+        connection_config = DataSourceConnectionConfiguration(
+            host=config["connection"].get("host"),
+            port=config["connection"].get("port"),
+            username=config["connection"].get("username"),
+            password=config["connection"].get("password"),
+            database=config["connection"].get("database"),
+            schema=config["connection"].get("schema"),
+            project=config["connection"].get("project"),
+            dataset=config["connection"].get("dataset"),
+            credentials_base64=config["connection"].get("credentials_base64"),
+            keyfile=config["connection"].get("keyfile"),
+            token=config["connection"].get("token"),
+            catalog=config["connection"].get("catalog"),
+            http_path=config["connection"].get("http_path"),
+            account=config["connection"].get("account"),
+            warehouse=config["connection"].get("warehouse"),
+            role=config["connection"].get("role"),
+            service_name=config["connection"].get("service_name"),
+            security=config["connection"].get("security"),
+            protocol=config["connection"].get("protocol"),
+            driver=config["connection"].get("driver"),
+            server=config["connection"].get("server"),
+        )
+        return connection_config
+
+    @staticmethod
+    def _check_for_duplicate_names(config_list: List):
+        names = []
+        for config in config_list:
+            if config["name"] in names:
+                raise DataChecksConfigurationError(f"Duplicate datasource names found: {config['name']}")
+            names.append(config["name"])
+
+    def parse(self, config_list: List[Dict]) -> Dict[str, DataSourceConfiguration]:
+        self._check_for_duplicate_names(config_list=config_list)
+        data_source_configurations: Dict[str, DataSourceConfiguration] = {}
+
+        for config in config_list:
+            name_ = config["name"]
+            data_source_type = DataSourceType(config["type"].lower())
+            if data_source_type in [
+                DataSourceType.ELASTICSEARCH,
+                DataSourceType.OPENSEARCH,
+            ]:
+                language_support = DataSourceLanguageSupport.DSL_ES
+            else:
+                language_support = DataSourceLanguageSupport.SQL
+            data_source_configuration = DataSourceConfiguration(
+                name=name_,
+                type=DataSourceType(config["type"].lower()),
+                connection_config=self._data_source_connection_config_parser(config=config),
+                language_support=language_support,
+            )
+            data_source_configurations[name_] = data_source_configuration
+
+        return data_source_configurations
+
+
+class ValidationConfigParser(ConfigParser):
+    def parse(self, config: Dict) -> Dict[str, ValidationConfigByDataset]:
+        validation_group: Dict[str, ValidationConfigByDataset] = {}
+        for key, validations in config.items():
+            match = re.search(r"^(validations for)\s([ \w-]+)\.([ \w-]+)$", key)
+            if match:
+                data_source, dataset = match.group(2), match.group(3)
+                validation_dict = {}
+                for validation in validations:
+                    if not isinstance(validation, dict):
+                        raise DataChecksConfigurationError(message=f"Validation must be a dictionary")
+                    if len(validation) != 1:
+                        raise DataChecksConfigurationError(message=f"Validation must have only one name")
+                    validation_name, value = next(iter(validation.items()))
+
+                    validation_config = ValidationConfig(
+                        name=validation_name,
+                        on=value.get("on"),
+                        threshold=(
+                            self._parse_threshold_str(value.get("threshold")) if value.get("threshold") else None
+                        ),
+                        where=value.get("where"),
+                        query=value.get("query"),
+                        regex=value.get("regex"),
+                        values=value.get("values"),
+                        ref=value.get("ref"),
+                    )
+                    validation_dict[validation_name] = validation_config
+
+                validation_group[f"{data_source}.{dataset}"] = ValidationConfigByDataset(
+                    data_source=data_source,
+                    dataset=dataset,
+                    validations=validation_dict,
+                )
+        return validation_group
+
+    @staticmethod
+    def _parse_threshold_str(threshold: str) -> Threshold:
+        try:
+            operator = oneOf(">= <= = < >").setParseAction(lambda t: CONDITION_TYPE_MAPPING[t[0]])
+            number = Combine(
+                OptionalParsing(Literal("-")) + Word(nums) + OptionalParsing(Literal(".") + Word(nums))
+            ).setParseAction(lambda t: float(t[0]))
+
+            condition = operator + number
+            conditions = delimitedList(
+                Group(condition) | Group(condition + Literal("&") + condition),
+                delim="&",
+            )
+            result = conditions.parseString(threshold)
+            return Threshold(**{operator: value for operator, value in result})
+
+        except Exception as e:
+            raise DataChecksConfigurationError(f"Invalid threshold configuration {threshold}: {str(e)}")
+
+
+def _parse_configuration_from_dict(config_dict: Dict) -> Configuration:
+    try:
+        data_source_configurations = {}
+        if "data_sources" in config_dict:
+            data_source_configurations = DataSourceConfigParser().parse(config_list=config_dict["data_sources"])
+        validate_configurations = ValidationConfigParser().parse(config_dict)
+
+        configuration = Configuration(data_sources=data_source_configurations, validations=validate_configurations)
+
+        return configuration
+    except Exception as ex:
+        raise DataChecksConfigurationError(message=f"Failed to parse configuration: {str(ex)}")
+
+
+def load_configuration_from_yaml_str(yaml_string: str, configuration: Optional[Configuration] = None) -> Configuration:
+    """
+    Load configuration from a yaml string
+    """
+    try:
+        config_dict: Dict = parse_config(data=yaml_string)
+    except Exception as ex:
+        raise DataChecksConfigurationError(message=f"Failed to parse configuration: {str(ex)}")
+    from_dict = _parse_configuration_from_dict(config_dict=config_dict)
+    if configuration:
+        for k, v in from_dict.data_sources.items():
+            configuration.data_sources[k] = v
+        for k, v in from_dict.validations.items():
+            configuration.validations[k] = v
+    return from_dict
+
+
+def load_configuration(configuration_path: str, configuration: Optional[Configuration] = None) -> Configuration:
+    """
+    Load configuration from a yaml file
+    :param configuration_path: Configuration file path
+    :param configuration: Configuration
+    :return:
+    """
+
+    path = Path(configuration_path)
+    if not path.exists():
+        raise DataChecksConfigurationError(message=f"Configuration file {configuration_path} does not exist")
+    if path.is_file():
+        with open(configuration_path) as config_yaml_file:
+            yaml_string = config_yaml_file.read()
+            return load_configuration_from_yaml_str(yaml_string, configuration=configuration)
+    else:
+        config_files = glob.glob(f"{configuration_path}/*.yaml")
+        if len(config_files) == 0:
+            raise DataChecksConfigurationError(message=f"No configuration files found in {configuration_path}")
+        else:
+            config_dict_list: List[Dict] = []
+            for config_file in config_files:
+                with open(config_file) as config_yaml_file:
+                    yaml_string = config_yaml_file.read()
+                    config_dict: Dict = parse_config(data=yaml_string)
+                    config_dict_list.append(config_dict)
+
+            final_config_dict = {
+                "data_sources": [],
+                "metrics": [],
+                "storage": None,
+            }
+            for config_dict in config_dict_list:
+                if "data_sources" in config_dict:
+                    final_config_dict["data_sources"].extend(config_dict["data_sources"])
+                if "metrics" in config_dict:
+                    final_config_dict["metrics"].extend(config_dict["metrics"])
+                if "storage" in config_dict:
+                    final_config_dict["storage"] = config_dict["storage"]
+
+                for key, value in config_dict.items():
+                    if key not in ["data_sources", "metrics", "storage"]:
+                        if key not in final_config_dict.keys():
+                            final_config_dict[key] = value
+                        else:
+                            if isinstance(final_config_dict[key], list):
+                                final_config_dict[key].extend(value)
+
+            from_dict = _parse_configuration_from_dict(final_config_dict)
+            if configuration:
+                for k, v in from_dict.data_sources.items():
+                    configuration.data_sources[k] = v
+                for k, v in from_dict.validations.items():
+                    configuration.validations[k] = v
+
+            return from_dict
```
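Taken together, `parse_config` and `configuration_parser.py` turn a YAML document into a `Configuration` object: data sources come from the `data_sources` list, and validation groups from keys of the form `validations for <data_source>.<dataset>`, with threshold strings split on `&` into `ConditionType` bounds. A minimal sketch of that shape and how it is loaded (the data source name, table, validation name, `count_rows` expression, and `postgres` type value are illustrative assumptions, not confirmed against the package):

```python
from dcs_core.core.configuration.configuration_parser import (
    load_configuration_from_yaml_str,
)

# Validation groups live under "validations for <data_source>.<dataset>" keys;
# each list item is a single-key mapping, and threshold strings such as
# ">= 100 & <= 100000" are parsed by _parse_threshold_str into a Threshold.
yaml_config = """
data_sources:
  - name: pg_warehouse
    type: postgres
    connection:
      host: localhost
      port: 5432
      username: analytics
      password: s3cret
      database: warehouse

validations for pg_warehouse.customers:
  - customers_row_count:
      on: count_rows
      threshold: ">= 100 & <= 100000"
"""

configuration = load_configuration_from_yaml_str(yaml_config)
group = configuration.validations["pg_warehouse.customers"]
print(group.data_source, group.dataset, list(group.validations.keys()))
```

Passing an existing `Configuration` as the optional `configuration` argument merges the newly parsed data sources and validations into it, which is also how `load_configuration` combines multiple `*.yaml` files when pointed at a directory.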