dcs-sdk 1.6.4__py3-none-any.whl → 1.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. dcs_core/__init__.py +13 -0
  2. dcs_core/__main__.py +17 -0
  3. dcs_core/__version__.py +15 -0
  4. dcs_core/cli/__init__.py +13 -0
  5. dcs_core/cli/cli.py +165 -0
  6. dcs_core/core/__init__.py +19 -0
  7. dcs_core/core/common/__init__.py +13 -0
  8. dcs_core/core/common/errors.py +50 -0
  9. dcs_core/core/common/models/__init__.py +13 -0
  10. dcs_core/core/common/models/configuration.py +284 -0
  11. dcs_core/core/common/models/dashboard.py +24 -0
  12. dcs_core/core/common/models/data_source_resource.py +75 -0
  13. dcs_core/core/common/models/metric.py +160 -0
  14. dcs_core/core/common/models/profile.py +75 -0
  15. dcs_core/core/common/models/validation.py +216 -0
  16. dcs_core/core/common/models/widget.py +44 -0
  17. dcs_core/core/configuration/__init__.py +13 -0
  18. dcs_core/core/configuration/config_loader.py +139 -0
  19. dcs_core/core/configuration/configuration_parser.py +262 -0
  20. dcs_core/core/configuration/configuration_parser_arc.py +328 -0
  21. dcs_core/core/datasource/__init__.py +13 -0
  22. dcs_core/core/datasource/base.py +62 -0
  23. dcs_core/core/datasource/manager.py +112 -0
  24. dcs_core/core/datasource/search_datasource.py +421 -0
  25. dcs_core/core/datasource/sql_datasource.py +1094 -0
  26. dcs_core/core/inspect.py +163 -0
  27. dcs_core/core/logger/__init__.py +13 -0
  28. dcs_core/core/logger/base.py +32 -0
  29. dcs_core/core/logger/default_logger.py +94 -0
  30. dcs_core/core/metric/__init__.py +13 -0
  31. dcs_core/core/metric/base.py +220 -0
  32. dcs_core/core/metric/combined_metric.py +98 -0
  33. dcs_core/core/metric/custom_metric.py +34 -0
  34. dcs_core/core/metric/manager.py +137 -0
  35. dcs_core/core/metric/numeric_metric.py +403 -0
  36. dcs_core/core/metric/reliability_metric.py +90 -0
  37. dcs_core/core/profiling/__init__.py +13 -0
  38. dcs_core/core/profiling/datasource_profiling.py +136 -0
  39. dcs_core/core/profiling/numeric_field_profiling.py +72 -0
  40. dcs_core/core/profiling/text_field_profiling.py +67 -0
  41. dcs_core/core/repository/__init__.py +13 -0
  42. dcs_core/core/repository/metric_repository.py +77 -0
  43. dcs_core/core/utils/__init__.py +13 -0
  44. dcs_core/core/utils/log.py +29 -0
  45. dcs_core/core/utils/tracking.py +105 -0
  46. dcs_core/core/utils/utils.py +44 -0
  47. dcs_core/core/validation/__init__.py +13 -0
  48. dcs_core/core/validation/base.py +230 -0
  49. dcs_core/core/validation/completeness_validation.py +153 -0
  50. dcs_core/core/validation/custom_query_validation.py +24 -0
  51. dcs_core/core/validation/manager.py +282 -0
  52. dcs_core/core/validation/numeric_validation.py +276 -0
  53. dcs_core/core/validation/reliability_validation.py +91 -0
  54. dcs_core/core/validation/uniqueness_validation.py +61 -0
  55. dcs_core/core/validation/validity_validation.py +738 -0
  56. dcs_core/integrations/__init__.py +13 -0
  57. dcs_core/integrations/databases/__init__.py +13 -0
  58. dcs_core/integrations/databases/bigquery.py +187 -0
  59. dcs_core/integrations/databases/databricks.py +51 -0
  60. dcs_core/integrations/databases/db2.py +652 -0
  61. dcs_core/integrations/databases/elasticsearch.py +61 -0
  62. dcs_core/integrations/databases/mssql.py +979 -0
  63. dcs_core/integrations/databases/mysql.py +409 -0
  64. dcs_core/integrations/databases/opensearch.py +64 -0
  65. dcs_core/integrations/databases/oracle.py +719 -0
  66. dcs_core/integrations/databases/postgres.py +570 -0
  67. dcs_core/integrations/databases/redshift.py +53 -0
  68. dcs_core/integrations/databases/snowflake.py +48 -0
  69. dcs_core/integrations/databases/spark_df.py +111 -0
  70. dcs_core/integrations/databases/sybase.py +1069 -0
  71. dcs_core/integrations/storage/__init__.py +13 -0
  72. dcs_core/integrations/storage/local_file.py +149 -0
  73. dcs_core/integrations/utils/__init__.py +13 -0
  74. dcs_core/integrations/utils/utils.py +36 -0
  75. dcs_core/report/__init__.py +13 -0
  76. dcs_core/report/dashboard.py +211 -0
  77. dcs_core/report/models.py +88 -0
  78. dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
  79. dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
  80. dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
  81. dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
  82. dcs_core/report/static/assets/images/docs.svg +6 -0
  83. dcs_core/report/static/assets/images/github.svg +4 -0
  84. dcs_core/report/static/assets/images/logo.svg +7 -0
  85. dcs_core/report/static/assets/images/slack.svg +13 -0
  86. dcs_core/report/static/index.js +2 -0
  87. dcs_core/report/static/index.js.LICENSE.txt +3971 -0
  88. dcs_sdk/__version__.py +1 -1
  89. dcs_sdk/cli/cli.py +3 -0
  90. {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/METADATA +24 -2
  91. dcs_sdk-1.6.6.dist-info/RECORD +159 -0
  92. {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/entry_points.txt +1 -0
  93. dcs_sdk-1.6.4.dist-info/RECORD +0 -72
  94. {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/WHEEL +0 -0
@@ -0,0 +1,139 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+
18
+ import yaml
19
+
20
+
21
def parse_config(
    path=None,
    data=None,
    tag="!ENV",
    default_sep=":",
    default_value="N/A",
    raise_if_na=False,
    loader=yaml.SafeLoader,
    encoding="utf-8",
):
    """
    Load yaml configuration from path or from the contents of a file (data)
    and resolve any environment variables. The environment variables
    must have the tag e.g. !ENV *before* them and be in this format to be
    parsed: ${VAR_NAME}
    E.g.:
    database:
        name: test_db
        username: !ENV ${DB_USER:paws}
        password: !ENV ${DB_PASS:meaw2}
        url: !ENV 'http://${DB_BASE_URL:straight_to_production}:${DB_PORT:12345}'

    Every ``on:`` key in the text is rewritten to ``for_temp:`` before loading
    and restored to ``on`` afterwards, but only inside the settings mapping of
    each entry of a top-level ``validations...`` list.

    :param str path: the path to the yaml file
    :param str data: the yaml data itself as a string (a regex substitution
        is applied to it, so a file-like object will not work)
    :param str tag: the tag to look for, if None, all env variables will be
        resolved.
    :param str default_sep: if any default values are set, use this field
        to separate them from the environment variable name. E.g. ':' can be
        used. It is matched literally.
    :param str default_value: the value substituted when the environment
        variable is not set and no inline default is given
    :param bool raise_if_na: raise an exception if there is no default
        value set for the env variable.
    :param Type[yaml.loader] loader: Specify which loader to use. Defaults to
        yaml.SafeLoader
    :param str encoding: the encoding of the data if a path is specified,
        defaults to utf-8
    :return: the dict configuration
    :rtype: dict[str, T]
    :raises ValueError: if neither path nor data is given, or if raise_if_na
        is set and a variable carries no inline default
    """
    default_sep = default_sep or ""
    default_value = default_value or ""
    # Escape the separator so regex metacharacters in it are matched literally,
    # both inside the character class and in the optional default group.
    escaped_sep = re.escape(default_sep)
    default_sep_pattern = r"(" + escaped_sep + r"[^}]+)?" if default_sep else ""
    pattern = re.compile(r".*?\$\{([^}{" + escaped_sep + r"]+)" + default_sep_pattern + r"\}.*?")
    loader = loader or yaml.SafeLoader

    # the tag will be used to mark where to start searching for the pattern
    # e.g. a_key: !ENV somestring${ENV_VAR}other_stuff_follows
    # NOTE(review): `loader` is a class, so this registration (and the
    # add_constructor call below) presumably persists on it across calls --
    # confirm against the PyYAML documentation.
    loader.add_implicit_resolver(tag, pattern, first=[tag])

    # For inner type conversions because double tags do not work, e.g. !ENV !!float
    type_tag = "tag:yaml.org,2002:"
    type_tag_pattern = re.compile("(" + re.escape(type_tag) + r"\w+\s)")

    def constructor_env_variables(loader, node):
        """
        Extracts the environment variable from the yaml node's value
        :param yaml.Loader loader: the yaml loader (as defined above)
        :param node: the current node (key-value) in the yaml
        :return: the parsed string that contains the value of the environment
            variable or the default value if defined for the variable. If no value
            for the variable can be found, then the value is replaced by
            default_value='N/A'
        """
        value = loader.construct_scalar(node)
        match = pattern.findall(value)  # to find all env variables in line
        dt = "".join(type_tag_pattern.findall(value)) or ""
        value = value.replace(dt, "")
        if not match:
            return value

        full_value = value
        for g in match:
            curr_default_value = default_value
            env_var_name = g
            env_var_name_with_default = g
            # With a separator the pattern has two groups, so findall yields
            # (name, "<sep><default>") tuples instead of plain strings.
            if default_sep and isinstance(g, tuple) and len(g) > 1:
                env_var_name = g[0]
                env_var_name_with_default = "".join(g)
                found = False
                for each in g:
                    if default_sep in each:
                        _, curr_default_value = each.split(default_sep, 1)
                        found = True
                        break
                if not found and raise_if_na:
                    raise ValueError(f"Could not find default value for {env_var_name}")
            full_value = full_value.replace(
                f"${{{env_var_name_with_default}}}",
                os.environ.get(env_var_name, curr_default_value),
            )
        if dt:
            # do one more roundtrip with the dt constructor:
            node.value = full_value
            node.tag = dt.strip()
            return loader.yaml_constructors[node.tag](loader, node)
        return full_value

    loader.add_constructor(tag, constructor_env_variables)

    if path:
        with open(path, encoding=encoding) as conf_data:
            yaml_data_str = conf_data.read()
    elif data:
        yaml_data_str = data
    else:
        raise ValueError("Either a path or data should be defined as input")

    # Presumably a workaround for YAML 1.1 resolving a bare `on` key to a
    # boolean: rename the key before loading and restore it afterwards.
    yaml_data_str_replaces = re.sub(r"\bon\b:", "for_temp:", yaml_data_str)
    conf_dict = yaml.load(yaml_data_str_replaces, Loader=loader)

    # The loaded document is expected to be a mapping; `.items()` is called on
    # it unconditionally.
    for key, value in conf_dict.items():
        if not (isinstance(key, str) and key.startswith("validations")):
            continue
        if not isinstance(value, list):
            continue
        for validation in value:
            # Malformed entries are left untouched so the configuration parser
            # can report them, instead of failing here with an opaque error.
            if not isinstance(validation, dict):
                continue
            for v in validation.values():
                if isinstance(v, dict) and "for_temp" in v:
                    v["on"] = v.pop("for_temp")
    return conf_dict
@@ -0,0 +1,262 @@
1
+ # Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import glob
15
+ import re
16
+ from abc import ABC
17
+ from pathlib import Path
18
+ from typing import Dict, List, Optional, TypeVar, Union
19
+
20
+ from pyparsing import Combine, Group, Literal
21
+ from pyparsing import Optional as OptionalParsing
22
+ from pyparsing import Word, delimitedList, nums, oneOf
23
+
24
+ from dcs_core.core.common.errors import DataChecksConfigurationError
25
+ from dcs_core.core.common.models.configuration import (
26
+ Configuration,
27
+ DataSourceConfiguration,
28
+ DataSourceConnectionConfiguration,
29
+ DataSourceLanguageSupport,
30
+ DataSourceType,
31
+ ValidationConfig,
32
+ ValidationConfigByDataset,
33
+ )
34
+ from dcs_core.core.common.models.data_source_resource import Field, Index, Table
35
+ from dcs_core.core.common.models.metric import MetricsType
36
+ from dcs_core.core.common.models.validation import ConditionType, Threshold, Validation
37
+ from dcs_core.core.configuration.config_loader import parse_config
38
+
39
# Maps each comparison operator accepted in a threshold string to the
# ConditionType it stands for.
CONDITION_TYPE_MAPPING = {
    ">=": ConditionType.GTE,
    "<=": ConditionType.LTE,
    "=": ConditionType.EQ,
    "<": ConditionType.LT,
    ">": ConditionType.GT,
}


# Type variables used by ConfigParser.parse: OUTPUT is unconstrained,
# INPUT is constrained to Dict or List.
OUTPUT = TypeVar("OUTPUT")
INPUT = TypeVar("INPUT", Dict, List)
50
+
51
+
52
class ConfigParser(ABC):
    """Common base for the configuration parsers in this module."""

    def parse(self, config: INPUT) -> OUTPUT:
        """
        Convert a raw configuration fragment into its parsed form.

        :param config: the raw configuration (a dict or a list)
        :return: the parsed representation
        :raises NotImplementedError: always; subclasses provide the logic
        """
        raise NotImplementedError
55
+
56
+
57
class DataSourceConfigParser(ConfigParser):
    """Turns the raw ``data_sources`` list into DataSourceConfiguration objects keyed by name."""

    @staticmethod
    def _data_source_connection_config_parser(
        config: Dict,
    ) -> DataSourceConnectionConfiguration:
        """
        Build the connection configuration from ``config["connection"]``.

        :param config: one data source entry; must contain a ``connection`` mapping
        :return: the connection configuration; options missing from the
            mapping are passed as None
        """
        connection = config["connection"]
        option_names = (
            "host",
            "port",
            "username",
            "password",
            "database",
            "schema",
            "project",
            "dataset",
            "credentials_base64",
            "keyfile",
            "token",
            "catalog",
            "http_path",
            "account",
            "warehouse",
            "role",
            "service_name",
            "security",
            "protocol",
            "driver",
            "server",
        )
        options = {option: connection.get(option) for option in option_names}
        return DataSourceConnectionConfiguration(**options)

    @staticmethod
    def _check_for_duplicate_names(config_list: List):
        """
        Ensure that no two data source entries share the same ``name``.

        :param config_list: the data source entries
        :raises DataChecksConfigurationError: on the first repeated name
        """
        seen = []
        for entry in config_list:
            current = entry["name"]
            if current in seen:
                raise DataChecksConfigurationError(f"Duplicate datasource names found: {current}")
            seen.append(current)

    def parse(self, config_list: List[Dict]) -> Dict[str, DataSourceConfiguration]:
        """
        Parse every data source entry.

        :param config_list: the data source entries, each with ``name``,
            ``type`` and ``connection``
        :return: the parsed configurations keyed by data source name
        """
        self._check_for_duplicate_names(config_list=config_list)
        parsed: Dict[str, DataSourceConfiguration] = {}

        for entry in config_list:
            source_name = entry["name"]
            source_type = DataSourceType(entry["type"].lower())
            # Elasticsearch and OpenSearch use the DSL; every other type uses SQL.
            uses_dsl = source_type in (
                DataSourceType.ELASTICSEARCH,
                DataSourceType.OPENSEARCH,
            )
            language = DataSourceLanguageSupport.DSL_ES if uses_dsl else DataSourceLanguageSupport.SQL
            parsed[source_name] = DataSourceConfiguration(
                name=source_name,
                type=source_type,
                connection_config=self._data_source_connection_config_parser(config=entry),
                language_support=language,
            )

        return parsed
118
+
119
+
120
class ValidationConfigParser(ConfigParser):
    """Parses the ``validations for <data_source>.<dataset>`` sections of a configuration."""

    def parse(self, config: Dict) -> Dict[str, ValidationConfigByDataset]:
        """
        Collect the validations of every matching top-level section.

        Only keys of the form ``validations for <data_source>.<dataset>`` are
        considered; all other keys are ignored.

        :param config: the whole configuration dictionary
        :return: the validations grouped under ``"<data_source>.<dataset>"``
        :raises DataChecksConfigurationError: if a validation entry is not a
            single-key dictionary, if its settings are not a dictionary, or if
            its threshold cannot be parsed
        """
        validation_group: Dict[str, ValidationConfigByDataset] = {}
        for key, validations in config.items():
            match = re.search(r"^(validations for)\s([ \w-]+)\.([ \w-]+)$", key)
            if not match:
                continue
            data_source, dataset = match.group(2), match.group(3)
            validation_dict = {}
            for validation in validations:
                if not isinstance(validation, dict):
                    raise DataChecksConfigurationError(message="Validation must be a dictionary")
                if len(validation) != 1:
                    raise DataChecksConfigurationError(message="Validation must have only one name")
                validation_name, value = next(iter(validation.items()))
                # An empty or scalar body would otherwise fail below with an
                # opaque AttributeError on `.get`.
                if not isinstance(value, dict):
                    raise DataChecksConfigurationError(
                        message=f"Validation {validation_name} must define its settings as a dictionary"
                    )

                threshold = value.get("threshold")
                validation_config = ValidationConfig(
                    name=validation_name,
                    on=value.get("on"),
                    threshold=self._parse_threshold_str(threshold) if threshold else None,
                    where=value.get("where"),
                    query=value.get("query"),
                    regex=value.get("regex"),
                    values=value.get("values"),
                    ref=value.get("ref"),
                )
                validation_dict[validation_name] = validation_config

            validation_group[f"{data_source}.{dataset}"] = ValidationConfigByDataset(
                data_source=data_source,
                dataset=dataset,
                validations=validation_dict,
            )
        return validation_group

    @staticmethod
    def _parse_threshold_str(threshold: str) -> Threshold:
        """
        Parse a threshold expression such as ``"> 5"`` or ``">= 0 & < 10"``.

        The expression is one or more ``<operator> <number>`` conditions joined
        by ``&``. Operators are the keys of CONDITION_TYPE_MAPPING. Numbers may
        have a leading ``-`` and a fractional part.

        :param threshold: the threshold expression
        :return: a Threshold built from the parsed (condition, value) pairs
        :raises DataChecksConfigurationError: if the expression is invalid,
            including trailing text after the last condition
        """
        try:
            operator = oneOf(">= <= = < >").setParseAction(lambda t: CONDITION_TYPE_MAPPING[t[0]])
            number = Combine(
                OptionalParsing(Literal("-")) + Word(nums) + OptionalParsing(Literal(".") + Word(nums))
            ).setParseAction(lambda t: float(t[0]))

            condition = operator + number
            conditions = delimitedList(Group(condition), delim="&")
            # parseAll=True rejects trailing text instead of silently parsing
            # only the leading part of the expression.
            result = conditions.parseString(threshold, parseAll=True)
            return Threshold(**{operator: value for operator, value in result})

        except Exception as e:
            raise DataChecksConfigurationError(f"Invalid threshold configuration {threshold}: {str(e)}") from e
174
+
175
+
176
def _parse_configuration_from_dict(config_dict: Dict) -> Configuration:
    """
    Build a Configuration from an already loaded configuration dictionary.

    :param config_dict: the configuration; ``data_sources`` is optional
    :return: the Configuration holding the parsed data sources and validations
    :raises DataChecksConfigurationError: wrapping any exception raised while parsing
    """
    try:
        sources = (
            DataSourceConfigParser().parse(config_list=config_dict["data_sources"])
            if "data_sources" in config_dict
            else {}
        )
        checks = ValidationConfigParser().parse(config_dict)
        return Configuration(data_sources=sources, validations=checks)
    except Exception as ex:
        raise DataChecksConfigurationError(message=f"Failed to parse configuration: {str(ex)}")
188
+
189
+
190
def load_configuration_from_yaml_str(yaml_string: str, configuration: Optional[Configuration] = None) -> Configuration:
    """
    Load configuration from a yaml string

    :param yaml_string: the yaml document
    :param configuration: optional existing configuration; when given, the
        parsed data sources and validations are copied into it key by key
    :return: the configuration parsed from ``yaml_string`` (not the
        ``configuration`` argument)
    :raises DataChecksConfigurationError: if the yaml cannot be loaded or parsed
    """
    try:
        raw: Dict = parse_config(data=yaml_string)
    except Exception as ex:
        raise DataChecksConfigurationError(message=f"Failed to parse configuration: {str(ex)}")

    parsed = _parse_configuration_from_dict(config_dict=raw)
    if not configuration:
        return parsed

    for name, source in parsed.data_sources.items():
        configuration.data_sources[name] = source
    for name, group in parsed.validations.items():
        configuration.validations[name] = group
    return parsed
205
+
206
+
207
def load_configuration(configuration_path: str, configuration: Optional[Configuration] = None) -> Configuration:
    """
    Load configuration from a yaml file, or from every ``*.yaml`` file
    directly inside a directory.

    In directory mode the files are read in sorted order and merged:
    ``data_sources`` and ``metrics`` lists are concatenated, the last
    ``storage`` seen wins, and any other key is taken from the first file that
    defines it, with later list values appended to it.

    :param configuration_path: Configuration file or directory path
    :param configuration: optional existing configuration; when given, the
        parsed data sources and validations are copied into it key by key
    :return: the configuration parsed from the path (not the
        ``configuration`` argument)
    :raises DataChecksConfigurationError: if the path does not exist, the
        directory holds no ``*.yaml`` file, or the configuration cannot be parsed
    """

    path = Path(configuration_path)
    if not path.exists():
        raise DataChecksConfigurationError(message=f"Configuration file {configuration_path} does not exist")
    if path.is_file():
        # Read as utf-8, matching parse_config's default encoding, rather than
        # the platform locale.
        with open(configuration_path, encoding="utf-8") as config_yaml_file:
            yaml_string = config_yaml_file.read()
        return load_configuration_from_yaml_str(yaml_string, configuration=configuration)

    # Escape the directory part so glob metacharacters in it are taken
    # literally, and sort so the merge order does not depend on the filesystem.
    config_files = sorted(glob.glob(f"{glob.escape(str(path))}/*.yaml"))
    if not config_files:
        raise DataChecksConfigurationError(message=f"No configuration files found in {configuration_path}")

    config_dict_list: List[Dict] = []
    for config_file in config_files:
        with open(config_file, encoding="utf-8") as config_yaml_file:
            yaml_string = config_yaml_file.read()
        config_dict: Dict = parse_config(data=yaml_string)
        config_dict_list.append(config_dict)

    final_config_dict = {
        "data_sources": [],
        "metrics": [],
        "storage": None,
    }
    for config_dict in config_dict_list:
        if "data_sources" in config_dict:
            final_config_dict["data_sources"].extend(config_dict["data_sources"])
        if "metrics" in config_dict:
            final_config_dict["metrics"].extend(config_dict["metrics"])
        if "storage" in config_dict:
            final_config_dict["storage"] = config_dict["storage"]

        for key, value in config_dict.items():
            if key in ("data_sources", "metrics", "storage"):
                continue
            if key not in final_config_dict:
                final_config_dict[key] = value
            elif isinstance(final_config_dict[key], list):
                # A repeated key is merged only when the stored value is a
                # list; otherwise the first value is kept.
                final_config_dict[key].extend(value)

    from_dict = _parse_configuration_from_dict(final_config_dict)
    if configuration:
        for k, v in from_dict.data_sources.items():
            configuration.data_sources[k] = v
        for k, v in from_dict.validations.items():
            configuration.validations[k] = v

    return from_dict