dcs-sdk 1.6.4__py3-none-any.whl → 1.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +979 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +570 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__version__.py +1 -1
- dcs_sdk/cli/cli.py +3 -0
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/METADATA +24 -2
- dcs_sdk-1.6.6.dist-info/RECORD +159 -0
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/entry_points.txt +1 -0
- dcs_sdk-1.6.4.dist-info/RECORD +0 -72
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import datetime
|
|
16
|
+
import json
|
|
17
|
+
import sys
|
|
18
|
+
import traceback
|
|
19
|
+
from abc import ABC, abstractmethod
|
|
20
|
+
from typing import Optional, Tuple, Union
|
|
21
|
+
|
|
22
|
+
from loguru import logger
|
|
23
|
+
|
|
24
|
+
from dcs_core.core.common.models.configuration import (
|
|
25
|
+
DataSourceLanguageSupport,
|
|
26
|
+
ValidationConfig,
|
|
27
|
+
)
|
|
28
|
+
from dcs_core.core.common.models.validation import (
|
|
29
|
+
ConditionType,
|
|
30
|
+
DeltaValidationInfo,
|
|
31
|
+
ValidationFunction,
|
|
32
|
+
ValidationInfo,
|
|
33
|
+
)
|
|
34
|
+
from dcs_core.core.datasource.manager import DataSource
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class ValidationIdentity:
    @staticmethod
    def generate_identity(
        validation_function: ValidationFunction,
        validation_name: str,
        data_source_name: str = None,
        dataset_name: str = None,
        field_name: str = None,
    ) -> str:
        """
        Build the dotted identity string of a validation.

        The segments are joined with "." in a fixed order: data source name,
        dataset name, field name, the ``value`` of the validation function and
        finally the validation name.

        The data source name is included whenever it is not None (so an empty
        string still contributes an empty segment); every other segment is
        skipped when it is falsy. Each segment is converted with ``str``.
        """
        segments = []

        # Only this segment is tested against None; the others use truthiness.
        if data_source_name is not None:
            segments.append(data_source_name)

        segments += [segment for segment in (dataset_name, field_name) if segment]

        if validation_function:
            segments.append(validation_function.value)
        if validation_name:
            segments.append(validation_name)

        return ".".join(map(str, segments))
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class Validation(ABC):
    """
    Abstract base for one named validation evaluated on a dataset (and
    optionally a field) of a data source.

    Subclasses supply the measured value through ``_generate_metric_value``;
    this class wraps it in a ``ValidationInfo`` and, when a threshold is
    configured, checks the value against it.
    """

    def __init__(
        self,
        name: str,
        validation_config: ValidationConfig,
        data_source: DataSource,
        dataset_name: str,
        field_name: str = None,
        **kwargs,
    ):
        """
        Store the validation target and copy the relevant settings
        (query, threshold, regex, where filter, values) from the config.

        Extra keyword arguments are accepted and ignored.
        """
        self.name = name
        self.validation_config = validation_config
        self.data_source = data_source
        self.dataset_name = dataset_name
        self.field_name = field_name

        self.query = validation_config.query

        self.threshold = validation_config.threshold
        self.where_filter = None
        self.values = None
        self.regex_pattern = validation_config.regex

        # The language support of the data source is only consulted when a
        # where clause / value list is actually configured.
        where_clause = validation_config.where
        if where_clause:
            language = data_source.language_support
            if language == DataSourceLanguageSupport.DSL_ES:
                # DSL_ES sources get the where clause parsed from JSON.
                self.where_filter = json.loads(where_clause)
            elif language == DataSourceLanguageSupport.SQL:
                # SQL sources keep the where clause as the raw string.
                self.where_filter = where_clause
            # Any other language support leaves where_filter as None.

        configured_values = validation_config.values
        if configured_values and data_source.language_support == DataSourceLanguageSupport.SQL:
            self.values = configured_values

    def get_validation_identity(self) -> str:
        """
        Return the dotted identity string for this validation, built by
        ``ValidationIdentity.generate_identity`` from its name, function,
        data source, dataset and field.
        """
        config = self.validation_config
        source = self.data_source
        return ValidationIdentity.generate_identity(
            validation_function=config.get_validation_function,
            validation_name=self.name,
            data_source_name=source.data_source_name,
            dataset_name=self.dataset_name,
            field_name=self.field_name,
        )

    def _validate_threshold(self, metric_value) -> Tuple[bool, Optional[str]]:
        """
        Compare ``metric_value`` with every non-None attribute of the threshold.

        The attribute name of each threshold entry is matched against the
        ``ConditionType`` members GTE, LTE, GT, LT and EQ. Returns
        ``(False, reason)`` for the first violated condition, otherwise
        ``(True, None)``.
        """
        for condition, value in vars(self.threshold).items():
            if value is None:
                # Unset conditions are not checked.
                continue

            if ConditionType.GTE == condition:
                if metric_value < value:
                    return False, f"Less than threshold value of {value}"
            elif ConditionType.LTE == condition:
                if metric_value > value:
                    return False, f"Greater than threshold value of {value}"
            elif ConditionType.GT == condition:
                if metric_value <= value:
                    return False, f"Less than or equal to threshold value of {value}"
            elif ConditionType.LT == condition:
                if metric_value >= value:
                    return False, f"Greater than or equal to threshold value of {value}"
            elif ConditionType.EQ == condition:
                if metric_value != value:
                    return False, f"Not equal to the value of {value}"

        return True, None

    @abstractmethod
    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Produce the measured value of this validation; implemented by subclasses.
        """
        pass

    def get_validation_info(self, **kwargs) -> Union[ValidationInfo, None]:
        """
        Run the validation and return its result as a ``ValidationInfo``.

        When a threshold is configured, ``is_valid`` and ``reason`` are set from
        ``_validate_threshold``. Any exception is printed to stdout, logged,
        and results in ``None`` being returned instead of propagating.
        """
        try:
            metric_value = self._generate_metric_value(**kwargs)

            info = ValidationInfo(
                name=self.name,
                identity=self.get_validation_identity(),
                data_source_name=self.data_source.data_source_name,
                dataset=self.dataset_name,
                validation_function=self.validation_config.get_validation_function,
                field=self.field_name,
                value=metric_value,
                timestamp=datetime.datetime.utcnow(),
                tags={"name": self.name},
            )

            if self.threshold is not None:
                is_valid, reason = self._validate_threshold(metric_value)
                info.is_valid = is_valid
                info.reason = reason

            return info
        except Exception as e:
            # Best effort: report the failure and signal it with None.
            traceback.print_exc(file=sys.stdout)
            logger.error(f"Failed to generate metric {self.name}: {str(e)}")
            return None
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
class DeltaValidation(Validation, ABC):
    """
    Abstract validation that compares a value measured on the primary data
    source with a value measured on a reference data source.

    The reported value is the absolute difference between the two.
    """

    def __init__(
        self,
        name: str,
        validation_config: ValidationConfig,
        data_source: DataSource,
        dataset_name: str,
        reference_data_source: DataSource,
        reference_dataset_name: str,
        reference_field_name: str = None,
        **kwargs,
    ):
        """
        Initialise the base validation and remember the reference target.

        Remaining keyword arguments are forwarded to ``Validation.__init__``.
        """
        super().__init__(name, validation_config, data_source, dataset_name, **kwargs)
        self.reference_data_source = reference_data_source
        self.reference_dataset_name = reference_dataset_name
        self.reference_field_name = reference_field_name

    @abstractmethod
    def _generate_reference_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Produce the measured value on the reference side; implemented by subclasses.
        """
        pass

    def get_validation_info(self, **kwargs) -> Union[ValidationInfo, None]:
        """
        Measure source and reference values and return a ``DeltaValidationInfo``
        whose ``value`` is their absolute difference.

        When a threshold is configured it is checked against the difference.
        Any exception is printed to stdout, logged, and results in ``None``.
        """
        try:
            source_value = self._generate_metric_value(**kwargs)
            reference_value = self._generate_reference_metric_value(**kwargs)
            difference = abs(source_value - reference_value)

            info = DeltaValidationInfo(
                name=self.name,
                identity=self.get_validation_identity(),
                data_source_name=self.data_source.data_source_name,
                dataset=self.dataset_name,
                validation_function=self.validation_config.get_validation_function,
                field=self.field_name,
                value=difference,
                source_value=source_value,
                reference_value=reference_value,
                reference_datasource_name=self.reference_data_source.data_source_name,
                reference_dataset=self.reference_dataset_name,
                timestamp=datetime.datetime.utcnow(),
                tags={"name": self.name},
            )

            if self.threshold is not None:
                is_valid, reason = self._validate_threshold(difference)
                info.is_valid = is_valid
                info.reason = reason

            return info
        except Exception as e:
            # Best effort: report the failure and signal it with None.
            traceback.print_exc(file=sys.stdout)
            logger.error(f"Failed to generate metric {self.name}: {str(e)}")
            return None
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from typing import Union
|
|
17
|
+
|
|
18
|
+
from dcs_core.core.datasource.search_datasource import SearchIndexDataSource
|
|
19
|
+
from dcs_core.core.datasource.sql_datasource import SQLDataSource
|
|
20
|
+
from dcs_core.core.validation.base import Validation
|
|
21
|
+
from dcs_core.integrations.databases.oracle import OracleDataSource
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class CountNullValidation(Validation):
    """
    Validation whose value comes from the data source's ``query_get_null_count``.
    """

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Delegate to ``query_get_null_count`` of a SQL or search-index data source.

        Raises:
            ValueError: if the data source is neither of those kinds.
        """
        source = self.data_source

        if isinstance(source, SQLDataSource):
            targets_oracle = isinstance(source, OracleDataSource)
            if targets_oracle and self.where_filter:
                # Wrap alphabetic/underscore words that precede =, < or > in
                # double quotes; the rewritten filter is stored on the instance.
                self.where_filter = re.sub(r"(\b[a-zA-Z_]+\b)(?=\s*[=<>])", r'"\1"', self.where_filter)
            # Oracle gets the field name wrapped in double quotes as well.
            column = f'"{self.field_name}"' if targets_oracle else self.field_name
            return source.query_get_null_count(
                table=self.dataset_name,
                field=column,
                filters=self.where_filter,
            )

        if isinstance(source, SearchIndexDataSource):
            return source.query_get_null_count(
                index_name=self.dataset_name,
                field=self.field_name,
                filters=self.where_filter or None,  # falsy filters become None
            )

        raise ValueError("Invalid data source type")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class PercentageNullValidation(Validation):
    """
    Validation whose value comes from the data source's ``query_get_null_percentage``.
    """

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Delegate to ``query_get_null_percentage`` of a SQL or search-index data source.

        Raises:
            ValueError: if the data source is neither of those kinds.
        """
        source = self.data_source

        if isinstance(source, SQLDataSource):
            targets_oracle = isinstance(source, OracleDataSource)
            if targets_oracle and self.where_filter:
                # Wrap alphabetic/underscore words that precede =, < or > in
                # double quotes; the rewritten filter is stored on the instance.
                self.where_filter = re.sub(r"(\b[a-zA-Z_]+\b)(?=\s*[=<>])", r'"\1"', self.where_filter)
            # Oracle gets the field name wrapped in double quotes as well.
            column = f'"{self.field_name}"' if targets_oracle else self.field_name
            return source.query_get_null_percentage(
                table=self.dataset_name,
                field=column,
                filters=self.where_filter,
            )

        if isinstance(source, SearchIndexDataSource):
            return source.query_get_null_percentage(
                index_name=self.dataset_name,
                field=self.field_name,
                filters=self.where_filter or None,  # falsy filters become None
            )

        raise ValueError("Invalid data source type")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class CountEmptyStringValidation(Validation):
    """
    Validation whose value comes from the data source's ``query_get_empty_string_count``.
    """

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Delegate to ``query_get_empty_string_count`` of a SQL or search-index data source.

        Raises:
            ValueError: if the data source is neither of those kinds.
        """
        source = self.data_source

        if isinstance(source, SQLDataSource):
            targets_oracle = isinstance(source, OracleDataSource)
            if targets_oracle and self.where_filter:
                # Wrap alphabetic/underscore words that precede =, < or > in
                # double quotes; the rewritten filter is stored on the instance.
                self.where_filter = re.sub(r"(\b[a-zA-Z_]+\b)(?=\s*[=<>])", r'"\1"', self.where_filter)
            # Oracle gets the field name wrapped in double quotes as well.
            column = f'"{self.field_name}"' if targets_oracle else self.field_name
            return source.query_get_empty_string_count(
                table=self.dataset_name,
                field=column,
                filters=self.where_filter,
            )

        if isinstance(source, SearchIndexDataSource):
            return source.query_get_empty_string_count(
                index_name=self.dataset_name,
                field=self.field_name,
                filters=self.where_filter or None,  # falsy filters become None
            )

        raise ValueError("Invalid data source type")
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class PercentageEmptyStringValidation(Validation):
    """
    Validation whose value comes from the data source's ``query_get_empty_string_percentage``.
    """

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Delegate to ``query_get_empty_string_percentage`` of a SQL or search-index data source.

        Raises:
            ValueError: if the data source is neither of those kinds.
        """
        source = self.data_source

        if isinstance(source, SQLDataSource):
            targets_oracle = isinstance(source, OracleDataSource)
            if targets_oracle and self.where_filter:
                # Wrap alphabetic/underscore words that precede =, < or > in
                # double quotes; the rewritten filter is stored on the instance.
                self.where_filter = re.sub(r"(\b[a-zA-Z_]+\b)(?=\s*[=<>])", r'"\1"', self.where_filter)
            # Oracle gets the field name wrapped in double quotes as well.
            column = f'"{self.field_name}"' if targets_oracle else self.field_name
            return source.query_get_empty_string_percentage(
                table=self.dataset_name,
                field=column,
                filters=self.where_filter,
            )

        if isinstance(source, SearchIndexDataSource):
            return source.query_get_empty_string_percentage(
                index_name=self.dataset_name,
                field=self.field_name,
                filters=self.where_filter or None,  # falsy filters become None
            )

        raise ValueError("Invalid data source type")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class CountAllSpaceValidation(Validation):
    """
    Validation whose value comes from ``query_get_all_space_count`` with
    ``operation="count"``; only SQL data sources are accepted.
    """

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Delegate to the SQL data source's ``query_get_all_space_count``.

        Raises:
            ValueError: if the data source is not a ``SQLDataSource``.
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Invalid data source type")

        return source.query_get_all_space_count(
            table=self.dataset_name,
            field=self.field_name,
            operation="count",
            filters=self.where_filter,
        )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class PercentageAllSpaceValidation(Validation):
    """
    Validation whose value comes from ``query_get_all_space_count`` with
    ``operation="percent"``; only SQL data sources are accepted.
    """

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Delegate to the SQL data source's ``query_get_all_space_count``.

        Raises:
            ValueError: if the data source is not a ``SQLDataSource``.
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Invalid data source type")

        return source.query_get_all_space_count(
            table=self.dataset_name,
            field=self.field_name,
            operation="percent",
            filters=self.where_filter,
        )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class CountNullKeywordValidation(Validation):
    """
    Validation whose value comes from ``query_get_null_keyword_count`` with
    ``operation="count"``; only SQL data sources are accepted.
    """

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Delegate to the SQL data source's ``query_get_null_keyword_count``.

        Raises:
            ValueError: if the data source is not a ``SQLDataSource``.
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Invalid data source type")

        return source.query_get_null_keyword_count(
            table=self.dataset_name,
            field=self.field_name,
            operation="count",
            filters=self.where_filter,
        )
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class PercentageNullKeywordValidation(Validation):
    """
    Validation whose value comes from ``query_get_null_keyword_count`` with
    ``operation="percent"``; only SQL data sources are accepted.
    """

    def _generate_metric_value(self, **kwargs) -> Union[float, int]:
        """
        Delegate to the SQL data source's ``query_get_null_keyword_count``.

        Raises:
            ValueError: if the data source is not a ``SQLDataSource``.
        """
        source = self.data_source
        if not isinstance(source, SQLDataSource):
            raise ValueError("Invalid data source type")

        return source.query_get_null_keyword_count(
            table=self.dataset_name,
            field=self.field_name,
            operation="percent",
            filters=self.where_filter,
        )
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from dcs_core.core.datasource.sql_datasource import SQLDataSource
|
|
16
|
+
from dcs_core.core.validation.base import Validation
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class CustomSqlValidation(Validation):
    """
    Validation whose value is the result of running the configured custom
    query through the SQL data source's ``query_get_custom_sql``.
    """

    def _generate_metric_value(self, **kwargs):
        """
        Run ``self.query`` on the SQL data source and return its result.

        ``**kwargs`` is accepted (and ignored) so that the override matches the
        abstract signature: the base class calls this method as
        ``self._generate_metric_value(**kwargs)``, and without it any forwarded
        keyword argument raised ``TypeError``, which the base class caught and
        turned into a ``None`` result.

        Raises:
            ValueError: if the data source is not a ``SQLDataSource``.
        """
        if isinstance(self.data_source, SQLDataSource):
            return self.data_source.query_get_custom_sql(query=self.query)
        else:
            raise ValueError("Invalid data source type")
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import Dict
|
|
16
|
+
|
|
17
|
+
from dcs_core.core.common.models.configuration import (
|
|
18
|
+
Configuration,
|
|
19
|
+
ValidationConfigByDataset,
|
|
20
|
+
)
|
|
21
|
+
from dcs_core.core.common.models.validation import ValidationFunction
|
|
22
|
+
from dcs_core.core.datasource.manager import DataSourceManager
|
|
23
|
+
from dcs_core.core.validation.base import DeltaValidation, Validation
|
|
24
|
+
from dcs_core.core.validation.completeness_validation import ( # noqa F401 this is used in globals
|
|
25
|
+
CountAllSpaceValidation,
|
|
26
|
+
CountEmptyStringValidation,
|
|
27
|
+
CountNullKeywordValidation,
|
|
28
|
+
CountNullValidation,
|
|
29
|
+
PercentageAllSpaceValidation,
|
|
30
|
+
PercentageEmptyStringValidation,
|
|
31
|
+
PercentageNullKeywordValidation,
|
|
32
|
+
PercentageNullValidation,
|
|
33
|
+
)
|
|
34
|
+
from dcs_core.core.validation.custom_query_validation import ( # noqa F401 this is used in globals
|
|
35
|
+
CustomSqlValidation,
|
|
36
|
+
)
|
|
37
|
+
from dcs_core.core.validation.numeric_validation import ( # noqa F401 this is used in globals
|
|
38
|
+
AvgValidation,
|
|
39
|
+
CountNegativeValidation,
|
|
40
|
+
CountZeroValidation,
|
|
41
|
+
MaxValidation,
|
|
42
|
+
MinValidation,
|
|
43
|
+
Percentile20Validation,
|
|
44
|
+
Percentile40Validation,
|
|
45
|
+
Percentile60Validation,
|
|
46
|
+
Percentile80Validation,
|
|
47
|
+
Percentile90Validation,
|
|
48
|
+
PercentNegativeValidation,
|
|
49
|
+
PercentZeroValidation,
|
|
50
|
+
StdDevValidation,
|
|
51
|
+
SumValidation,
|
|
52
|
+
VarianceValidation,
|
|
53
|
+
)
|
|
54
|
+
from dcs_core.core.validation.reliability_validation import ( # noqa F401 this is used in globals
|
|
55
|
+
CountDocumentsValidation,
|
|
56
|
+
CountRowValidation,
|
|
57
|
+
DeltaCountRowValidation,
|
|
58
|
+
FreshnessValueMetric,
|
|
59
|
+
)
|
|
60
|
+
from dcs_core.core.validation.uniqueness_validation import ( # noqa F401 this is used in globals
|
|
61
|
+
CountDistinctValidation,
|
|
62
|
+
CountDuplicateValidation,
|
|
63
|
+
)
|
|
64
|
+
from dcs_core.core.validation.validity_validation import ( # noqa F401 this is used in globals
|
|
65
|
+
CountCUSIPValidation,
|
|
66
|
+
CountDateNotInFutureValidation,
|
|
67
|
+
CountEmailValidation,
|
|
68
|
+
CountFIGIValidation,
|
|
69
|
+
CountInvalidRegex,
|
|
70
|
+
CountInvalidValues,
|
|
71
|
+
CountISINValidation,
|
|
72
|
+
CountLatitudeValidation,
|
|
73
|
+
CountLEIValidation,
|
|
74
|
+
CountLongitudeValidation,
|
|
75
|
+
CountNotInFutureValidation,
|
|
76
|
+
CountPermIDValidation,
|
|
77
|
+
CountSEDOLValidation,
|
|
78
|
+
CountSSNValidation,
|
|
79
|
+
CountTimeStampValidation,
|
|
80
|
+
CountUSAPhoneValidation,
|
|
81
|
+
CountUSAStateCodeValidation,
|
|
82
|
+
CountUSAZipCodeValidation,
|
|
83
|
+
CountUUIDValidation,
|
|
84
|
+
CountValidRegex,
|
|
85
|
+
CountValidValues,
|
|
86
|
+
PercentCUSIPValidation,
|
|
87
|
+
PercentDateNotInFutureValidation,
|
|
88
|
+
PercentEmailValidation,
|
|
89
|
+
PercentFIGIValidation,
|
|
90
|
+
PercentInvalidRegex,
|
|
91
|
+
PercentInvalidValues,
|
|
92
|
+
PercentISINValidation,
|
|
93
|
+
PercentLatitudeValidation,
|
|
94
|
+
PercentLEIValidation,
|
|
95
|
+
PercentLongitudeValidation,
|
|
96
|
+
PercentNotInFutureValidation,
|
|
97
|
+
PercentPermIDValidation,
|
|
98
|
+
PercentSEDOLValidation,
|
|
99
|
+
PercentSSNValidation,
|
|
100
|
+
PercentTimeStampValidation,
|
|
101
|
+
PercentUSAPhoneValidation,
|
|
102
|
+
PercentUSAStateCodeValidation,
|
|
103
|
+
PercentUSAZipCodeValidation,
|
|
104
|
+
PercentUUIDValidation,
|
|
105
|
+
PercentValidRegex,
|
|
106
|
+
PercentValidValues,
|
|
107
|
+
StringLengthAverageValidation,
|
|
108
|
+
StringLengthMaxValidation,
|
|
109
|
+
StringLengthMinValidation,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class ValidationManager:
|
|
114
|
+
VALIDATION_CLASS_MAPPING = {
|
|
115
|
+
ValidationFunction.MIN.value: "MinValidation",
|
|
116
|
+
ValidationFunction.MAX.value: "MaxValidation",
|
|
117
|
+
ValidationFunction.AVG.value: "AvgValidation",
|
|
118
|
+
ValidationFunction.SUM.value: "SumValidation",
|
|
119
|
+
ValidationFunction.VARIANCE.value: "VarianceValidation",
|
|
120
|
+
ValidationFunction.STDDEV.value: "StdDevValidation",
|
|
121
|
+
ValidationFunction.COUNT_DUPLICATE.value: "CountDuplicateValidation",
|
|
122
|
+
ValidationFunction.COUNT_DISTINCT.value: "CountDistinctValidation",
|
|
123
|
+
ValidationFunction.COUNT_NULL.value: "CountNullValidation",
|
|
124
|
+
ValidationFunction.PERCENT_NULL.value: "PercentageNullValidation",
|
|
125
|
+
ValidationFunction.COUNT_EMPTY_STRING.value: "CountEmptyStringValidation",
|
|
126
|
+
ValidationFunction.PERCENT_EMPTY_STRING.value: "PercentageEmptyStringValidation",
|
|
127
|
+
ValidationFunction.CUSTOM_SQL.value: "CustomSqlValidation",
|
|
128
|
+
ValidationFunction.COUNT_DOCUMENTS.value: "CountDocumentsValidation",
|
|
129
|
+
ValidationFunction.COUNT_ROWS.value: "CountRowValidation",
|
|
130
|
+
ValidationFunction.DELTA_COUNT_ROWS.value: "DeltaCountRowValidation",
|
|
131
|
+
ValidationFunction.FRESHNESS.value: "FreshnessValueMetric",
|
|
132
|
+
ValidationFunction.COUNT_UUID.value: "CountUUIDValidation",
|
|
133
|
+
ValidationFunction.PERCENT_UUID.value: "PercentUUIDValidation",
|
|
134
|
+
ValidationFunction.COUNT_INVALID_VALUES.value: "CountInvalidValues",
|
|
135
|
+
ValidationFunction.PERCENT_INVALID_VALUES.value: "PercentInvalidValues",
|
|
136
|
+
ValidationFunction.COUNT_VALID_VALUES.value: "CountValidValues",
|
|
137
|
+
ValidationFunction.PERCENT_VALID_VALUES.value: "PercentValidValues",
|
|
138
|
+
ValidationFunction.COUNT_INVALID_REGEX.value: "CountInvalidRegex",
|
|
139
|
+
ValidationFunction.PERCENT_INVALID_REGEX.value: "PercentInvalidRegex",
|
|
140
|
+
ValidationFunction.COUNT_VALID_REGEX.value: "CountValidRegex",
|
|
141
|
+
ValidationFunction.PERCENT_VALID_REGEX.value: "PercentValidRegex",
|
|
142
|
+
ValidationFunction.COUNT_USA_PHONE.value: "CountUSAPhoneValidation",
|
|
143
|
+
ValidationFunction.PERCENT_USA_PHONE.value: "PercentUSAPhoneValidation",
|
|
144
|
+
ValidationFunction.COUNT_EMAIL.value: "CountEmailValidation",
|
|
145
|
+
ValidationFunction.PERCENT_EMAIL.value: "PercentEmailValidation",
|
|
146
|
+
ValidationFunction.STRING_LENGTH_MAX.value: "StringLengthMaxValidation",
|
|
147
|
+
ValidationFunction.STRING_LENGTH_MIN.value: "StringLengthMinValidation",
|
|
148
|
+
ValidationFunction.STRING_LENGTH_AVERAGE.value: "StringLengthAverageValidation",
|
|
149
|
+
ValidationFunction.COUNT_USA_STATE_CODE.value: "CountUSAStateCodeValidation",
|
|
150
|
+
ValidationFunction.PERCENT_USA_STATE_CODE.value: "PercentUSAStateCodeValidation",
|
|
151
|
+
ValidationFunction.COUNT_USA_ZIP_CODE.value: "CountUSAZipCodeValidation",
|
|
152
|
+
ValidationFunction.PERCENT_USA_ZIP_CODE.value: "PercentUSAZipCodeValidation",
|
|
153
|
+
ValidationFunction.COUNT_LATITUDE.value: "CountLatitudeValidation",
|
|
154
|
+
ValidationFunction.PERCENT_LATITUDE.value: "PercentLatitudeValidation",
|
|
155
|
+
ValidationFunction.COUNT_LONGITUDE.value: "CountLongitudeValidation",
|
|
156
|
+
ValidationFunction.PERCENT_LONGITUDE.value: "PercentLongitudeValidation",
|
|
157
|
+
ValidationFunction.COUNT_SSN.value: "CountSSNValidation",
|
|
158
|
+
ValidationFunction.PERCENT_SSN.value: "PercentSSNValidation",
|
|
159
|
+
ValidationFunction.COUNT_SEDOL.value: "CountSEDOLValidation",
|
|
160
|
+
ValidationFunction.PERCENT_SEDOL.value: "PercentSEDOLValidation",
|
|
161
|
+
ValidationFunction.COUNT_CUSIP.value: "CountCUSIPValidation",
|
|
162
|
+
ValidationFunction.PERCENT_CUSIP.value: "PercentCUSIPValidation",
|
|
163
|
+
ValidationFunction.COUNT_LEI.value: "CountLEIValidation",
|
|
164
|
+
ValidationFunction.PERCENT_LEI.value: "PercentLEIValidation",
|
|
165
|
+
ValidationFunction.COUNT_FIGI.value: "CountFIGIValidation",
|
|
166
|
+
ValidationFunction.PERCENT_FIGI.value: "PercentFIGIValidation",
|
|
167
|
+
ValidationFunction.COUNT_ISIN.value: "CountISINValidation",
|
|
168
|
+
ValidationFunction.PERCENT_ISIN.value: "PercentISINValidation",
|
|
169
|
+
ValidationFunction.COUNT_PERM_ID.value: "CountPermIDValidation",
|
|
170
|
+
ValidationFunction.PERCENT_PERM_ID.value: "PercentPermIDValidation",
|
|
171
|
+
ValidationFunction.PERCENTILE_20.value: "Percentile20Validation",
|
|
172
|
+
ValidationFunction.PERCENTILE_40.value: "Percentile40Validation",
|
|
173
|
+
ValidationFunction.PERCENTILE_60.value: "Percentile60Validation",
|
|
174
|
+
ValidationFunction.PERCENTILE_80.value: "Percentile80Validation",
|
|
175
|
+
ValidationFunction.PERCENTILE_90.value: "Percentile90Validation",
|
|
176
|
+
ValidationFunction.COUNT_ZERO.value: "CountZeroValidation",
|
|
177
|
+
ValidationFunction.PERCENT_ZERO.value: "PercentZeroValidation",
|
|
178
|
+
ValidationFunction.COUNT_NEGATIVE.value: "CountNegativeValidation",
|
|
179
|
+
ValidationFunction.PERCENT_NEGATIVE.value: "PercentNegativeValidation",
|
|
180
|
+
ValidationFunction.COUNT_ALL_SPACE.value: "CountAllSpaceValidation",
|
|
181
|
+
ValidationFunction.PERCENT_ALL_SPACE.value: "PercentageAllSpaceValidation",
|
|
182
|
+
ValidationFunction.COUNT_NULL_KEYWORD.value: "CountNullKeywordValidation",
|
|
183
|
+
ValidationFunction.PERCENT_NULL_KEYWORD.value: "PercentageNullKeywordValidation",
|
|
184
|
+
ValidationFunction.COUNT_TIMESTAMP_STRING.value: "CountTimeStampValidation",
|
|
185
|
+
ValidationFunction.PERCENT_TIMESTAMP_STRING.value: "PercentTimeStampValidation",
|
|
186
|
+
ValidationFunction.COUNT_NOT_IN_FUTURE.value: "CountNotInFutureValidation",
|
|
187
|
+
ValidationFunction.PERCENT_NOT_IN_FUTURE.value: "PercentNotInFutureValidation",
|
|
188
|
+
ValidationFunction.COUNT_DATE_NOT_IN_FUTURE.value: "CountDateNotInFutureValidation",
|
|
189
|
+
ValidationFunction.PERCENT_DATE_NOT_IN_FUTURE.value: "PercentDateNotInFutureValidation",
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
def __init__(
    self,
    application_configs: Configuration,
    data_source_manager: DataSourceManager,
):
    """Store the configuration and the data source manager.

    :param application_configs: configuration object; its ``validations``
        attribute becomes the initial ``validation_configs``
    :param data_source_manager: kept on the instance as ``data_source_manager``
    """
    self.application_configs = application_configs
    self.data_source_manager = data_source_manager
    self.validation_configs: Dict[str, ValidationConfigByDataset] = application_configs.validations

    # Validations are stored in the following format:
    # {
    #     "data_source_name": {
    #         "dataset_name": {
    #             "validation_name": Validation
    #         }
    #     }
    # }
    self.validations: Dict[str, Dict[str, Dict[str, Validation]]] = {}
|
|
212
|
+
|
|
213
|
+
def set_validation_configs(self, validations: Dict[str, ValidationConfigByDataset]):
    """Replace the validation configurations held by this instance.

    :param validations: new value assigned to ``self.validation_configs``
    """
    self.validation_configs = validations
|
|
215
|
+
|
|
216
|
+
def build_validations(self):
    """Instantiate every configured validation and register it in ``self.validations``.

    For each entry of ``self.validation_configs`` the nested
    ``data source -> dataset`` levels are created if missing, then one object
    is built per configured validation and stored under its name.

    The class to instantiate is resolved by name: ``VALIDATION_CLASS_MAPPING``
    maps the configured validation function to a class name, which is then
    looked up in this module's ``globals()``. Delta validations additionally
    receive the reference data source, dataset name and field name.

    Does nothing when ``self.validation_configs`` is ``None`` or empty.

    :raises KeyError: if the validation function has no entry in
        ``VALIDATION_CLASS_MAPPING`` or the mapped class name is not present
        in this module's globals
    """
    # Tolerate an unset config mapping instead of failing on ``None.items()``.
    for validation_by_dataset in (self.validation_configs or {}).values():
        data_source_name = validation_by_dataset.data_source
        dataset_name = validation_by_dataset.dataset

        # Created even when the dataset has no validations configured.
        dataset_validations = self.validations.setdefault(data_source_name, {}).setdefault(dataset_name, {})

        for validation_name, validation_config in validation_by_dataset.validations.items():
            data_source = self.data_source_manager.get_data_source(data_source_name)
            is_delta = validation_config.get_is_delta_validation

            # The reference data source is resolved before the class lookup so
            # that failures surface in the same order as before.
            reference_data_source = None
            if is_delta:
                reference_data_source = self.data_source_manager.get_data_source(
                    validation_config.get_ref_data_source_name
                )

            validation_class = globals()[self.VALIDATION_CLASS_MAPPING[validation_config.get_validation_function]]

            init_kwargs = {
                "name": validation_name,
                "data_source": data_source,
                "dataset_name": dataset_name,
                "validation_name": validation_name,
                "validation_config": validation_config,
                "field_name": validation_config.get_validation_field_name,
            }
            if is_delta:
                init_kwargs["reference_data_source"] = reference_data_source
                init_kwargs["reference_dataset_name"] = validation_config.get_ref_dataset_name
                init_kwargs["reference_field_name"] = validation_config.get_ref_field_name

            dataset_validations[validation_name] = validation_class(**init_kwargs)
|
|
264
|
+
|
|
265
|
+
def add_validation(self, validation: Validation):
    """Register an already-built validation in ``self.validations``.

    The entry is keyed by the validation's data source name, dataset name and
    own name; missing intermediate levels are created, and an existing entry
    with the same three keys is overwritten.

    :param validation: the validation object to register
    """
    source_key = validation.data_source.data_source_name
    dataset_key = validation.dataset_name
    entry_key = validation.name

    per_dataset = self.validations.setdefault(source_key, {}).setdefault(dataset_key, {})
    per_dataset[entry_key] = validation
|
|
276
|
+
|
|
277
|
+
@property
def get_validations(self):
    """Return the nested ``validations`` mapping itself (not a copy)."""
    return self.validations
|
|
280
|
+
|
|
281
|
+
def get_validation(self, data_source_name: str, dataset_name: str, validation_name: str) -> Validation:
    """Look up a single registered validation.

    :param data_source_name: first-level key in ``self.validations``
    :param dataset_name: second-level key
    :param validation_name: third-level key
    :return: the stored validation object
    :raises KeyError: if any of the three keys is missing
    """
    by_dataset = self.validations[data_source_name]
    by_name = by_dataset[dataset_name]
    return by_name[validation_name]
|