dcs-sdk 1.6.4__py3-none-any.whl → 1.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dcs_core/__init__.py +13 -0
- dcs_core/__main__.py +17 -0
- dcs_core/__version__.py +15 -0
- dcs_core/cli/__init__.py +13 -0
- dcs_core/cli/cli.py +165 -0
- dcs_core/core/__init__.py +19 -0
- dcs_core/core/common/__init__.py +13 -0
- dcs_core/core/common/errors.py +50 -0
- dcs_core/core/common/models/__init__.py +13 -0
- dcs_core/core/common/models/configuration.py +284 -0
- dcs_core/core/common/models/dashboard.py +24 -0
- dcs_core/core/common/models/data_source_resource.py +75 -0
- dcs_core/core/common/models/metric.py +160 -0
- dcs_core/core/common/models/profile.py +75 -0
- dcs_core/core/common/models/validation.py +216 -0
- dcs_core/core/common/models/widget.py +44 -0
- dcs_core/core/configuration/__init__.py +13 -0
- dcs_core/core/configuration/config_loader.py +139 -0
- dcs_core/core/configuration/configuration_parser.py +262 -0
- dcs_core/core/configuration/configuration_parser_arc.py +328 -0
- dcs_core/core/datasource/__init__.py +13 -0
- dcs_core/core/datasource/base.py +62 -0
- dcs_core/core/datasource/manager.py +112 -0
- dcs_core/core/datasource/search_datasource.py +421 -0
- dcs_core/core/datasource/sql_datasource.py +1094 -0
- dcs_core/core/inspect.py +163 -0
- dcs_core/core/logger/__init__.py +13 -0
- dcs_core/core/logger/base.py +32 -0
- dcs_core/core/logger/default_logger.py +94 -0
- dcs_core/core/metric/__init__.py +13 -0
- dcs_core/core/metric/base.py +220 -0
- dcs_core/core/metric/combined_metric.py +98 -0
- dcs_core/core/metric/custom_metric.py +34 -0
- dcs_core/core/metric/manager.py +137 -0
- dcs_core/core/metric/numeric_metric.py +403 -0
- dcs_core/core/metric/reliability_metric.py +90 -0
- dcs_core/core/profiling/__init__.py +13 -0
- dcs_core/core/profiling/datasource_profiling.py +136 -0
- dcs_core/core/profiling/numeric_field_profiling.py +72 -0
- dcs_core/core/profiling/text_field_profiling.py +67 -0
- dcs_core/core/repository/__init__.py +13 -0
- dcs_core/core/repository/metric_repository.py +77 -0
- dcs_core/core/utils/__init__.py +13 -0
- dcs_core/core/utils/log.py +29 -0
- dcs_core/core/utils/tracking.py +105 -0
- dcs_core/core/utils/utils.py +44 -0
- dcs_core/core/validation/__init__.py +13 -0
- dcs_core/core/validation/base.py +230 -0
- dcs_core/core/validation/completeness_validation.py +153 -0
- dcs_core/core/validation/custom_query_validation.py +24 -0
- dcs_core/core/validation/manager.py +282 -0
- dcs_core/core/validation/numeric_validation.py +276 -0
- dcs_core/core/validation/reliability_validation.py +91 -0
- dcs_core/core/validation/uniqueness_validation.py +61 -0
- dcs_core/core/validation/validity_validation.py +738 -0
- dcs_core/integrations/__init__.py +13 -0
- dcs_core/integrations/databases/__init__.py +13 -0
- dcs_core/integrations/databases/bigquery.py +187 -0
- dcs_core/integrations/databases/databricks.py +51 -0
- dcs_core/integrations/databases/db2.py +652 -0
- dcs_core/integrations/databases/elasticsearch.py +61 -0
- dcs_core/integrations/databases/mssql.py +979 -0
- dcs_core/integrations/databases/mysql.py +409 -0
- dcs_core/integrations/databases/opensearch.py +64 -0
- dcs_core/integrations/databases/oracle.py +719 -0
- dcs_core/integrations/databases/postgres.py +570 -0
- dcs_core/integrations/databases/redshift.py +53 -0
- dcs_core/integrations/databases/snowflake.py +48 -0
- dcs_core/integrations/databases/spark_df.py +111 -0
- dcs_core/integrations/databases/sybase.py +1069 -0
- dcs_core/integrations/storage/__init__.py +13 -0
- dcs_core/integrations/storage/local_file.py +149 -0
- dcs_core/integrations/utils/__init__.py +13 -0
- dcs_core/integrations/utils/utils.py +36 -0
- dcs_core/report/__init__.py +13 -0
- dcs_core/report/dashboard.py +211 -0
- dcs_core/report/models.py +88 -0
- dcs_core/report/static/assets/fonts/DMSans-Bold.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Medium.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-Regular.ttf +0 -0
- dcs_core/report/static/assets/fonts/DMSans-SemiBold.ttf +0 -0
- dcs_core/report/static/assets/images/docs.svg +6 -0
- dcs_core/report/static/assets/images/github.svg +4 -0
- dcs_core/report/static/assets/images/logo.svg +7 -0
- dcs_core/report/static/assets/images/slack.svg +13 -0
- dcs_core/report/static/index.js +2 -0
- dcs_core/report/static/index.js.LICENSE.txt +3971 -0
- dcs_sdk/__version__.py +1 -1
- dcs_sdk/cli/cli.py +3 -0
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/METADATA +24 -2
- dcs_sdk-1.6.6.dist-info/RECORD +159 -0
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/entry_points.txt +1 -0
- dcs_sdk-1.6.4.dist-info/RECORD +0 -72
- {dcs_sdk-1.6.4.dist-info → dcs_sdk-1.6.6.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
# Copyright 2022-present, the Waterdip Labs Pvt. Ltd.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from datetime import datetime, timezone
|
|
16
|
+
from typing import Dict, List
|
|
17
|
+
|
|
18
|
+
from dateutil import parser
|
|
19
|
+
|
|
20
|
+
from dcs_core.core.datasource.base import DataSource
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class SearchIndexDataSource(DataSource):
|
|
24
|
+
"""
|
|
25
|
+
Abstract class for search index data sources
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
FIELD_TYPE_MAPPING = {
|
|
29
|
+
"text": str,
|
|
30
|
+
"keyword": str,
|
|
31
|
+
"date": datetime,
|
|
32
|
+
"long": int,
|
|
33
|
+
"integer": int,
|
|
34
|
+
"short": int,
|
|
35
|
+
"byte": int,
|
|
36
|
+
"double": float,
|
|
37
|
+
"float": float,
|
|
38
|
+
"half_float": float,
|
|
39
|
+
"boolean": bool,
|
|
40
|
+
"binary": str,
|
|
41
|
+
"nested": dict,
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
def __init__(self, data_source_name: str, data_connection: Dict):
    """
    Set up the common data source state
    :param data_source_name: name handed to the DataSource base class
    :param data_connection: connection settings handed to the DataSource base class
    """
    super().__init__(data_source_name, data_connection)

    # Every query method of this class goes through self.client.
    # NOTE(review): presumably assigned by a concrete subclass when it connects - confirm.
    self.client = None
|
|
48
|
+
|
|
49
|
+
def query_get_index_metadata(self) -> List[str]:
    """
    Get the names of all indices visible to the client
    :return: list of index names
    """
    # Pass the pattern by keyword: clients with keyword-only APIs reject a
    # positional argument, while the keyword form is accepted everywhere.
    return list(self.client.indices.get(index="*"))
|
|
55
|
+
|
|
56
|
+
def query_get_field_metadata(self, index_name: str) -> Dict[str, type]:
    """
    Get the Python type of every top-level field of an index
    :param index_name: name of the index
    :return: mapping of field name to the Python type taken from FIELD_TYPE_MAPPING;
        fields whose type is not listed there are left out
    """
    mappings = self.client.indices.get_mapping(index=index_name)
    # An index without any mapped field has no "properties" section at all.
    properties = mappings[index_name]["mappings"].get("properties", {})

    results_: Dict[str, type] = {}
    for field, value in properties.items():
        python_type = self.FIELD_TYPE_MAPPING.get(value.get("type"))
        if python_type is not None:
            results_[field] = python_type
        elif "properties" in value:
            # Object fields describe themselves through sub-properties.
            results_[field] = self.FIELD_TYPE_MAPPING["nested"]
        # Otherwise the type has no entry in FIELD_TYPE_MAPPING; skipping it
        # keeps one unsupported field from failing the lookup for all others.

    return results_
|
|
73
|
+
|
|
74
|
+
def query_get_field_type(self, index_name: str, field: str) -> str:
    """
    Look up the type recorded for one field of an index
    :param index_name: name of the index
    :param field: field name
    :return: the entry for the field in the result of query_get_field_metadata
    """
    # A field that is absent from the metadata surfaces as a KeyError.
    return self.query_get_field_metadata(index_name=index_name)[field]
|
|
83
|
+
|
|
84
|
+
def query_get_document_count(self, index_name: str, filters: Dict = None) -> int:
    """
    Count the documents of an index
    :param index_name: name of the index
    :param filters: optional query clause; when given it is sent as the "query" of the count request
    :return: the "count" value of the response
    """
    request_body = {}
    if filters:
        request_body["query"] = filters
    return self.client.count(index=index_name, body=request_body)["count"]
|
|
94
|
+
|
|
95
|
+
def query_get_max(self, index_name: str, field: str, filters: Dict = None) -> int:
    """
    Get the largest value of a field
    :param index_name: name of the index
    :param field: field name
    :param filters: optional query clause restricting the documents that are aggregated
    :return: the "value" reported by the max aggregation
    """
    agg_name = "max_value"
    request_body = {"aggs": {agg_name: {"max": {"field": field}}}}
    if filters:
        request_body["query"] = filters

    result = self.client.search(index=index_name, body=request_body)
    return result["aggregations"][agg_name]["value"]
|
|
109
|
+
|
|
110
|
+
def query_get_min(self, index_name: str, field: str, filters: Dict = None) -> int:
    """
    Get the smallest value of a field
    :param index_name: name of the index
    :param field: field name
    :param filters: optional query clause restricting the documents that are aggregated
    :return: the "value" reported by the min aggregation
    """
    request_body = {"aggs": {"min_value": {"min": {"field": field}}}}
    if filters:
        request_body.update(query=filters)

    aggregations = self.client.search(index=index_name, body=request_body)["aggregations"]
    return aggregations["min_value"]["value"]
|
|
124
|
+
|
|
125
|
+
def query_get_avg(self, index_name: str, field: str, filters: Dict = None) -> int:
    """
    Get the average value of a field
    :param index_name: name of the index
    :param field: field name
    :param filters: optional query clause restricting the documents that are aggregated
    :return: average rounded to two decimals, or None when the aggregation has no value
    """
    query = {"aggs": {"avg_value": {"avg": {"field": field}}}}
    if filters:
        query["query"] = filters

    response = self.client.search(index=index_name, body=query)
    value = response["aggregations"]["avg_value"]["value"]
    # The aggregation reports null when no document contributes a value;
    # round(None, 2) would raise a TypeError, so hand the None through.
    return None if value is None else round(value, 2)
|
|
139
|
+
|
|
140
|
+
def query_get_sum(self, index_name: str, field: str, filters: Dict = None) -> int:
    """
    Get the sum of a field
    :param index_name: name of the index
    :param field: field name
    :param filters: optional query clause restricting the documents that are aggregated
    :return: the "value" of the sum aggregation rounded to two decimals
    """
    agg_name = "sum_value"
    request_body = {"aggs": {agg_name: {"sum": {"field": field}}}}
    if filters:
        request_body["query"] = filters

    result = self.client.search(index=index_name, body=request_body)
    total = result["aggregations"][agg_name]["value"]
    return round(total, 2)
|
|
154
|
+
|
|
155
|
+
def query_get_variance(self, index_name: str, field: str, filters: Dict = None) -> int:
    """
    Get the sample variance of a field
    :param index_name: name of the index
    :param field: field name
    :param filters: optional query clause restricting the documents that are aggregated
    :return: "variance_sampling" of the extended_stats aggregation rounded to two
        decimals, or None when the aggregation has no value
    """
    query = {"aggs": {"stats": {"extended_stats": {"field": field}}}}
    if filters:
        query["query"] = filters

    response = self.client.search(index=index_name, body=query)["aggregations"]
    variance = response["stats"]["variance_sampling"]
    # The statistic is null when there is nothing to aggregate;
    # round(None, 2) would raise a TypeError, so hand the None through.
    return None if variance is None else round(variance, 2)
|
|
169
|
+
|
|
170
|
+
def query_get_stddev(self, index_name: str, field: str, filters: Dict = None) -> float:
    """
    Get the sample standard deviation of a field
    :param index_name: name of the index
    :param field: field name
    :param filters: optional query clause restricting the documents that are aggregated
    :return: "std_deviation_sampling" of the extended_stats aggregation rounded to
        two decimals, or None when the aggregation has no value
    """
    query = {"aggs": {"stats": {"extended_stats": {"field": field}}}}
    if filters:
        query["query"] = filters

    response = self.client.search(index=index_name, body=query)["aggregations"]
    stddev = response["stats"]["std_deviation_sampling"]
    # The statistic is null when there is nothing to aggregate;
    # round(None, 2) would raise a TypeError, so hand the None through.
    return None if stddev is None else round(stddev, 2)
|
|
184
|
+
|
|
185
|
+
def query_get_distinct_count(self, index_name: str, field: str, filters: Dict = None) -> int:
    """
    Get the number of distinct values of a field
    :param index_name: name of the index
    :param field: field name
    :param filters: optional query clause restricting the documents that are aggregated
    :return: the "value" reported by the cardinality aggregation
    """
    agg_name = "distinct_count"
    request_body = {"aggs": {agg_name: {"cardinality": {"field": field}}}}
    if filters:
        request_body["query"] = filters

    result = self.client.search(index=index_name, body=request_body)
    return result["aggregations"][agg_name]["value"]
|
|
199
|
+
|
|
200
|
+
def query_get_time_diff(self, index_name: str, field: str) -> int:
    """
    Get the age of the most recent value of a date field
    :param index_name: name of the index
    :param field: name of the date field
    :return: whole seconds between now (UTC) and the newest value of the field,
        or 0 when the search returns no hits
    """
    query = {
        # Only the newest document is read, so do not fetch a whole page.
        "size": 1,
        "query": {"match_all": {}},
        "sort": [{f"{field}": {"order": "desc"}}],
    }

    response = self.client.search(index=index_name, body=query)
    hits = response["hits"]["hits"]
    if not hits:
        return 0

    last_updated = parser.parse(timestr=hits[0]["_source"][field])
    if last_updated.tzinfo is None:
        # A timestamp without an offset would otherwise be interpreted in the
        # local timezone of this process; treat it as UTC, matching "now" below.
        last_updated = last_updated.replace(tzinfo=timezone.utc)

    now = datetime.now(timezone.utc)
    return int(now.timestamp() - last_updated.timestamp())
|
|
220
|
+
|
|
221
|
+
def query_get_null_count(self, index_name: str, field: str, filters: Dict = None) -> int:
    """
    Get the number of documents that have no value for a field
    :param index_name: name of the index
    :param field: field name
    :param filters: optional query clause, applied as a bool filter
    :return: null count
    """
    query = {
        # Ask for the exact total; by default the reported hit count stops
        # being exact beyond a threshold. query_string_pattern_validity
        # sets the same flag.
        "track_total_hits": True,
        "query": {"bool": {"must_not": {"exists": {"field": field}}}},
    }
    if filters:
        query["query"]["bool"]["filter"] = filters

    response = self.client.search(index=index_name, body=query)
    return response["hits"]["total"]["value"]
|
|
234
|
+
|
|
235
|
+
def query_get_null_percentage(self, index_name: str, field: str, filters: Dict = None) -> float:
    """
    Get the share of documents that have no value for a field
    :param index_name: name of the index
    :param field: field name
    :param filters: optional query clause restricting the documents that are considered
    :return: null percentage (0-100) rounded to two decimals; 0.0 when no document matches
    """
    query = {
        "size": 0,
        # The exact number of matching documents is the denominator.
        "track_total_hits": True,
        "aggs": {"null_count": {"missing": {"field": field}}},
    }
    if filters:
        query["query"] = filters

    response = self.client.search(index=index_name, body=query)
    total_count = response["hits"]["total"]["value"]
    null_count = response["aggregations"]["null_count"]["doc_count"]

    # Divide by all matching documents, not by a value count of the field:
    # the latter leaves out the very documents being measured and is zero
    # when every document is null.
    if total_count == 0:
        return 0.0

    return round((null_count / total_count) * 100, 2)
|
|
258
|
+
|
|
259
|
+
def query_get_empty_string_count(self, index_name: str, field: str, filters: Dict = None) -> int:
    """
    Get the number of documents whose field is an empty string
    :param index_name: name of the index
    :param field: field name; its ".keyword" sub-field is the one queried
    :param filters: optional query clause, applied as a bool filter
    :return: count of empty strings
    """
    query = {
        # Ask for the exact total; by default the reported hit count stops
        # being exact beyond a threshold. query_string_pattern_validity
        # sets the same flag.
        "track_total_hits": True,
        "query": {"bool": {"must": {"match": {f"{field}.keyword": ""}}}},
    }
    if filters:
        query["query"]["bool"]["filter"] = filters

    response = self.client.search(index=index_name, body=query)
    return response["hits"]["total"]["value"]
|
|
273
|
+
|
|
274
|
+
def query_get_empty_string_percentage(self, index_name: str, field: str, filters: Dict = None) -> float:
    """
    Get the share of empty strings among the values of a field
    :param index_name: name of the index
    :param field: field name; its ".keyword" sub-field is the one queried
    :param filters: optional query clause restricting the documents that are aggregated
    :return: empty string percentage rounded to two decimals; 0.0 when the value count is 0
    """
    keyword_field = f"{field}.keyword"
    request_body = {
        "size": 0,
        "aggs": {
            "empty_string_count": {"filter": {"match": {keyword_field: ""}}},
            "total_count": {"value_count": {"field": keyword_field}},
        },
    }
    if filters:
        request_body["query"] = filters

    aggregations = self.client.search(index=index_name, body=request_body)["aggregations"]
    value_total = aggregations["total_count"]["value"]
    empty_total = aggregations["empty_string_count"]["doc_count"]

    # Guard the division: nothing counted means nothing can be empty.
    if value_total != 0:
        return round((empty_total / value_total) * 100, 2)
    return 0.0
|
|
302
|
+
|
|
303
|
+
def profiling_search_aggregates_numeric(self, index_name: str, field: str) -> Dict:
    """
    Get the profiling aggregates for a numeric field
    :param index_name: name of the index
    :param field: field name
    :return: dict with avg, min, max, sum, stddev, variance, distinct_count and
        missing_count; stddev and variance are both the sampling variants
    """
    query = {
        "aggs": {
            "stats": {"extended_stats": {"field": field}},
            "distinct_count": {"cardinality": {"field": field}},
            "missing_count": {"missing": {"field": field}},
        }
    }
    response = self.client.search(index=index_name, body=query)["aggregations"]
    stats = response["stats"]

    return {
        "avg": stats["avg"],
        "min": stats["min"],
        "max": stats["max"],
        "sum": stats["sum"],
        # Use the sampling deviation so it belongs to the sampling variance
        # reported next to it and agrees with query_get_stddev.
        "stddev": stats["std_deviation_sampling"],
        "variance": stats["variance_sampling"],
        "distinct_count": response["distinct_count"]["value"],
        "missing_count": response["missing_count"]["doc_count"],
    }
|
|
330
|
+
|
|
331
|
+
def profiling_search_aggregates_string(self, index_name: str, field: str) -> Dict:
    """
    Get the profiling aggregates for a text field
    :param index_name: name of the index
    :param field: field name; counts are taken from its ".keyword" sub-field,
        lengths from the value stored in _source
    :return: dict with distinct_count, missing_count, max_length, min_length and avg_length
    """
    # The field name travels as a script parameter instead of being pasted
    # into the script text: names that are not valid script identifiers
    # (e.g. containing "-" or a quote) no longer break the script, and the
    # script source is the same for every field. A missing or null value
    # counts as length 0.
    script = {
        "script": {
            "source": (
                "params._source.containsKey(params.field) && params._source[params.field] != null"
                " ? params._source[params.field].length() : 0"
            ),
            "params": {"field": field},
        }
    }
    query = {
        "aggs": {
            "max_length": {"max": script},
            "min_length": {"min": script},
            "avg_length": {"avg": script},
            "distinct_count": {"cardinality": {"field": f"{field}.keyword"}},
            "missing_count": {"missing": {"field": f"{field}.keyword"}},
        }
    }

    response = self.client.search(index=index_name, body=query)["aggregations"]

    return {
        "distinct_count": response["distinct_count"]["value"],
        "missing_count": response["missing_count"]["doc_count"],
        "max_length": response["max_length"]["value"],
        "min_length": response["min_length"]["value"],
        "avg_length": response["avg_length"]["value"],
    }
|
|
358
|
+
|
|
359
|
+
def query_get_duplicate_count(self, index_name: str, field: str, filters: Dict = None) -> int:
    """
    Get the number of distinct values that occur in more than one document
    :param index_name: name of the index
    :param field: field name
    :param filters: optional query clause restricting the documents that are aggregated
    :return: number of term buckets with at least two documents (at most 10000,
        the bucket limit requested below)
    """
    # Decide from the raw mapping which field to aggregate on. The Python type
    # from FIELD_TYPE_MAPPING cannot tell "text" from "keyword", and only a
    # text field needs to be redirected to its ".keyword" sub-field.
    mappings = self.client.indices.get_mapping(index=index_name)
    field_mapping = mappings[index_name]["mappings"]["properties"][field]
    has_keyword_subfield = "keyword" in field_mapping.get("fields", {})
    if field_mapping.get("type") == "text" and has_keyword_subfield:
        terms_field = f"{field}.keyword"
    else:
        terms_field = field

    query = {
        "aggs": {
            "duplicate_count": {
                "terms": {
                    "field": terms_field,
                    "size": 10000,
                    "min_doc_count": 2,
                },
            }
        }
    }
    if filters:
        query["query"] = filters
    response = self.client.search(index=index_name, body=query)["aggregations"]

    return len(response["duplicate_count"]["buckets"])
|
|
383
|
+
|
|
384
|
+
def query_string_pattern_validity(
    self,
    index_name: str,
    field: str,
    regex_pattern: str = None,
    predefined_regex_pattern: str = None,
    filters: Dict = None,
) -> tuple:
    """
    Count the documents whose field matches a regular expression
    :param index_name: name of the index
    :param field: field name; its ".keyword" sub-field is the one matched
    :param regex_pattern: regex pattern
    :param predefined_regex_pattern: name of a predefined pattern (currently "usa_phone");
        takes precedence over regex_pattern
    :param filters: optional query clause applied to both counts
    :return: tuple of (count of valid values, total document count)
    :raises ValueError: when neither pattern argument is given
    :raises KeyError: when predefined_regex_pattern is not a known name
    """
    regex_patterns = {"usa_phone": "\\+?1?[-.\\s]?\\(?[0-9]{3}\\)?[-.\\s]?[0-9]{3}[-.\\s]?[0-9]{4}"}

    if not regex_pattern and not predefined_regex_pattern:
        raise ValueError("Either regex_pattern or predefined_regex_pattern should be provided")

    if predefined_regex_pattern:
        regex_string = regex_patterns[predefined_regex_pattern]
    else:
        regex_string = regex_pattern

    regexp_clause = {"regexp": {f"{field}.keyword": regex_string}}
    if filters:
        # A bare regexp query has no "bool" section to hang a filter on, so
        # wrap it. The total is restricted by the same filter so that both
        # counts describe the same set of documents.
        search_query = {"bool": {"must": regexp_clause, "filter": filters}}
        count_query = filters
    else:
        search_query = regexp_clause
        count_query = {"match_all": {}}

    query = {
        "track_total_hits": True,
        "query": search_query,
    }

    response = self.client.search(index=index_name, body=query)
    total_count = self.client.count(index=index_name, body={"query": count_query})
    return response["hits"]["total"]["value"], total_count["count"]
|