icsDataValidation 1.0.378__py3-none-any.whl → 1.0.415__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icsDataValidation/configuration.py +0 -0
- icsDataValidation/connection_setups/__init__.py +0 -0
- icsDataValidation/connection_setups/azure_connection_setup.py +2 -1
- icsDataValidation/connection_setups/databricks_connection_setup.py +0 -0
- icsDataValidation/connection_setups/exasol_connection_setup.py +0 -0
- icsDataValidation/connection_setups/oracle_connection_setup.py +0 -0
- icsDataValidation/connection_setups/snowflake_connection_setup.py +0 -0
- icsDataValidation/connection_setups/sqlserver_connection_setup.py +20 -0
- icsDataValidation/connection_setups/teradata_connection_setup.py +0 -0
- icsDataValidation/core/__init__.py +0 -0
- icsDataValidation/core/database_objects.py +0 -0
- icsDataValidation/core/object_comparison.py +0 -0
- icsDataValidation/input_parameters/__init__.py +0 -0
- icsDataValidation/input_parameters/testing_tool_params.py +4 -3
- icsDataValidation/main.py +15 -11
- icsDataValidation/output_parameters/__init__.py +0 -0
- icsDataValidation/output_parameters/result_params.py +0 -0
- icsDataValidation/services/__init__.py +0 -0
- icsDataValidation/services/comparison_service.py +80 -76
- icsDataValidation/services/database_services/__init__.py +0 -0
- icsDataValidation/services/database_services/azure_service.py +69 -43
- icsDataValidation/services/database_services/databricks_hive_metastore_service.py +20 -7
- icsDataValidation/services/database_services/databricks_unity_catalog_service.py +20 -12
- icsDataValidation/services/database_services/exasol_service.py +26 -23
- icsDataValidation/services/database_services/oracle_service.py +64 -55
- icsDataValidation/services/database_services/snowflake_service.py +85 -36
- icsDataValidation/services/database_services/sqlserver_service.py +868 -0
- icsDataValidation/services/database_services/teradata_service.py +54 -37
- icsDataValidation/services/initialization_service.py +0 -0
- icsDataValidation/services/result_service.py +0 -0
- icsDataValidation/services/system_service.py +4 -0
- icsDataValidation/services/testset_service.py +0 -0
- icsDataValidation/utils/__init__.py +0 -0
- icsDataValidation/utils/file_util.py +0 -0
- icsDataValidation/utils/logger_util.py +0 -0
- icsDataValidation/utils/pandas_util.py +0 -0
- icsDataValidation/utils/parallelization_util.py +0 -0
- icsDataValidation/utils/sql_util.py +0 -0
- icsdatavalidation-1.0.415.dist-info/METADATA +298 -0
- {icsDataValidation-1.0.378.dist-info → icsdatavalidation-1.0.415.dist-info}/RECORD +18 -18
- {icsDataValidation-1.0.378.dist-info → icsdatavalidation-1.0.415.dist-info}/WHEEL +1 -1
- icsdatavalidation-1.0.415.dist-info/top_level.txt +1 -0
- examples/ics_data_validation.py +0 -7
- examples/manual_execution_params.template.py +0 -44
- icsDataValidation-1.0.378.dist-info/METADATA +0 -20
- icsDataValidation-1.0.378.dist-info/top_level.txt +0 -4
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import pyodbc
|
|
2
|
-
import pandas.io.sql
|
|
2
|
+
import pandas.io.sql
|
|
3
3
|
import pandas as pd
|
|
4
4
|
import logging
|
|
5
5
|
|
|
6
|
-
from typing import Union, List, Dict
|
|
6
|
+
from typing import Union, List, Dict
|
|
7
7
|
|
|
8
8
|
from icsDataValidation.utils.logger_util import configure_dev_ops_logger
|
|
9
9
|
from icsDataValidation.core.database_objects import DatabaseObject
|
|
@@ -17,7 +17,7 @@ logger = logging.getLogger('Azure_Service')
|
|
|
17
17
|
logger.setLevel(logging.INFO)
|
|
18
18
|
configure_dev_ops_logger(logger)
|
|
19
19
|
|
|
20
|
-
class AzureService
|
|
20
|
+
class AzureService:
|
|
21
21
|
def __init__(self, connection_params: dict):
|
|
22
22
|
self.connection_params =connection_params
|
|
23
23
|
self.azure_connection = None
|
|
@@ -34,12 +34,17 @@ class AzureService(object):
|
|
|
34
34
|
self.azure_connection.close()
|
|
35
35
|
|
|
36
36
|
def _connect_to_azure(self):
|
|
37
|
-
azure_connection_string =
|
|
38
|
-
|
|
37
|
+
azure_connection_string = (
|
|
38
|
+
f"DRIVER={self.connection_params['Driver']};"
|
|
39
|
+
f"SERVER={self.connection_params['Server']};"
|
|
40
|
+
f"PORT={self.connection_params['Port']};"
|
|
41
|
+
f"DATABASE={self.connection_params['Database']};"
|
|
42
|
+
f"UID={self.connection_params['User']};"
|
|
43
|
+
f"PWD={self.connection_params['Password']}"
|
|
44
|
+
)
|
|
39
45
|
self.azure_connection = pyodbc.connect(azure_connection_string)
|
|
40
46
|
return self.azure_connection
|
|
41
47
|
|
|
42
|
-
|
|
43
48
|
@staticmethod
|
|
44
49
|
def _get_error_message(excepction: Exception, statement: str) -> None:
|
|
45
50
|
"""
|
|
@@ -65,21 +70,20 @@ class AzureService(object):
|
|
|
65
70
|
|
|
66
71
|
if object_type_restriction=='include_all' or object_type_restriction=='include_only_tables':
|
|
67
72
|
if schema:
|
|
68
|
-
query_db_tables=f"select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.tables t where schema_name(t.schema_id) = '{
|
|
69
|
-
else:
|
|
73
|
+
query_db_tables=f"select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.tables t where schema_name(t.schema_id) = '{schema}' order by schema_name;"
|
|
74
|
+
else:
|
|
70
75
|
query_db_tables=f"select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.tables t order by schema_name;"
|
|
71
76
|
|
|
72
77
|
all_database_tables = self.execute_queries(query_db_tables)
|
|
73
78
|
|
|
74
|
-
|
|
75
79
|
elif object_type_restriction=='include_all' or object_type_restriction=='include_only_views':
|
|
76
80
|
if schema:
|
|
77
|
-
query_db_views=f"select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.views t where schema_name(t.schema_id) = '{
|
|
78
|
-
else:
|
|
81
|
+
query_db_views=f"select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.views t where schema_name(t.schema_id) = '{schema}' order by schema_name;"
|
|
82
|
+
else:
|
|
79
83
|
query_db_views=f"select schema_name(t.schema_id) as schema_name, t.name as table_name from sys.views t order by schema_name;"
|
|
80
84
|
|
|
81
85
|
all_database_views = self.execute_queries(query_db_views)
|
|
82
|
-
|
|
86
|
+
|
|
83
87
|
database_objects=[]
|
|
84
88
|
for row in all_database_tables:
|
|
85
89
|
database_table=f'{database}.{row[0].upper()}.{row[1].upper()}'
|
|
@@ -110,13 +114,13 @@ class AzureService(object):
|
|
|
110
114
|
def get_row_count_from_object(self, object : DatabaseObject) -> int:
|
|
111
115
|
if self.azure_connection is None:
|
|
112
116
|
self._connect_to_azure()
|
|
113
|
-
|
|
117
|
+
|
|
114
118
|
query_get_row_count = f"select count(*) as ROW_COUNT from {object.schema}.{object.name};"
|
|
115
119
|
|
|
116
120
|
row_count = self.execute_queries(query_get_row_count).fetchall()[0][0]
|
|
117
121
|
|
|
118
122
|
return row_count
|
|
119
|
-
|
|
123
|
+
|
|
120
124
|
def get_data_types_from_object(self, object : DatabaseObject, column_intersections: list) -> dict:
|
|
121
125
|
results = []
|
|
122
126
|
|
|
@@ -124,27 +128,25 @@ class AzureService(object):
|
|
|
124
128
|
self._connect_to_azure()
|
|
125
129
|
|
|
126
130
|
column_intersections = str(column_intersections)[1:-1]
|
|
127
|
-
query_get_data_types_from_object=f"select col.name, t.name as data_type from sys.tables as tab inner join sys.columns as col on tab.object_id = col.object_id left join sys.types as t on col.user_type_id = t.user_type_id where tab.name = '{object.name}' and schema_name(tab.schema_id) = '{object.schema}'"
|
|
131
|
+
query_get_data_types_from_object=f"select col.name, t.name as data_type from sys.tables as tab inner join sys.columns as col on tab.object_id = col.object_id left join sys.types as t on col.user_type_id = t.user_type_id where tab.name = '{object.name}' and schema_name(tab.schema_id) = '{object.schema}'"
|
|
128
132
|
dict_colummns_datatype=self.execute_queries(query_get_data_types_from_object).fetchall()
|
|
129
133
|
|
|
130
|
-
for row in dict_colummns_datatype
|
|
131
|
-
# logger.info(type(row))
|
|
132
|
-
row_to_list = [elem for elem in row]
|
|
133
|
-
results.append({"COLUMN_NAME":row_to_list[0],"DATA_TYPE":row_to_list[1]})
|
|
134
|
+
results = [{"COLUMN_NAME":row[0],"DATA_TYPE":row[1]} for row in dict_colummns_datatype]
|
|
134
135
|
|
|
135
136
|
return results
|
|
136
137
|
|
|
137
|
-
def get_count_distincts_from_object(self, object : DatabaseObject, column_intersections: list
|
|
138
|
+
def get_count_distincts_from_object(self, object : DatabaseObject, column_intersections: list,
|
|
139
|
+
enclose_column_by_double_quotes: bool = False) -> dict:
|
|
138
140
|
if self.azure_connection is None:
|
|
139
141
|
self._connect_to_azure()
|
|
140
142
|
|
|
141
143
|
unions=""
|
|
142
144
|
for column in column_intersections:
|
|
143
145
|
unions +=f"UNION SELECT '{column}' AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.schema}.{object.name}"
|
|
144
|
-
|
|
146
|
+
|
|
145
147
|
query_get_count_distincts_from_object=f"{unions[5:]} ORDER BY COUNT_DISTINCT;"
|
|
146
148
|
dict_count_distincts=self.execute_queries(query_get_count_distincts_from_object).fetchall()
|
|
147
|
-
|
|
149
|
+
|
|
148
150
|
return dict_count_distincts
|
|
149
151
|
|
|
150
152
|
def get_table_size(self, object : DatabaseObject) -> int:
|
|
@@ -154,7 +156,25 @@ class AzureService(object):
|
|
|
154
156
|
|
|
155
157
|
return size
|
|
156
158
|
|
|
157
|
-
def create_checksums(
|
|
159
|
+
def create_checksums(
|
|
160
|
+
self,
|
|
161
|
+
object : DatabaseObject,
|
|
162
|
+
column_intersections: list,
|
|
163
|
+
enclose_column_by_double_quotes: bool = False
|
|
164
|
+
) -> List[Dict]:
|
|
165
|
+
"""creates checksums for given object in compliance with given conditions
|
|
166
|
+
|
|
167
|
+
Args:
|
|
168
|
+
object (DatabaseObject): table or view
|
|
169
|
+
column_intersections (list): columns that are used for checksums
|
|
170
|
+
where_clause (str, optional): Optional filter criteria given as sql-usable string. Defaults to "".
|
|
171
|
+
exclude_columns (list, optional): columns to exlude from calculation. Defaults to [].
|
|
172
|
+
numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.
|
|
173
|
+
|
|
174
|
+
Returns:
|
|
175
|
+
List[Dict]: checksums for columns of object
|
|
176
|
+
"""
|
|
177
|
+
|
|
158
178
|
if self.azure_connection is None:
|
|
159
179
|
self._connect_to_azure()
|
|
160
180
|
|
|
@@ -162,7 +182,7 @@ class AzureService(object):
|
|
|
162
182
|
|
|
163
183
|
dict_colummns_datatype=self.get_data_types_from_object(object, column_intersections)
|
|
164
184
|
|
|
165
|
-
# dict_colummns_datatype_dict = dict(zip(dict_colummns_datatype[::2], dict_colummns_datatype[1::2]))
|
|
185
|
+
# dict_colummns_datatype_dict = dict(zip(dict_colummns_datatype[::2], dict_colummns_datatype[1::2]))
|
|
166
186
|
|
|
167
187
|
aggregates = ""
|
|
168
188
|
count_nulls = ""
|
|
@@ -174,13 +194,13 @@ class AzureService(object):
|
|
|
174
194
|
count_nulls += f", sum(case when {column} is null then 1 else 0 end) as countnulls_{column}"
|
|
175
195
|
|
|
176
196
|
if column_datatype.lower() == 'tinyint' or column_datatype.lower() == 'smallint' or column_datatype.lower() == 'int' or column_datatype.lower() == 'bigint' or column_datatype.lower() == 'decimal' or column_datatype.lower() == 'numeric' or column_datatype.lower() == 'smallmoney' or column_datatype.lower() == 'money' or column_datatype.lower() == 'float' or column_datatype.lower() == 'real':
|
|
177
|
-
|
|
197
|
+
|
|
178
198
|
aggregates += f", sum({column}) as sum_{column}"
|
|
179
199
|
|
|
180
200
|
elif column_datatype.lower() == 'char' or column_datatype.lower() == 'varchar' or column_datatype.lower() == 'varchar' or column_datatype.lower() == 'text' or column_datatype.lower() == 'nchar' or column_datatype.lower() == 'nvarchar' or column_datatype.lower() == 'nvarchar' or column_datatype.lower() == 'ntext' or column_datatype.lower() == 'binary' or column_datatype.lower() == 'varbinary' or column_datatype.lower() == 'varbinary' or column_datatype.lower() == 'datetime' or column_datatype.lower() == 'datetime2' or column_datatype.lower() == 'smalldatetime' or column_datatype.lower() == 'date' or column_datatype.lower() == 'time' or column_datatype.lower() == 'datetimeoffset' or column_datatype.lower() == 'timestamp':
|
|
181
|
-
|
|
201
|
+
|
|
182
202
|
aggregates += f", count(distinct lower({column})) as countdistinct_{column}"
|
|
183
|
-
|
|
203
|
+
|
|
184
204
|
elif column_datatype.lower() == 'bit':
|
|
185
205
|
|
|
186
206
|
aggregates += f", (SELECT CONCAT ((select count(*) as val FROM {object.schema}.{object.name} WHERE {column} = 1),'_',(select count(*) as val from {object.schema}.{object.name} WHERE {column} = 0))) AS aggregateboolean_{column}"
|
|
@@ -210,26 +230,27 @@ class AzureService(object):
|
|
|
210
230
|
agg_result = 0
|
|
211
231
|
else:
|
|
212
232
|
agg_result = aggregation_results[i]
|
|
213
|
-
|
|
233
|
+
|
|
214
234
|
if countnulls_results[i] is None:
|
|
215
235
|
cnt_result = 0
|
|
216
236
|
else:
|
|
217
237
|
cnt_result = countnulls_results[i]
|
|
218
|
-
|
|
238
|
+
|
|
219
239
|
test_list.append([[item.split("_", 1)[0] for item in aggregation_columns][i],agg_result,cnt_result])
|
|
220
240
|
|
|
221
241
|
checksums = dict(zip([item.split("_", 1)[1] for item in aggregation_columns] , test_list))
|
|
222
242
|
|
|
223
243
|
return checksums
|
|
224
244
|
|
|
225
|
-
def create_pandas_df_from_group_by(self, object : DatabaseObject, column_intersections: list, group_by_column: str
|
|
245
|
+
def create_pandas_df_from_group_by(self, object : DatabaseObject, column_intersections: list, group_by_column: str,
|
|
246
|
+
enclose_column_by_double_quotes: bool = False) -> List[Dict]:
|
|
226
247
|
|
|
227
248
|
if self.teradata_connection is None:
|
|
228
249
|
self._connect_to_teradata()
|
|
229
250
|
|
|
230
251
|
aggregation_columns= [f"{column.upper()}" for column in column_intersections if column != group_by_column]
|
|
231
252
|
|
|
232
|
-
dict_colummns_datatype=self.get_data_types_from_object(object, aggregation_columns)
|
|
253
|
+
dict_colummns_datatype=self.get_data_types_from_object(object, aggregation_columns)
|
|
233
254
|
|
|
234
255
|
aggregates = ""
|
|
235
256
|
|
|
@@ -238,13 +259,13 @@ class AzureService(object):
|
|
|
238
259
|
column_datatype = column_datatype.split('(')[0]
|
|
239
260
|
|
|
240
261
|
if column_datatype.lower() == 'tinyint' or column_datatype.lower() == 'smallint' or column_datatype.lower() == 'int' or column_datatype.lower() == 'bigint' or column_datatype.lower() == 'decimal' or column_datatype.lower() == 'numeric' or column_datatype.lower() == 'smallmoney' or column_datatype.lower() == 'money' or column_datatype.lower() == 'float' or column_datatype.lower() == 'real':
|
|
241
|
-
|
|
262
|
+
|
|
242
263
|
aggregates += f", sum({column}) as sum_{column}"
|
|
243
264
|
|
|
244
265
|
elif column_datatype.lower() == 'char' or column_datatype.lower() == 'varchar' or column_datatype.lower() == 'varchar' or column_datatype.lower() == 'text' or column_datatype.lower() == 'nchar' or column_datatype.lower() == 'nvarchar' or column_datatype.lower() == 'nvarchar' or column_datatype.lower() == 'ntext' or column_datatype.lower() == 'binary' or column_datatype.lower() == 'varbinary' or column_datatype.lower() == 'varbinary' or column_datatype.lower() == 'datetime' or column_datatype.lower() == 'datetime2' or column_datatype.lower() == 'smalldatetime' or column_datatype.lower() == 'date' or column_datatype.lower() == 'time' or column_datatype.lower() == 'datetimeoffset' or column_datatype.lower() == 'timestamp':
|
|
245
|
-
|
|
266
|
+
|
|
246
267
|
aggregates += f", count(distinct lower({column})) as countdistinct_{column}"
|
|
247
|
-
|
|
268
|
+
|
|
248
269
|
elif column_datatype.lower() == 'bit':
|
|
249
270
|
|
|
250
271
|
aggregates += f", (SELECT CONCAT ((select count(*) as val FROM {object.schema}.{object.name} WHERE {column} = 1),'_',(select count(*) as val from {object.schema}.{object.name} WHERE {column} = 0))) AS aggregateboolean_{column}"
|
|
@@ -257,15 +278,22 @@ class AzureService(object):
|
|
|
257
278
|
|
|
258
279
|
return group_by_aggregation_pdf
|
|
259
280
|
|
|
260
|
-
def create_pandas_df(
|
|
261
|
-
|
|
281
|
+
def create_pandas_df(
|
|
282
|
+
self,
|
|
283
|
+
object : DatabaseObject,
|
|
284
|
+
intersection_columns_trgt_src: list,
|
|
285
|
+
where_clause:str="",
|
|
286
|
+
exclude_columns:list=[],
|
|
287
|
+
enclose_column_by_double_quotes: bool = False
|
|
288
|
+
) -> pd.DataFrame:
|
|
289
|
+
|
|
262
290
|
if self.azure_connection is None:
|
|
263
291
|
self._connect_to_azure()
|
|
264
|
-
|
|
292
|
+
|
|
265
293
|
intersection_columns_trgt_src_ = ', '.join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
|
|
266
294
|
|
|
267
295
|
df_query = f"select {intersection_columns_trgt_src_} from {object.schema}.{object.name} {where_clause};"
|
|
268
|
-
|
|
296
|
+
|
|
269
297
|
pdf = self.execute_queries(df_query,True)
|
|
270
298
|
|
|
271
299
|
return pdf
|
|
@@ -274,7 +302,7 @@ class AzureService(object):
|
|
|
274
302
|
def execute_queries(self, query: Union[str, List[str]],return_as_pdf:bool=False) -> Union[List[Dict], List[List[Dict]]]:
|
|
275
303
|
if self.azure_connection is None:
|
|
276
304
|
self._connect_to_azure()
|
|
277
|
-
|
|
305
|
+
|
|
278
306
|
query_list: List[str] = query if isinstance(query, list) else [query]
|
|
279
307
|
|
|
280
308
|
results = []
|
|
@@ -285,7 +313,7 @@ class AzureService(object):
|
|
|
285
313
|
query_result = pandas.io.sql.read_sql(single_query, self.azure_connection)
|
|
286
314
|
else:
|
|
287
315
|
query_result=self.azure_connection.execute(single_query)
|
|
288
|
-
|
|
316
|
+
|
|
289
317
|
results.append(query_result)
|
|
290
318
|
|
|
291
319
|
except Exception as err:
|
|
@@ -311,10 +339,8 @@ class AzureService(object):
|
|
|
311
339
|
for single_statement in statement_list:
|
|
312
340
|
stripped_statement = (
|
|
313
341
|
single_statement.strip()
|
|
314
|
-
)
|
|
342
|
+
)
|
|
315
343
|
_ = self.azure_connection.execute(stripped_statement)
|
|
316
344
|
|
|
317
345
|
except Exception as err:
|
|
318
346
|
raise Exception(self._get_error_message(err, single_statement)) from err
|
|
319
|
-
|
|
320
|
-
|
|
@@ -85,6 +85,7 @@ class DatabricksHiveMetastoreService(object):
|
|
|
85
85
|
numeric_columns: list,
|
|
86
86
|
numeric_scale: int,
|
|
87
87
|
where_exists: bool = True,
|
|
88
|
+
enclose_column_by_double_quotes: bool = False,
|
|
88
89
|
) -> str:
|
|
89
90
|
"""generates in_clause from list ready to expand the where clause, numeric values are rounded
|
|
90
91
|
|
|
@@ -110,7 +111,7 @@ class DatabricksHiveMetastoreService(object):
|
|
|
110
111
|
in_clause_cols = f" WHERE (("
|
|
111
112
|
for key in key_filters.keys():
|
|
112
113
|
if key in numeric_columns:
|
|
113
|
-
in_clause_cols += f"""ROUND({key.replace("'", "")},
|
|
114
|
+
in_clause_cols += f"""ROUND({key.replace("'", "")}, {numeric_scale})""" + ","
|
|
114
115
|
else:
|
|
115
116
|
in_clause_cols += key.replace("'", "") + ","
|
|
116
117
|
in_clause_cols = in_clause_cols[:-1] + ")"
|
|
@@ -118,7 +119,8 @@ class DatabricksHiveMetastoreService(object):
|
|
|
118
119
|
return in_clause
|
|
119
120
|
|
|
120
121
|
def _get_column_clause(
|
|
121
|
-
self, column_list: list, columns_datatype: list, numeric_scale, key_columns
|
|
122
|
+
self, column_list: list, columns_datatype: list, numeric_scale, key_columns,
|
|
123
|
+
enclose_column_by_double_quotes: bool = False
|
|
122
124
|
) -> dict:
|
|
123
125
|
"""turns list of desired columns into a sql compatible string
|
|
124
126
|
|
|
@@ -336,6 +338,7 @@ class DatabricksHiveMetastoreService(object):
|
|
|
336
338
|
column_intersections: list,
|
|
337
339
|
where_clause: str = "",
|
|
338
340
|
exclude_columns: list = [],
|
|
341
|
+
enclose_column_by_double_quotes: bool = False,
|
|
339
342
|
) -> dict:
|
|
340
343
|
"""get distinct count for every column in a database object that is in column intersections list
|
|
341
344
|
|
|
@@ -412,6 +415,7 @@ class DatabricksHiveMetastoreService(object):
|
|
|
412
415
|
where_clause: str = "",
|
|
413
416
|
exclude_columns: list = [],
|
|
414
417
|
numeric_scale: int = None,
|
|
418
|
+
enclose_column_by_double_quotes: bool = False,
|
|
415
419
|
) -> List[Dict]:
|
|
416
420
|
"""creates checksums for given object in compliance with given conditions
|
|
417
421
|
|
|
@@ -514,6 +518,7 @@ class DatabricksHiveMetastoreService(object):
|
|
|
514
518
|
where_clause: str,
|
|
515
519
|
exclude_columns: list,
|
|
516
520
|
numeric_scale: int = None,
|
|
521
|
+
enclose_column_by_double_quotes: bool = False,
|
|
517
522
|
) -> List[Dict]:
|
|
518
523
|
"""execution of multiple aggregations at once
|
|
519
524
|
|
|
@@ -660,7 +665,12 @@ class DatabricksHiveMetastoreService(object):
|
|
|
660
665
|
)
|
|
661
666
|
|
|
662
667
|
def create_pandas_df(
|
|
663
|
-
self,
|
|
668
|
+
self,
|
|
669
|
+
object: DatabaseObject,
|
|
670
|
+
intersection_columns_trgt_src: list,
|
|
671
|
+
where_clause:str="",
|
|
672
|
+
exclude_columns:list=[],
|
|
673
|
+
enclose_column_by_double_quotes: bool = False
|
|
664
674
|
) -> pd.DataFrame:
|
|
665
675
|
"""creates pandas dataframes with all data from given object in given columns
|
|
666
676
|
|
|
@@ -693,6 +703,7 @@ class DatabricksHiveMetastoreService(object):
|
|
|
693
703
|
dedicated_columns: list = [],
|
|
694
704
|
sample_count: int = 10,
|
|
695
705
|
numeric_scale: int = None,
|
|
706
|
+
enclose_column_by_double_quotes: bool = False,
|
|
696
707
|
) -> List[Dict]:
|
|
697
708
|
if self.databricks_connection is None:
|
|
698
709
|
self._connect_to_databricks()
|
|
@@ -742,7 +753,7 @@ class DatabricksHiveMetastoreService(object):
|
|
|
742
753
|
values = list(key_filters.values())
|
|
743
754
|
if values[0] != []:
|
|
744
755
|
in_clause = self._get_in_clause(
|
|
745
|
-
key_filters, numeric_columns, numeric_scale, where_exists
|
|
756
|
+
key_filters, numeric_columns, numeric_scale, where_exists, enclose_column_by_double_quotes
|
|
746
757
|
)
|
|
747
758
|
else:
|
|
748
759
|
in_clause = ""
|
|
@@ -750,13 +761,14 @@ class DatabricksHiveMetastoreService(object):
|
|
|
750
761
|
elif key_intersection != [] and not is_dedicated:
|
|
751
762
|
keys = str(key_intersection)[1:-1].replace("'", "")
|
|
752
763
|
column_clause, numeric_columns, used_columns = self._get_column_clause(
|
|
753
|
-
column_intersections, dict_colummns_datatype, numeric_scale, key_columns
|
|
764
|
+
column_intersections, dict_colummns_datatype, numeric_scale, key_columns,
|
|
765
|
+
enclose_column_by_double_quotes
|
|
754
766
|
)
|
|
755
767
|
if (key_filters != {}) & (filter_intersection != []):
|
|
756
768
|
values = list(key_filters.values())
|
|
757
769
|
if values[0] != []:
|
|
758
770
|
in_clause = self._get_in_clause(
|
|
759
|
-
key_filters, numeric_columns, numeric_scale, where_exists
|
|
771
|
+
key_filters, numeric_columns, numeric_scale, where_exists, enclose_column_by_double_quotes
|
|
760
772
|
)
|
|
761
773
|
else:
|
|
762
774
|
in_clause = ""
|
|
@@ -767,7 +779,8 @@ class DatabricksHiveMetastoreService(object):
|
|
|
767
779
|
)
|
|
768
780
|
column_intersections.sort()
|
|
769
781
|
column_clause, numeric_columns, used_columns = self._get_column_clause(
|
|
770
|
-
column_intersections, dict_colummns_datatype, numeric_scale, key_columns
|
|
782
|
+
column_intersections, dict_colummns_datatype, numeric_scale, key_columns,
|
|
783
|
+
enclose_column_by_double_quotes
|
|
771
784
|
)
|
|
772
785
|
sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause};"
|
|
773
786
|
|
|
@@ -85,6 +85,7 @@ class DatabricksUnityCatalogService(object):
|
|
|
85
85
|
numeric_columns: list,
|
|
86
86
|
numeric_scale: int,
|
|
87
87
|
where_exists: bool = True,
|
|
88
|
+
enclose_column_by_double_quotes: bool = False,
|
|
88
89
|
) -> str:
|
|
89
90
|
"""generates in_clause from list ready to expand the where clause, numeric values are rounded
|
|
90
91
|
|
|
@@ -110,7 +111,7 @@ class DatabricksUnityCatalogService(object):
|
|
|
110
111
|
in_clause_cols = f" WHERE (("
|
|
111
112
|
for key in key_filters.keys():
|
|
112
113
|
if key in numeric_columns:
|
|
113
|
-
in_clause_cols += f"""ROUND({key.replace("'", "")},
|
|
114
|
+
in_clause_cols += f"""ROUND({key.replace("'", "")}, {numeric_scale})""" + ","
|
|
114
115
|
else:
|
|
115
116
|
in_clause_cols += key.replace("'", "") + ","
|
|
116
117
|
in_clause_cols = in_clause_cols[:-1] + ")"
|
|
@@ -118,7 +119,8 @@ class DatabricksUnityCatalogService(object):
|
|
|
118
119
|
return in_clause
|
|
119
120
|
|
|
120
121
|
def _get_column_clause(
|
|
121
|
-
self, column_list: list, columns_datatype: list, numeric_scale, key_columns
|
|
122
|
+
self, column_list: list, columns_datatype: list, numeric_scale, key_columns,
|
|
123
|
+
enclose_column_by_double_quotes: bool = False
|
|
122
124
|
) -> dict:
|
|
123
125
|
"""turns list of desired columns into a sql compatible string
|
|
124
126
|
|
|
@@ -334,6 +336,7 @@ class DatabricksUnityCatalogService(object):
|
|
|
334
336
|
column_intersections: list,
|
|
335
337
|
where_clause: str = "",
|
|
336
338
|
exclude_columns: list = [],
|
|
339
|
+
enclose_column_by_double_quotes: bool = False,
|
|
337
340
|
) -> dict:
|
|
338
341
|
"""get distinct count for every column in a database object that is in column intersections list
|
|
339
342
|
|
|
@@ -410,6 +413,7 @@ class DatabricksUnityCatalogService(object):
|
|
|
410
413
|
where_clause: str = "",
|
|
411
414
|
exclude_columns: list = [],
|
|
412
415
|
numeric_scale: int = None,
|
|
416
|
+
enclose_column_by_double_quotes: bool = False,
|
|
413
417
|
) -> List[Dict]:
|
|
414
418
|
"""creates checksums for given object in compliance with given conditions
|
|
415
419
|
|
|
@@ -504,6 +508,7 @@ class DatabricksUnityCatalogService(object):
|
|
|
504
508
|
where_clause: str,
|
|
505
509
|
exclude_columns: list,
|
|
506
510
|
numeric_scale: int = None,
|
|
511
|
+
enclose_column_by_double_quotes: bool = False,
|
|
507
512
|
) -> List[Dict]:
|
|
508
513
|
"""execution of multiple aggregations at once
|
|
509
514
|
|
|
@@ -648,10 +653,11 @@ class DatabricksUnityCatalogService(object):
|
|
|
648
653
|
grouping_columns_final,
|
|
649
654
|
error_dict
|
|
650
655
|
)
|
|
651
|
-
|
|
656
|
+
|
|
652
657
|
|
|
653
658
|
def create_pandas_df(
|
|
654
|
-
self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause:str="", exclude_columns:list=[]
|
|
659
|
+
self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause:str="", exclude_columns:list=[],
|
|
660
|
+
enclose_column_by_double_quotes: bool = False
|
|
655
661
|
) -> pd.DataFrame:
|
|
656
662
|
"""creates pandas dataframes with all data from given object in given columns
|
|
657
663
|
|
|
@@ -684,6 +690,7 @@ class DatabricksUnityCatalogService(object):
|
|
|
684
690
|
dedicated_columns: list = [],
|
|
685
691
|
sample_count: int = 10,
|
|
686
692
|
numeric_scale: int = None,
|
|
693
|
+
enclose_column_by_double_quotes: bool = False,
|
|
687
694
|
) -> List[Dict]:
|
|
688
695
|
if self.databricks_connection is None:
|
|
689
696
|
self._connect_to_databricks()
|
|
@@ -728,12 +735,13 @@ class DatabricksUnityCatalogService(object):
|
|
|
728
735
|
dict_colummns_datatype,
|
|
729
736
|
numeric_scale,
|
|
730
737
|
key_columns,
|
|
738
|
+
enclose_column_by_double_quotes
|
|
731
739
|
)
|
|
732
740
|
if (key_filters != {}) & (filter_intersection != []):
|
|
733
741
|
values = list(key_filters.values())
|
|
734
742
|
if values[0] != []:
|
|
735
743
|
in_clause = self._get_in_clause(
|
|
736
|
-
key_filters, numeric_columns, numeric_scale, where_exists
|
|
744
|
+
key_filters, numeric_columns, numeric_scale, where_exists, enclose_column_by_double_quotes
|
|
737
745
|
)
|
|
738
746
|
else:
|
|
739
747
|
in_clause = ""
|
|
@@ -741,13 +749,14 @@ class DatabricksUnityCatalogService(object):
|
|
|
741
749
|
elif key_intersection != [] and not is_dedicated:
|
|
742
750
|
keys = str(key_intersection)[1:-1].replace("'", "")
|
|
743
751
|
column_clause, numeric_columns, used_columns = self._get_column_clause(
|
|
744
|
-
column_intersections, dict_colummns_datatype, numeric_scale, key_columns
|
|
752
|
+
column_intersections, dict_colummns_datatype, numeric_scale, key_columns,
|
|
753
|
+
enclose_column_by_double_quotes
|
|
745
754
|
)
|
|
746
755
|
if (key_filters != {}) & (filter_intersection != []):
|
|
747
756
|
values = list(key_filters.values())
|
|
748
757
|
if values[0] != []:
|
|
749
758
|
in_clause = self._get_in_clause(
|
|
750
|
-
key_filters, numeric_columns, numeric_scale, where_exists
|
|
759
|
+
key_filters, numeric_columns, numeric_scale, where_exists, enclose_column_by_double_quotes
|
|
751
760
|
)
|
|
752
761
|
else:
|
|
753
762
|
in_clause = ""
|
|
@@ -758,7 +767,8 @@ class DatabricksUnityCatalogService(object):
|
|
|
758
767
|
)
|
|
759
768
|
column_intersections.sort()
|
|
760
769
|
column_clause, numeric_columns, used_columns = self._get_column_clause(
|
|
761
|
-
column_intersections, dict_colummns_datatype, numeric_scale, key_columns
|
|
770
|
+
column_intersections, dict_colummns_datatype, numeric_scale, key_columns,
|
|
771
|
+
enclose_column_by_double_quotes
|
|
762
772
|
)
|
|
763
773
|
sample_query = f"SELECT {column_clause} FROM {object.database}.{object.schema}.{object.name} TABLESAMPLE ({sample_count} ROWS) {where_clause};"
|
|
764
774
|
|
|
@@ -1095,7 +1105,7 @@ class DatabricksUnityCatalogService(object):
|
|
|
1095
1105
|
src_filter = object_result['SRC_FILTER']
|
|
1096
1106
|
trgt_filter = object_result['TRGT_FILTER']
|
|
1097
1107
|
excluded_columns = object_result['EXCLUDED_COLUMNS']
|
|
1098
|
-
columns_equal = object_result['COLUMNS_EQUAL']
|
|
1108
|
+
columns_equal = object_result['COLUMNS_EQUAL']
|
|
1099
1109
|
column_intersection = str(object_result['COLUMN_INTERSECTION'])
|
|
1100
1110
|
src_columns_minus_trgt_columns = object_result['SRC_COLUMNS_MINUS_TRGT_COLUMNS']
|
|
1101
1111
|
trgt_columns_minus_src_columns = object_result['TRGT_COLUMNS_MINUS_SRC_COLUMNS']
|
|
@@ -1262,7 +1272,7 @@ class DatabricksUnityCatalogService(object):
|
|
|
1262
1272
|
|
|
1263
1273
|
self.execute_statement(statement)
|
|
1264
1274
|
|
|
1265
|
-
|
|
1275
|
+
|
|
1266
1276
|
|
|
1267
1277
|
# extract the information needed for the table on object level
|
|
1268
1278
|
for object_result in results['OBJECTS']:
|
|
@@ -1375,5 +1385,3 @@ class DatabricksUnityCatalogService(object):
|
|
|
1375
1385
|
;"""
|
|
1376
1386
|
|
|
1377
1387
|
self.execute_statement(insert_statement)
|
|
1378
|
-
|
|
1379
|
-
|