icsDataValidation-1.0.358-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. icsDataValidation/configuration.py +19 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +19 -0
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +28 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +17 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +26 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +35 -0
  8. icsDataValidation/connection_setups/teradata_connection_setup.py +18 -0
  9. icsDataValidation/core/__init__.py +0 -0
  10. icsDataValidation/core/database_objects.py +18 -0
  11. icsDataValidation/core/object_comparison.py +239 -0
  12. icsDataValidation/input_parameters/__init__.py +0 -0
  13. icsDataValidation/input_parameters/testing_tool_params.py +81 -0
  14. icsDataValidation/main.py +250 -0
  15. icsDataValidation/output_parameters/__init__.py +0 -0
  16. icsDataValidation/output_parameters/result_params.py +94 -0
  17. icsDataValidation/services/__init__.py +0 -0
  18. icsDataValidation/services/comparison_service.py +582 -0
  19. icsDataValidation/services/database_services/__init__.py +0 -0
  20. icsDataValidation/services/database_services/azure_service.py +320 -0
  21. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +1694 -0
  22. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +1379 -0
  23. icsDataValidation/services/database_services/exasol_service.py +261 -0
  24. icsDataValidation/services/database_services/oracle_service.py +713 -0
  25. icsDataValidation/services/database_services/snowflake_service.py +1100 -0
  26. icsDataValidation/services/database_services/teradata_service.py +665 -0
  27. icsDataValidation/services/initialization_service.py +103 -0
  28. icsDataValidation/services/result_service.py +573 -0
  29. icsDataValidation/services/system_service.py +61 -0
  30. icsDataValidation/services/testset_service.py +257 -0
  31. icsDataValidation/utils/__init__.py +0 -0
  32. icsDataValidation/utils/file_util.py +96 -0
  33. icsDataValidation/utils/logger_util.py +96 -0
  34. icsDataValidation/utils/pandas_util.py +159 -0
  35. icsDataValidation/utils/parallelization_util.py +52 -0
  36. icsDataValidation/utils/sql_util.py +14 -0
  37. icsDataValidation-1.0.358.dist-info/METADATA +21 -0
  38. icsDataValidation-1.0.358.dist-info/RECORD +40 -0
  39. icsDataValidation-1.0.358.dist-info/WHEEL +5 -0
  40. icsDataValidation-1.0.358.dist-info/top_level.txt +1 -0
@@ -0,0 +1,665 @@
+ import teradatasql
+ import pandas as pd
+ import logging
+
+ from typing import Union, List, Dict
+
+ from icsDataValidation.utils.logger_util import configure_dev_ops_logger
+ from icsDataValidation.core.database_objects import DatabaseObject
+ #########################################################################################
+ #########################################################################################
+
+ # Configure Dev Ops Logger
+
+ logger = logging.getLogger('Teradata_Service')
+ logger.setLevel(logging.INFO)
+ configure_dev_ops_logger(logger)
+
+
+ class TeradataService:
+     def __init__(self, connection_params: dict):
+         self.connection_params = connection_params
+         self.teradata_connection = None
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exception_type, exception_value, traceback):
+         if self.teradata_connection is not None:
+             self.teradata_connection.close()
+
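A minimal usage sketch of the context manager (connection values below are placeholders, not part of the package):

    from icsDataValidation.services.database_services.teradata_service import TeradataService

    connection_params = {
        "host": "td.example.internal",  # placeholder
        "user": "validation_user",      # placeholder
        "password": "***",
        "dbs_port": 1025,               # placeholder
    }

    # __exit__ closes the connection even if a validation step raises.
    with TeradataService(connection_params) as teradata_service:
        database_objects = teradata_service.get_database_objects("TD_PROD", schema="SALES")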
+     # def __del__(self):
+     #     if self.teradata_connection is not None:
+     #         self.teradata_connection.close()
+
+     def _connect_to_teradata(self):
+         self.teradata_connection = teradatasql.connect(
+             host=self.connection_params['host'],
+             user=self.connection_params['user'],
+             password=self.connection_params['password'],
+             dbs_port=self.connection_params['dbs_port'],
+         )
+         return self.teradata_connection
+
+     @staticmethod
+     def _get_error_message(exception: Exception, statement: str) -> str:
+         """
+         Compose an error message if the execution of a statement or query fails.
+         """
+         if hasattr(exception, "raw_msg"):
+             message = exception.raw_msg.replace("\n", " ")
+         else:
+             # this makes sure that all kinds of errors can have a message,
+             # even if they do not have a raw_msg attribute
+             message = str(exception)
+         if hasattr(exception, "sfqid"):
+             message = message + f"\nQuery ID: {exception.sfqid}"
+         return f"Teradata ERROR: {message}\nFailed statement:\n{statement}"
+
+     def get_database_objects(self, database: str, schema: str=None, object_type_restriction: str='include_all') -> list:
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         all_database_tables = []
+         all_database_views = []
+         if object_type_restriction == 'include_all' or object_type_restriction == 'include_only_tables':
+             if schema:
+                 query_db_tables = f"SELECT DataBaseName as schema_name, TableName as table_name FROM dbc.TablesV WHERE TableKind in ('O', 'T') and DatabaseName = '{schema}';"
+             else:
+                 query_db_tables = "SELECT DataBaseName as schema_name, TableName as table_name FROM dbc.TablesV WHERE TableKind in ('O', 'T');"
+
+             all_database_tables = self.execute_queries(query_db_tables)
+
+         if object_type_restriction == 'include_all' or object_type_restriction == 'include_only_views':
+             if schema:
+                 query_db_views = f"SELECT DataBaseName as schema_name, TableName as table_name FROM dbc.TablesV WHERE TableKind in ('V') and DatabaseName = '{schema}';"
+             else:
+                 query_db_views = "SELECT DataBaseName as schema_name, TableName as table_name FROM dbc.TablesV WHERE TableKind in ('V');"
+
+             all_database_views = self.execute_queries(query_db_views)
+
+         database_objects = []
+         for row in all_database_tables:
+             table_identifier = f'{database}.{row[0].upper()}.{row[1].upper()}'
+             database_objects.append({"object_identifier": table_identifier, "object_type": "table"})
+         for row in all_database_views:
+             view_identifier = f'{database}.{row[0].upper()}.{row[1].upper()}'
+             database_objects.append({"object_identifier": view_identifier, "object_type": "view"})
+         return database_objects
+
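For illustration, the returned list pairs fully qualified identifiers with an object type (identifiers here are hypothetical):

    [
        {"object_identifier": "TD_PROD.SALES.ORDERS", "object_type": "table"},
        {"object_identifier": "TD_PROD.SALES.V_ORDERS", "object_type": "view"},
    ]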
+     def get_columns_from_object(self, object: DatabaseObject) -> list:
+
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         queries_get_columns = [f"SELECT ColumnName FROM dbc.COLUMNSV WHERE DatabaseName = '{object.schema}' AND TableName = '{object.name}';"]
+
+         all_columns = self.execute_queries(queries_get_columns)[0]
+
+         columns = []
+
+         for row in all_columns:
+             columns.append(row[0].strip())
+
+         return columns
+
+     def get_row_count_from_object(self, object: DatabaseObject, where_clause: str="") -> tuple:
+
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         query_get_row_count = f"SELECT COUNT(*) AS ROW_COUNT FROM {object.schema}.{object.name} {where_clause};"
+         row_count = -1
+         error_list = []
+
+         try:
+             row_count = self.execute_queries(query_get_row_count).fetchall()[0][0]
+
+         except Exception as err:
+             error_list.append(str(err))
+             error_list.append(query_get_row_count)
+
+         return row_count, error_list
+
+     def get_data_types_from_object(self, object: DatabaseObject, column_intersections: list) -> List[Dict]:
+
+         results = []
+
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         column_intersections = str(column_intersections)[1:-1]
+         if object.type == 'table':
+             if column_intersections == '':
+                 column_intersections = "''"
+             query_get_data_types_from_table = f"SELECT COLUMNNAME, COLUMNTYPE FROM DBC.COLUMNSV WHERE DATABASENAME = '{object.schema}' AND TableName = '{object.name}' AND ColumnName IN ({column_intersections});"
+             dict_colummns_datatype = self.execute_queries(query_get_data_types_from_table).fetchall()
+
+         elif object.type == 'view':
+             # TODO: the filter on column_intersections is missing here, and the result
+             # has to be restricted to column name and type
+             query_get_data_types_from_table = f"HELP COLUMN {object.schema}.{object.name}.*"
+             dict_colummns_datatype = self.execute_queries(query_get_data_types_from_table).fetchall()
+
+         for row in dict_colummns_datatype:
+             row_to_list = [elem.strip() for elem in row]
+             results.append({"COLUMN_NAME": row_to_list[0], "DATA_TYPE": row_to_list[1]})
+
+         return results
+
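DBC.COLUMNSV reports Teradata type codes rather than SQL type names, so the result looks like this (hypothetical columns; code meanings per Teradata's data dictionary):

    [
        {"COLUMN_NAME": "ORDER_ID", "DATA_TYPE": "I"},   # INTEGER
        {"COLUMN_NAME": "ORDER_DT", "DATA_TYPE": "DA"},  # DATE
        {"COLUMN_NAME": "NOTE_TXT", "DATA_TYPE": "CV"},  # VARCHAR
    ]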
+     def get_count_distincts_from_object(self, object: DatabaseObject, column_intersections: list, where_clause: str="", exclude_columns: list=[]) -> tuple:
+
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         unions = " UNION ".join(
+             f"SELECT CAST('{column}' AS VARCHAR(500)) AS COLUMN_NAME, COUNT(DISTINCT {column}) AS COUNT_DISTINCT FROM {object.schema}.{object.name} {where_clause}"
+             for column in column_intersections
+             if column not in exclude_columns
+         )
+
+         query_get_count_distincts_from_object = f"{unions} ORDER BY 2;"
+         error_list = []
+         dict_count_distincts = []
+
+         try:
+             count_distincts = self.execute_queries(query_get_count_distincts_from_object).fetchall()
+             for result in count_distincts:
+                 single_dict = {
+                     'COLUMN_NAME': result[0],
+                     'COUNT_DISTINCT': result[1]
+                 }
+                 dict_count_distincts.append(single_dict)
+
+         except Exception as err:
+             error_list.append(["ERROR", str(err).split('|||')[0], str(err).split('|||')[1]])
+         return dict_count_distincts, error_list
+
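The method builds a single UNION query over all candidate columns and returns one (column, distinct count) pair per result row. A sketch with hypothetical names:

    # Generated SQL, schematically:
    #   SELECT CAST('ORDER_ID' AS VARCHAR(500)) AS COLUMN_NAME,
    #          COUNT(DISTINCT ORDER_ID) AS COUNT_DISTINCT FROM SALES.ORDERS
    #   UNION SELECT CAST('NOTE_TXT' AS VARCHAR(500)) AS COLUMN_NAME,
    #          COUNT(DISTINCT NOTE_TXT) AS COUNT_DISTINCT FROM SALES.ORDERS
    #   ORDER BY 2;
    dict_count_distincts, error_list = teradata_service.get_count_distincts_from_object(
        orders_object,              # hypothetical DatabaseObject for SALES.ORDERS
        ["ORDER_ID", "NOTE_TXT"],
    )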
+     def get_table_size(self, object: DatabaseObject) -> int:
+
+         query_get_table_size = f"select SUM(CURRENTPERM) FROM DBC.TABLESIZE WHERE DatabaseName = '{object.schema}' AND tablename = '{object.name}';"
+
+         size = self.execute_queries(query_get_table_size).fetchall()[0][0]
+
+         return size
+
+     def create_checksums(self, object: DatabaseObject, column_intersections: list, where_clause: str="", exclude_columns: list=[]) -> dict:
+
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)
+
+         aggregates = ""
+         count_nulls = ""
+
+         for column in column_intersections:
+             if column not in exclude_columns:
+                 column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+                 column_datatype = column_datatype.split('(')[0]
+
+                 count_nulls += f", sum(case when {column} is null then 1 else 0 end) as countnulls_{column}"
+
+                 if column_datatype.lower() in ('i8', 'i1', 'i', 'i2'):
+                     aggregates += f", sum(cast ({column} as decimal(30,0))) as SUM_{column}"
+                 elif column_datatype.lower() in ('bf', 'bv', 'd', 'f', 'dy', 'dh', 'dm', 'ds', 'hr', 'hs', 'mi', 'ms', 'mo', 'sc', 'yr', 'ym', 'n'):
+                     aggregates += f", sum({column}) as SUM_{column}"
+                 elif column_datatype.lower() in ('cv', 'cf', 'co', 'da', 'pd', 'pt', 'pz', 'pm', 'at', 'ts', 'tz', 'sz'):
+                     aggregates += f", count(distinct {column}) as countdistinct_{column}"
+                 elif column_datatype.lower() == 'i1' and 1 == 0:  # disabled branch ('1 == 0' is never true)
+                     aggregates += f", (SELECT CONCAT ((select trim(count(*)) as val FROM {object.schema}.{object.name} WHERE {column} = 1),'_',(select trim(count(*)) as val from {object.schema}.{object.name} WHERE {column} = 0))) AS aggregateboolean_{column}"
+                 # else: additional data types: ++ TD_ANYTYPE, a1 ARRAY, AN ARRAY, bo BINARY LARGE OBJECT, us USER-DEFINED TYPE (all types), xm XML
+
+         query_checksums = f"select {aggregates[1:]} from {object.schema}.{object.name} {where_clause};"
+
+         query_countnulls = f"select {count_nulls[1:]} from {object.schema}.{object.name} {where_clause};"
+
+         error_list = []
+         test_list = []
+         aggregation_columns = []
+
+         try:
+             aggregation_cursor = self.execute_queries(query_checksums)
+             aggregation_columns = [column[0].upper() for column in aggregation_cursor.description]
+             aggregation_results = aggregation_cursor.fetchall()[0]
+
+             countnulls_cursor = self.execute_queries(query_countnulls)
+             countnulls_results = countnulls_cursor.fetchall()[0]
+
+             for i in range(0, len(aggregation_results)):
+
+                 if aggregation_results[i] is None:
+                     agg_result = 0
+                 else:
+                     agg_result = aggregation_results[i]
+
+                 if countnulls_results[i] is None:
+                     cnt_result = 0
+                 else:
+                     cnt_result = countnulls_results[i]
+
+                 test_list.append([[item.split("_", 1)[0] for item in aggregation_columns][i], agg_result, cnt_result])
+
+         except Exception as err:
+             error_list.append(["ERROR", str(err).split('|||')[0], str(err).split('|||')[1]])
+         checksums = dict(zip([item.split("_", 1)[1] for item in aggregation_columns], test_list))
+         checksums['TESTATM_ERRORS'] = error_list
+
+         return checksums
+
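The returned mapping is keyed by column name; each value lists the aggregate kind (the alias prefix), the aggregate value, and the NULL count for that column. Hypothetical figures:

    {
        "ORDER_ID": ["SUM", 4711, 0],          # sum checksum, 0 NULLs
        "NOTE_TXT": ["COUNTDISTINCT", 42, 3],  # distinct count, 3 NULLs
        "TESTATM_ERRORS": [],                  # populated if a query failed
    }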
+     def create_pandas_df_from_group_by(self, object: DatabaseObject, column_intersections: list, group_by_columns: list, group_by_aggregation_columns: list, group_by_aggregation_type: str, only_numeric: bool, where_clause: str, exclude_columns: list, numeric_scale: int=None) -> tuple:
+
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         if group_by_aggregation_columns == ["all"]:
+             aggregation_columns = [f"{column.upper()}" for column in column_intersections if (column not in group_by_columns and column not in exclude_columns)]
+         else:
+             aggregation_columns = [f"{column.upper()}" for column in column_intersections if (column in group_by_aggregation_columns and column not in exclude_columns)]
+
+         dict_colummns_datatype_grouping = self.get_data_types_from_object(object, group_by_columns)
+
+         group_by_query_columns_string = " "
+         group_by_query_aggregation_string = ""
+         grouping_columns_final = []
+         error_dict = {}
+         try:
+             for column in group_by_columns:
+                 column_datatype_grouping = next(x for x in dict_colummns_datatype_grouping if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+                 column_datatype_grouping = column_datatype_grouping.split('(')[0]
+                 if column in column_intersections and column not in exclude_columns:
+
+                     if column_datatype_grouping.lower() in ('cv', 'cf', 'co'):
+                         group_by_query_columns_string += f"TRIM({column}) AS {column} ,"
+                     else:
+                         group_by_query_columns_string += f"{column} ,"
+                     grouping_columns_final.append(column)
+
+             group_by_query_columns_string = group_by_query_columns_string[:-1]
+
+             dict_colummns_datatype = self.get_data_types_from_object(object, aggregation_columns)
+
+             aggregates = ""
+             aggregates_min = ""
+
+             for column in aggregation_columns:
+                 column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+                 column_datatype = column_datatype.split('(')[0]
+
+                 if column_datatype.lower() in ('i8', 'i1', 'i', 'i2'):
+
+                     if not numeric_scale:
+                         aggregates += f", sum(cast ({column} as decimal(30,0))) as sum_{column}"
+                     else:
+                         aggregates += f", CASE WHEN TRIM(TO_CHAR(CAST(ROUND(sum(cast ({column} as decimal(30,0))), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND(sum(cast ({column} as decimal(30,0))), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) ELSE TRIM(TO_CHAR(CAST(ROUND(sum(cast ({column} as decimal(30,0))), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) END as SUM_{column}"
+                         aggregates_min += f", CASE WHEN TRIM(TO_CHAR(CAST(ROUND(min({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND(min({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) ELSE TRIM(TO_CHAR(CAST(ROUND(min({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) END as MIN_{column}, CASE WHEN TRIM(TO_CHAR(CAST(ROUND(max({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND(max({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) ELSE TRIM(TO_CHAR(CAST(ROUND(max({column}), {numeric_scale}) as decimal(38,{numeric_scale})), '999999999999999999.{'0'*numeric_scale}')) END as MAX_{column}"
+
+                 elif column_datatype.lower() in ('bf', 'bv', 'd', 'f', 'dy', 'dh', 'dm', 'ds', 'hr', 'hs', 'mi', 'ms', 'mo', 'sc', 'yr', 'ym', 'n'):
+                     if not numeric_scale:
+                         aggregates += f", sum(({column} )) as sum_{column}"
+                     else:
+                         aggregates += f", CASE WHEN TRIM(TO_CHAR(CAST(ROUND(sum({column}), 4) as decimal(38,4)), '999999999999999999.0000')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND(sum({column}), 4) as decimal(38,4)), '999999999999999999.0000')) ELSE TRIM(TO_CHAR(CAST(ROUND(sum({column}), 4) as decimal(38,4)), '999999999999999999.0000')) END as SUM_{column}"
+                         aggregates_min += f", CASE WHEN TRIM(TO_CHAR(CAST(ROUND(min({column}), 4) as decimal(38,4)), '999999999999999999.0000')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND(min({column}), 4) as decimal(38,4)), '999999999999999999.0000')) ELSE TRIM(TO_CHAR(CAST(ROUND(min({column}), 4) as decimal(38,4)), '999999999999999999.0000')) END as MIN_{column}, CASE WHEN TRIM(TO_CHAR(CAST(ROUND(max({column}), 4) as decimal(38,4)), '999999999999999999.0000')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND(max({column}), 4) as decimal(38,4)), '999999999999999999.0000')) ELSE TRIM(TO_CHAR(CAST(ROUND(max({column}), 4) as decimal(38,4)), '999999999999999999.0000')) END as MAX_{column}"
+
+                 elif not only_numeric and column_datatype.lower() in ('da', 'pd', 'pt', 'pz', 'pm', 'at', 'ts', 'tz', 'sz'):
+
+                     aggregates += f", count(distinct {column}) as COUNTDISTINCT_{column}"
+                     aggregates_min += f", min({column}) as MIN_{column}, max({column}) as MAX_{column}"
+
+                 elif not only_numeric and column_datatype.lower() in ('cv', 'cf', 'co'):
+
+                     aggregates += f", count(distinct {column}) as COUNTDISTINCT_{column}"
+                     aggregates_min += f", min(TRIM({column})) as MIN_{column}, max(TRIM({column})) as MAX_{column}"
+
+                 elif not only_numeric and column_datatype.lower() == 'i1' and 1 == 0:  # disabled branch ('1 == 0' is never true)
+
+                     aggregates += f", (SELECT CONCAT ((select trim(count(*)) as val FROM {object.schema}.{object.name} WHERE {column} = 1),'_',(select trim(count(*)) as val from {object.schema}.{object.name} WHERE {column} = 0))) AS AGGREGATEBOOLEAN_{column}"
+
+                 # else: additional data types: ++ TD_ANYTYPE, a1 ARRAY, AN ARRAY, bo BINARY LARGE OBJECT, us USER-DEFINED TYPE (all types), xm XML
+
+             # CASE 1: min_max
+             if group_by_aggregation_type == "only_min_max":
+                 group_by_query_aggregation_string = aggregates_min
+
+             # CASE 2: sum, count_distinct, aggregate_boolean
+             elif group_by_aggregation_type == "various":
+                 group_by_query_aggregation_string = aggregates
+
+             # CASE 3: sum, count_distinct, aggregate_boolean, min_max
+             elif group_by_aggregation_type == "various_and_min_max":
+                 group_by_query_aggregation_string = f"{aggregates_min}{aggregates}"
+
+             query_group_by_aggregation = f"select {group_by_query_columns_string}, count(*) as COUNT_OF_GROUP_BY_VALUE {group_by_query_aggregation_string} from {object.schema}.{object.name} {where_clause} GROUP BY {group_by_query_columns_string} order by {group_by_query_columns_string};"
+
+             group_by_aggregation_pdf = self.execute_queries(query_group_by_aggregation, True)
+
+         except Exception as err:
+             group_by_aggregation_pdf = pd.DataFrame()
+             group_by_aggregation_pdf["TESTATM_ERROR"] = [1]
+             if not grouping_columns_final:
+                 error_dict = {
+                     "QUERY": "No group-by columns found in the column intersection. Please check whether the configured group-by columns exist in the table.",
+                     "ERROR": "No group-by columns found in the column intersection. Please check whether the configured group-by columns exist in the table."
+                 }
+                 group_by_query_aggregation_string = ""
+             elif '|||' in str(err):
+                 error_dict = {
+                     "QUERY": str(err).split('|||')[0],
+                     "ERROR": str(err).split('|||')[1]
+                 }
+             else:
+                 error_dict = {
+                     "QUERY": "No query generated. Please check whether the configured grouping columns exist in the table.",
+                     "ERROR": str(err)
+                 }
+
+         return group_by_aggregation_pdf, group_by_query_aggregation_string, group_by_query_columns_string, grouping_columns_final, error_dict
+
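A hedged usage sketch; all argument values are illustrative and orders_object stands for a DatabaseObject built elsewhere:

    pdf, agg_sql, group_cols_sql, group_cols, error_dict = teradata_service.create_pandas_df_from_group_by(
        orders_object,                         # hypothetical DatabaseObject
        column_intersections=["REGION", "AMOUNT"],
        group_by_columns=["REGION"],
        group_by_aggregation_columns=["all"],  # aggregate every non-grouping column
        group_by_aggregation_type="various",   # sum / count distinct depending on data type
        only_numeric=False,
        where_clause="",
        exclude_columns=[],
    )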
+     def create_pandas_df(self, object: DatabaseObject, intersection_columns_trgt_src: list, where_clause: str="", exclude_columns: list=[]) -> pd.DataFrame:
+
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         intersection_columns_trgt_src_ = ', '.join(sorted(set(intersection_columns_trgt_src) - set(exclude_columns)))
+
+         df_query = f"select {intersection_columns_trgt_src_} from {object.schema}.{object.name} {where_clause};"
+
+         src_pdf = self.execute_queries(df_query, True)
+
+         return src_pdf
+
+     def create_pandas_df_from_sample(self, object: DatabaseObject, column_intersections: list, key_columns: list, where_clause: str="", exclude_columns: list=[], key_filters: dict={}, dedicated_columns: list=[], sample_count: int=10) -> tuple:
+
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         sample_count = str(sample_count)
+         key_intersection = list((set(column_intersections) & set(key_columns)) - set(exclude_columns))
+         filter_intersection = list((set(column_intersections) & set(key_filters.keys())) - set(exclude_columns))
+         dedicated_intersection = list((set(column_intersections) & set(dedicated_columns)) - set(exclude_columns))
+
+         key_intersection.sort()
+         filter_intersection.sort()
+         dedicated_intersection.sort()
+
+         if dedicated_intersection != []:
+             is_dedicated = True
+             dict_colummns_datatype = self.get_data_types_from_object(object, dedicated_intersection)
+             # datatype_query = f"""select column_name, data_type, ordinal_position
+             #     from {object.database}.information_schema.columns
+             #     where table_schema = '{object.schema}'
+             #     and table_name = '{object.name}'
+             #     and data_type not like 'TIMESTAMP%'
+             #     and data_type != 'DATE'
+             #     order by ordinal_position
+             #     ;"""
+         else:
+             is_dedicated = False
+             dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)
+
+         if key_intersection != [] and is_dedicated:
+             column_intersecions_new = []
+             used_columns = []
+             numeric_columns = []
+             for column in dedicated_intersection:
+                 column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+                 column_datatype = column_datatype.split('(')[0]
+
+                 if column_datatype.lower() in ('i8', 'i1', 'i', 'i2'):
+                     column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
+                     used_columns.append(column)
+                     numeric_columns.append(column)
+
+                 elif column_datatype.lower() in ('bf', 'bv', 'd', 'f', 'dy', 'dh', 'dm', 'ds', 'hr', 'hs', 'mi', 'ms', 'mo', 'sc', 'yr', 'ym', 'n'):
+                     column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
+                     used_columns.append(column)
+                     numeric_columns.append(column)
+
+                 elif column_datatype.lower() in ('cv', 'cf', 'co'):
+                     column_intersecions_new.append(f'TRIM({column}) AS {column}')
+                     used_columns.append(column)
+                 else:
+                     column_intersecions_new.append(column)
+                     used_columns.append(column)
+
+             column_intersections = column_intersecions_new.copy()
+             columns = ", ".join(column_intersections)
+             keys = str(key_intersection)[1:-1].replace("'", "")
+
+             ##
+             ## Filter from Sample Logic
+             if key_filters == {}:
+                 sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
+             else:
+                 if filter_intersection != []:
+                     values = list(key_filters.values())
+                     if values[0] == []:
+                         sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
+                     else:
+                         where_clause = f'{where_clause} AND (('
+                         logger.debug(key_filters)
+                         for j in range(len(values[0])):
+                             for key in key_filters.keys():
+                                 if key == 'TECH_ID' or key in numeric_columns:
+                                     where_clause += f" CAST(ROUND({key}, 2) as decimal(38,2)) = {str(key_filters[key][j])} AND"
+                                 else:
+                                     where_clause += f" {key} = '{str(key_filters[key][j])}' AND"
+                             where_clause = f" {where_clause[:-3]}) OR ("
+                         where_clause = f"{where_clause[:-4]})"
+
+                         sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
+                 else:
+                     sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
+
+         elif key_intersection != [] and not is_dedicated:
+             column_intersecions_new = []
+             used_columns = []
+             numeric_columns = []
+             column_intersections = list(set(column_intersections) - set(exclude_columns))
+             column_intersections.sort()
+             for column in column_intersections:
+                 column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+                 column_datatype = column_datatype.split('(')[0]
+
+                 if column_datatype.lower() in ('i8', 'i1', 'i', 'i2'):
+                     # TODO FFR - negative case
+                     column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
+                     used_columns.append(column)
+                     numeric_columns.append(column)
+
+                 elif column_datatype.lower() in ('bf', 'bv', 'd', 'f', 'dy', 'dh', 'dm', 'ds', 'hr', 'hs', 'mi', 'ms', 'mo', 'sc', 'yr', 'ym', 'n'):
+                     column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
+                     used_columns.append(column)
+                     numeric_columns.append(column)
+
+                 elif column_datatype.lower() in ('cv', 'cf', 'co'):
+                     column_intersecions_new.append(f'TRIM({column}) AS {column}')
+                     used_columns.append(column)
+                 else:
+                     column_intersecions_new.append(column)
+                     used_columns.append(column)
+
+             column_intersections = column_intersecions_new.copy()
+             columns = ", ".join(column_intersections)
+             keys = str(key_intersection)[1:-1].replace("'", "")
+
+             if key_filters == {}:
+                 sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
+             else:
+                 if filter_intersection != []:
+                     values = list(key_filters.values())
+
+                     # in_clause = "(('"
+                     # for j in range(len(values[0])):
+                     #     for value in values:
+                     #         in_clause += str(value[j]) + "','"
+                     #     in_clause = in_clause[:-2] + "),('"
+                     # in_clause = in_clause[:-3] + ')'
+
+                     # where_clause = "WHERE ("
+                     # for key in key_filters.keys():
+                     #     where_clause += key.replace("'", "") + ","
+                     # where_clause = where_clause[:-1] + ")"
+                     # where_clause += " in " + in_clause
+                     if values[0] == []:
+                         sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
+                     else:
+                         where_clause = f'{where_clause} AND (('
+                         logger.debug(key_filters)
+                         for j in range(len(values[0])):
+                             for key in key_filters.keys():
+                                 if key in numeric_columns:
+                                     where_clause += f" {key} = {str(key_filters[key][j])} AND"
+                                 else:
+                                     where_clause += f" {key} = '{str(key_filters[key][j])}' AND"
+                             where_clause = f" {where_clause[:-3]}) OR ("
+                         where_clause = f"{where_clause[:-4]})"
+
+                         sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
+                 else:
+                     sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count} {where_clause} ORDER BY {keys};"
+
+         else:
+             column_intersecions_new = []
+             used_columns = []
+             numeric_columns = []
+             column_intersections = list(set(column_intersections) - set(exclude_columns))
+             column_intersections.sort()
+             for column in column_intersections:
+                 logger.debug("COLUMN: " + column)
+                 logger.debug(dict_colummns_datatype)
+                 column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+                 column_datatype = column_datatype.split('(')[0]
+
+                 if column_datatype.lower() in ('i8', 'i1', 'i', 'i2'):
+                     column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
+                     used_columns.append(column)
+                     numeric_columns.append(column)
+
+                 elif column_datatype.lower() in ('bf', 'bv', 'd', 'f', 'dy', 'dh', 'dm', 'ds', 'hr', 'hs', 'mi', 'ms', 'mo', 'sc', 'yr', 'ym', 'n'):
+                     column_intersecions_new.append(f"CASE WHEN TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) like '.%' THEN '0' || TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) ELSE TRIM(TO_CHAR(CAST(ROUND({column}, 2) as decimal(38,2)), '999999999999999999999999.00')) END as {column}")
+                     used_columns.append(column)
+                     numeric_columns.append(column)
+
+                 elif column_datatype.lower() in ('cv', 'cf', 'co'):
+                     column_intersecions_new.append(f'TRIM({column}) AS {column}')
+                     used_columns.append(column)
+                 else:
+                     column_intersecions_new.append(column)
+                     used_columns.append(column)
+             column_intersections = column_intersecions_new.copy()
+             columns = ", ".join(column_intersections)
+             sample_query = f"SELECT {columns} FROM {object.schema}.{object.name} SAMPLE {sample_count};"
+
+         # ##
+         # ## Only filter for the last 5 days for LAGERBESTAND_MAERKTE_TAG
+         # if object == 'LAGERBESTAND_MAERKTE_TAG':
+         #     sample_query = sample_query.upper()
+         #     if 'WHERE ' in sample_query:
+         #         sample_query = sample_query.replace("WHERE ", " AND (").replace("ORDER BY ", ") ORDER BY ")
+         #     sample_query = sample_query.replace(f"FROM {object.database}.{object.schema}.{object.name}", f"FROM {object.database}.{object.schema}.{object.name} WHERE dat_jjjjmmtt > to_char(current_date()-6, 'YYYYMMDD')")
+
+         error_dict = {}
+         key_dict = {}
+         try:
+             sample_pdf = self.execute_queries(sample_query, True)
+             for key in key_intersection:
+                 key_dict[key] = list(sample_pdf[key])
+
+         except Exception as err:
+             sample_pdf = pd.DataFrame()
+             sample_pdf["TESTATM_ERROR"] = [1]
+             if '|||' in str(err):
+                 error_dict = {
+                     "QUERY": str(err).split('|||')[0],
+                     "ERROR": str(err).split('|||')[1]
+                 }
+             else:
+                 error_dict = {
+                     "QUERY": 'No SQL Error',
+                     "ERROR": str(err)
+                 }
+
+         return_list = []
+         return_list.append(sample_pdf)
+         return_list.append(error_dict)
+
+         return return_list, key_dict, used_columns, sample_query.replace(f"SAMPLE {sample_count}", "")
+
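For illustration, a key_filters mapping is expanded into one conjunction per row, OR-ed together (values here are hypothetical; numeric key columns are compared through the same CAST(ROUND(...)) rendering as the selected columns):

    key_filters = {"ORDER_ID": [1, 2], "REGION": ["N", "S"]}
    # appended to the WHERE clause, schematically:
    #   ... AND ((ORDER_ID = 1 AND REGION = 'N') OR (ORDER_ID = 2 AND REGION = 'S'))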
+     def execute_queries(self, query: Union[str, List[str]], return_as_pdf: bool=False) -> Union[List[Dict], List[List[Dict]]]:
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         query_list: List[str] = query if isinstance(query, list) else [query]
+
+         results = []
+
+         for single_query in query_list:
+             try:
+                 if return_as_pdf:
+                     query_result = pd.read_sql(single_query, self.teradata_connection)
+                 else:
+                     cursor = self.teradata_connection.cursor()
+                     query_result = cursor.execute(single_query)
+
+                 results.append(query_result)
+
+             except Exception as err:
+                 raise Exception(single_query + "|||" + str(err))
+
+         return results[0] if not isinstance(query, list) else results
+
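A usage sketch: a single query yields one cursor (or one DataFrame with return_as_pdf=True), while a list of queries yields a list of results in the same order:

    cursor = teradata_service.execute_queries("SELECT COUNT(*) FROM DBC.TablesV;")
    row_count = cursor.fetchall()[0][0]

    pdf = teradata_service.execute_queries("SELECT * FROM DBC.TablesV SAMPLE 5;", True)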
+     def execute_statement(self, statement: Union[str, List[str]]) -> None:
+         """
+         Executes a simple statement against Teradata.
+
+         Args:
+             statement (Union[str, List[str]]): a sql statement or a list of sql statements to execute
+         """
+         if self.teradata_connection is None:
+             self._connect_to_teradata()
+
+         statement_list: List[str] = (
+             statement if isinstance(statement, list) else [statement]
+         )
+
+         try:
+             for single_statement in statement_list:
+                 stripped_statement = single_statement.strip()
+                 _ = self.teradata_connection.cursor().execute(stripped_statement)
+
+         except Exception as err:
+             raise Exception(self._get_error_message(err, single_statement)) from err
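A hedged usage sketch (table names are placeholders):

    teradata_service.execute_statement([
        "DELETE FROM SALES.ORDERS_STAGE;",  # placeholder table
        "INSERT INTO SALES.ORDERS_STAGE SELECT * FROM SALES.ORDERS;",
    ])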