icsDataValidation 1.0.378__py3-none-any.whl → 1.0.415__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. icsDataValidation/configuration.py +0 -0
  2. icsDataValidation/connection_setups/__init__.py +0 -0
  3. icsDataValidation/connection_setups/azure_connection_setup.py +2 -1
  4. icsDataValidation/connection_setups/databricks_connection_setup.py +0 -0
  5. icsDataValidation/connection_setups/exasol_connection_setup.py +0 -0
  6. icsDataValidation/connection_setups/oracle_connection_setup.py +0 -0
  7. icsDataValidation/connection_setups/snowflake_connection_setup.py +0 -0
  8. icsDataValidation/connection_setups/sqlserver_connection_setup.py +20 -0
  9. icsDataValidation/connection_setups/teradata_connection_setup.py +0 -0
  10. icsDataValidation/core/__init__.py +0 -0
  11. icsDataValidation/core/database_objects.py +0 -0
  12. icsDataValidation/core/object_comparison.py +0 -0
  13. icsDataValidation/input_parameters/__init__.py +0 -0
  14. icsDataValidation/input_parameters/testing_tool_params.py +4 -3
  15. icsDataValidation/main.py +15 -11
  16. icsDataValidation/output_parameters/__init__.py +0 -0
  17. icsDataValidation/output_parameters/result_params.py +0 -0
  18. icsDataValidation/services/__init__.py +0 -0
  19. icsDataValidation/services/comparison_service.py +80 -76
  20. icsDataValidation/services/database_services/__init__.py +0 -0
  21. icsDataValidation/services/database_services/azure_service.py +69 -43
  22. icsDataValidation/services/database_services/databricks_hive_metastore_service.py +20 -7
  23. icsDataValidation/services/database_services/databricks_unity_catalog_service.py +20 -12
  24. icsDataValidation/services/database_services/exasol_service.py +26 -23
  25. icsDataValidation/services/database_services/oracle_service.py +64 -55
  26. icsDataValidation/services/database_services/snowflake_service.py +85 -36
  27. icsDataValidation/services/database_services/sqlserver_service.py +868 -0
  28. icsDataValidation/services/database_services/teradata_service.py +54 -37
  29. icsDataValidation/services/initialization_service.py +0 -0
  30. icsDataValidation/services/result_service.py +0 -0
  31. icsDataValidation/services/system_service.py +4 -0
  32. icsDataValidation/services/testset_service.py +0 -0
  33. icsDataValidation/utils/__init__.py +0 -0
  34. icsDataValidation/utils/file_util.py +0 -0
  35. icsDataValidation/utils/logger_util.py +0 -0
  36. icsDataValidation/utils/pandas_util.py +0 -0
  37. icsDataValidation/utils/parallelization_util.py +0 -0
  38. icsDataValidation/utils/sql_util.py +0 -0
  39. icsdatavalidation-1.0.415.dist-info/METADATA +298 -0
  40. {icsDataValidation-1.0.378.dist-info → icsdatavalidation-1.0.415.dist-info}/RECORD +18 -18
  41. {icsDataValidation-1.0.378.dist-info → icsdatavalidation-1.0.415.dist-info}/WHEEL +1 -1
  42. icsdatavalidation-1.0.415.dist-info/top_level.txt +1 -0
  43. examples/ics_data_validation.py +0 -7
  44. examples/manual_execution_params.template.py +0 -44
  45. icsDataValidation-1.0.378.dist-info/METADATA +0 -20
  46. icsDataValidation-1.0.378.dist-info/top_level.txt +0 -4
icsDataValidation/services/database_services/sqlserver_service.py ADDED
@@ -0,0 +1,868 @@
+ import pyodbc
+ import pandas.io.sql
+ import logging
+ import pandas as pd
+
+ from pathlib import PurePath
+
+ from icsDataValidation.core.database_objects import DatabaseObject
+ from icsDataValidation.utils.logger_util import configure_dev_ops_logger
+
+ #########################################################################################
+ #########################################################################################
+
+ # Configure Dev Ops Logger
+
+ logger = logging.getLogger("SQLServer_Service")
+ logger.setLevel(logging.INFO)
+ configure_dev_ops_logger(logger)
+
+ class SQLServerService:
+     def __init__(self, connection_params: dict):
+         self.connection_params = connection_params
+         self.sqlserver_connection = None
+         self.sqlserver_datatype_mapping = {
+             "string": ["varchar", "nvarchar", "text", "ntext", "char", "nchar"],
+             "numeric": ["tinyint", "smallint", "int", "bigint", "decimal", "numeric", "smallmoney", "money", "float", "real"],
+             "date_and_time": ["date", "time", "datetime", "datetime2", "smalldatetime", "datetimeoffset", "timestamp"],
+             "binary": ["varbinary", "binary"],
+             "boolean": ["bit"],
+         }
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exception_type, exception_value, traceback):
+         if self.sqlserver_connection is not None:
+             self.sqlserver_connection.close()
+
+     def __del__(self):
+         if self.sqlserver_connection is not None:
+             self.sqlserver_connection.close()
+
+     def _connect_to_sqlserver(self):
+         sqlserver_connection_string = (
+             f"DRIVER={self.connection_params['Driver']};"
+             f"SERVER={self.connection_params['Server']};"
+             f"PORT={self.connection_params['Port']};"
+             f"DATABASE={self.connection_params['Database']};"
+             f"UID={self.connection_params['User']};"
+             f"PWD={self.connection_params['Password']}"
+         )
+         self.sqlserver_connection = pyodbc.connect(sqlserver_connection_string)
+         return self.sqlserver_connection
+
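
For orientation, a minimal usage sketch (not part of the package): the dictionary keys mirror the ones read in _connect_to_sqlserver, while the driver name, host, and credentials below are placeholders. The connection is opened lazily by the first query, and __exit__ closes it.

from icsDataValidation.services.database_services.sqlserver_service import SQLServerService

connection_params = {
    "Driver": "{ODBC Driver 17 for SQL Server}",  # placeholder ODBC driver name
    "Server": "myserver.example.com",             # hypothetical host
    "Port": 1433,
    "Database": "MY_DB",
    "User": "my_user",
    "Password": "my_password",
}

# Using the service as a context manager guarantees that the pyodbc
# connection is closed even if a query raises.
with SQLServerService(connection_params) as service:
    rows = service.execute_queries("SELECT @@VERSION AS VERSION;")
    print(rows[0]["VERSION"])
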
+     @staticmethod
+     def _get_error_message(exception: Exception, statement: str) -> str:
+         """
+         Compose the error message if the execution of a statement or query fails.
+         """
+         if hasattr(exception, "raw_msg"):
+             message = exception.raw_msg.replace("\n", " ")
+         else:
+             # Fall back to str() so that every kind of error yields a message,
+             # even if it does not have a raw_msg attribute.
+             message = str(exception)
+         if hasattr(exception, "sfqid"):
+             message = message + f"\nQuery ID: {exception.sfqid}"
+         return f"SQLServer ERROR: {message}\nFailed statement:\n{statement}"
+
+     @staticmethod
+     def _get_in_clause(key_filters: dict, numeric_columns: list, numeric_scale: int,
+                        enclose_column_by_double_quotes: bool = False) -> str:
+         """Generates an IN clause from the given key filters, ready to extend the WHERE clause; numeric values are rounded.
+
+         Args:
+             key_filters (dict): expected values per key column
+             numeric_columns (list): list of all numeric columns
+             numeric_scale (int): number of decimal places after rounding
+
+         Returns:
+             str: IN clause as string
+         """
+         values = list(key_filters.values())
+         in_clause_values = "('"
+         for j in range(len(values[0])):
+             for value in values:
+                 in_clause_values += str(value[j]) + "','"
+             in_clause_values = in_clause_values[:-2] + "),('"
+         in_clause_values = in_clause_values[:-3] + ")"
+
+         in_clause_cols = " AND (("
+         for key in key_filters.keys():
+             if key in numeric_columns:
+                 in_clause_cols += f"""ROUND({key.replace("'", "")}, {numeric_scale})""" + ","
+             else:
+                 in_clause_cols += key.replace("'", "") + ","
+         in_clause_cols = in_clause_cols[:-1] + ")"
+         in_clause = in_clause_cols + " in (" + in_clause_values + ")"
+         return in_clause
+
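
Because _get_in_clause is a staticmethod, its output can be inspected without a live connection. A small worked example with hypothetical key filters, assuming the package and its dependencies are installed:

from icsDataValidation.services.database_services.sqlserver_service import SQLServerService

fragment = SQLServerService._get_in_clause(
    key_filters={"ID": [1, 2], "NAME": ["a", "b"]},  # hypothetical expected key values
    numeric_columns=["ID"],
    numeric_scale=2,
)
print(fragment)
# Output (note the leading space):
#  AND ((ROUND(ID, 2),NAME) in (('1','a'),('2','b')))
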
+     def _get_column_clause(self, column_list: list, columns_datatype: list, numeric_scale, key_columns,
+                            enclose_column_by_double_quotes: bool = False) -> tuple:
+         """
+         Turns the list of desired columns into a SQL-compatible string.
+         Columns with a date or time data type are omitted unless they are key columns.
+
+         Args:
+             column_list (list): list of all columns
+             columns_datatype (list): datatypes of the given columns
+             numeric_scale (int): number of decimal places for numeric columns
+             key_columns (list): list of columns of interest
+
+         Returns:
+             tuple: column clause as string, list of numeric columns, list of used columns
+         """
+         column_intersections_new = []
+         used_columns = []
+         numeric_columns = []
+         for column in column_list:
+             column_datatype = next(x for x in columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+
+             if column in key_columns or column_datatype.lower() not in self.sqlserver_datatype_mapping["date_and_time"]:
+                 if column_datatype.lower() in self.sqlserver_datatype_mapping["numeric"]:
+                     if numeric_scale:
+                         column_intersections_new.append(
+                             f"CAST(ROUND({column}, {numeric_scale}) as decimal(38,{numeric_scale})) as {column}"
+                         )
+                     else:
+                         column_intersections_new.append(f"{column} as {column}")
+                     used_columns.append(column)
+                     numeric_columns.append(column)
+                 elif column_datatype.lower() in self.sqlserver_datatype_mapping["string"]:
+                     column_intersections_new.append(f"{column} AS {column}")
+                     used_columns.append(column)
+                 else:
+                     column_intersections_new.append(column)
+                     used_columns.append(column)
+
+         column_intersections = column_intersections_new.copy()
+         column_clause = str(column_intersections)[1:-1].replace("'", "")
+         return column_clause, numeric_columns, used_columns
+
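
_get_column_clause can likewise be exercised offline, since __init__ does not open a connection. A sketch with made-up column metadata, showing how numeric columns are rounded and date/time columns are dropped:

svc = SQLServerService(connection_params={})  # no connection is opened in __init__

clause, numeric_cols, used_cols = svc._get_column_clause(
    column_list=["ID", "NAME", "LOADED_AT"],
    columns_datatype=[
        {"COLUMN_NAME": "ID", "DATA_TYPE": "int"},
        {"COLUMN_NAME": "NAME", "DATA_TYPE": "varchar"},
        {"COLUMN_NAME": "LOADED_AT", "DATA_TYPE": "datetime"},
    ],
    numeric_scale=2,
    key_columns=["ID"],
)
print(clause)        # CAST(ROUND(ID, 2) as decimal(38,2)) as ID, NAME AS NAME
print(numeric_cols)  # ['ID']
print(used_cols)     # ['ID', 'NAME']
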
+     def get_database_objects(
+         self, database: str, schema: str = None, object_type_restriction: str = "include_all"
+     ) -> list[dict]:
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         all_database_tables = []
+         all_database_views = []
+
+         if object_type_restriction == "include_all" or object_type_restriction == "include_only_tables":
+             if schema:
+                 query_db_tables = f"SELECT SCHEMA_NAME(T.SCHEMA_ID) AS SCHEMA_NAME, T.NAME AS TABLE_NAME FROM SYS.TABLES T WHERE SCHEMA_NAME(T.SCHEMA_ID) = '{schema}' ORDER BY SCHEMA_NAME;"
+             else:
+                 query_db_tables = "SELECT SCHEMA_NAME(T.SCHEMA_ID) AS SCHEMA_NAME, T.NAME AS TABLE_NAME FROM SYS.TABLES T ORDER BY SCHEMA_NAME;"
+
+             all_database_tables = self.execute_queries(query_db_tables)
+
+         if object_type_restriction == "include_all" or object_type_restriction == "include_only_views":
+             if schema:
+                 query_db_views = f"SELECT SCHEMA_NAME(T.SCHEMA_ID) AS SCHEMA_NAME, T.NAME AS TABLE_NAME FROM SYS.VIEWS T WHERE SCHEMA_NAME(T.SCHEMA_ID) = '{schema}' ORDER BY SCHEMA_NAME;"
+             else:
+                 query_db_views = "SELECT SCHEMA_NAME(T.SCHEMA_ID) AS SCHEMA_NAME, T.NAME AS TABLE_NAME FROM SYS.VIEWS T ORDER BY SCHEMA_NAME;"
+
+             all_database_views = self.execute_queries(query_db_views)
+
+         database_objects = []
+         for row in all_database_tables:
+             database_table = f"{database}.{row['SCHEMA_NAME'].upper()}.{row['TABLE_NAME'].upper()}"
+             database_objects.append({"object_identifier": database_table, "object_type": "table"})
+         for row in all_database_views:
+             database_view = f"{database}.{row['SCHEMA_NAME'].upper()}.{row['TABLE_NAME'].upper()}"
+             database_objects.append({"object_identifier": database_view, "object_type": "view"})
+         return database_objects
+
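
Run against a live SQL Server (reusing the hypothetical connection_params sketched earlier), the method returns a flat list of fully qualified identifiers; the database and schema names here are made up:

with SQLServerService(connection_params) as service:
    objects = service.get_database_objects(
        "MY_DB", schema="dbo", object_type_restriction="include_only_tables"
    )

# Expected shape of the result:
# [
#     {"object_identifier": "MY_DB.DBO.CUSTOMERS", "object_type": "table"},
#     {"object_identifier": "MY_DB.DBO.ORDERS", "object_type": "table"},
# ]
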
+     def get_last_altered_timestamp_from_object(self, object: DatabaseObject) -> str:
+         """
+         Queries the last_altered timestamp for the given object.
+
+         Args:
+             object (DatabaseObject): object for comparison
+
+         Returns:
+             str: last_altered timestamp (first result row)
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         query_get_last_altered = f"SELECT MODIFY_DATE AS LAST_ALTERED FROM SYS.OBJECTS WHERE NAME = '{object.name}' AND SCHEMA_ID = SCHEMA_ID('{object.schema}');"
+
+         last_altered = self.execute_queries(query_get_last_altered)[0]
+
+         return last_altered
+
+     def get_columns_from_object(self, object: DatabaseObject) -> list:
+         """
+         returns all columns from given object
+
+         Args:
+             object (DatabaseObject): table or view
+
+         Returns:
+             list: list of all columns
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         if object.type == "table":
+             query_get_columns = f"""
+                 SELECT
+                     COL.NAME
+                 FROM SYS.TABLES AS TAB
+                 INNER JOIN SYS.COLUMNS AS COL ON (
+                     TAB.OBJECT_ID = COL.OBJECT_ID
+                     AND UPPER(TAB.NAME) = '{object.name.upper()}'
+                 )
+                 INNER JOIN (
+                     SELECT
+                         OBJECT_ID,
+                         SCHEMA_ID
+                     FROM
+                         SYS.OBJECTS
+                 ) AS OBJ ON (
+                     TAB.OBJECT_ID = OBJ.OBJECT_ID
+                     AND SCHEMA_NAME(OBJ.SCHEMA_ID) = '{object.schema.upper()}'
+                 )
+                 ;
+             """
+
+         elif object.type == "view":
+             query_get_columns = f"""
+                 SELECT
+                     COL.NAME
+                 FROM SYS.VIEWS AS VW
+                 INNER JOIN SYS.COLUMNS AS COL ON (
+                     VW.OBJECT_ID = COL.OBJECT_ID
+                     AND UPPER(VW.NAME) = '{object.name.upper()}'
+                 )
+                 INNER JOIN (
+                     SELECT
+                         OBJECT_ID,
+                         SCHEMA_ID
+                     FROM
+                         SYS.OBJECTS
+                 ) AS OBJ ON (
+                     VW.OBJECT_ID = OBJ.OBJECT_ID
+                     AND SCHEMA_NAME(OBJ.SCHEMA_ID) = '{object.schema.upper()}'
+                 )
+                 ;
+             """
+
+         columns_result = self.execute_queries(query_get_columns)
+
+         columns = [row['NAME'] for row in columns_result]
+
+         return columns
+
+     def get_row_count_from_object(self, object: DatabaseObject, where_clause: str = "") -> tuple[int, list]:
+         """
+         Gets the row count of the given object.
+
+         Args:
+             object (DatabaseObject): table or view
+
+         Returns:
+             int: number of rows in the object (-1 if the count query failed)
+             list: errors raised while counting, together with the failed query
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         # TODO: is it more efficient to select the INFORMATION_SCHEMA.TABLES view to get the rows?
+         query_get_row_count = (
+             f"SELECT COUNT(*) AS ROW_COUNT FROM {object.schema}.{object.name} {where_clause};"
+         )
+         row_count = -1
+         error_list = []
+
+         try:
+             row_count = self.execute_queries(query_get_row_count)[0]['ROW_COUNT']
+
+         except Exception as err:
+             error_list.append(str(err))
+             error_list.append(query_get_row_count)
+
+         return row_count, error_list
+
+     def get_data_types_from_object(self, object: DatabaseObject, column_intersections: list) -> list[dict]:
+         """
+         Returns the datatypes of all intersection columns of a database object.
+
+         Args:
+             object (DatabaseObject): table or view
+             column_intersections (list): columns for which the data type is queried
+
+         Returns:
+             list[dict]: columns and their datatypes
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         column_intersections = str(column_intersections)[1:-1]
+         if column_intersections == "":
+             column_intersections = "''"
+
+         if object.type == 'table':
+             query_get_data_types_from_object = f"""
+                 SELECT
+                     COL.NAME,
+                     T.NAME AS DATA_TYPE
+                 FROM
+                     SYS.TABLES AS TAB
+                     INNER JOIN SYS.COLUMNS AS COL ON TAB.OBJECT_ID = COL.OBJECT_ID
+                     LEFT JOIN SYS.TYPES AS T ON COL.USER_TYPE_ID = T.USER_TYPE_ID
+                 WHERE
+                     TAB.NAME = '{object.name}'
+                     AND SCHEMA_NAME (TAB.SCHEMA_ID) = '{object.schema}'
+                     AND COL.NAME IN ({column_intersections})
+                 ;
+             """
+         elif object.type == 'view':
+             query_get_data_types_from_object = f"""
+                 SELECT
+                     COL.NAME,
+                     T.NAME AS DATA_TYPE
+                 FROM
+                     SYS.VIEWS AS VW
+                     INNER JOIN SYS.COLUMNS AS COL ON VW.OBJECT_ID = COL.OBJECT_ID
+                     LEFT JOIN SYS.TYPES AS T ON COL.USER_TYPE_ID = T.USER_TYPE_ID
+                 WHERE
+                     VW.NAME = '{object.name}'
+                     AND SCHEMA_NAME (VW.SCHEMA_ID) = '{object.schema}'
+                     AND COL.NAME IN ({column_intersections})
+                 ;
+             """
+
+         data_types_result = self.execute_queries(query_get_data_types_from_object)
+
+         datatypes = [{"COLUMN_NAME": row['NAME'], "DATA_TYPE": row['DATA_TYPE']} for row in data_types_result]
+
+         return datatypes
+
+     def get_count_distincts_from_object(
+         self,
+         object: DatabaseObject,
+         column_intersections: list,
+         where_clause: str = "",
+         exclude_columns: list = [],
+         enclose_column_by_double_quotes: bool = False
+     ) -> tuple:
+         """
+         Gets the distinct count for every column of a database object that is in the column intersections list.
+
+         Args:
+             object (DatabaseObject): table or view
+             column_intersections (list): columns that are used for the distinct count
+             where_clause (str, optional): optional further filter. Defaults to "".
+             exclude_columns (list, optional): columns to exclude from the distinct count. Defaults to [].
+
+         Returns:
+             dict: distinct counts per column
+             error_list: list of failed executions of the distinct counts
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         unions = ""
+
+         for column in column_intersections:
+             if column not in exclude_columns:
+                 unions += f"""
+ UNION
+ SELECT
+     '{column}' AS COLUMN_NAME,
+     COUNT(DISTINCT {column}) AS COUNT_DISTINCT
+ FROM {object.schema}.{object.name}
+ {where_clause}
+ """
+
+         # unions[6:] strips the leading "\nUNION" so that the first SELECT
+         # is not preceded by a UNION keyword.
+         query_get_count_distincts_from_object = f"{unions[6:]} ORDER BY COUNT_DISTINCT;"
+         error_list = []
+
+         try:
+             dict_count_distincts = self.execute_queries(query_get_count_distincts_from_object)
+         except Exception as err:
+             dict_count_distincts = [{"COUNT_DISTINCT": 0}]
+             error_list.append(["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]])
+
+         return dict_count_distincts, error_list
+
+     def get_table_size(self, object: DatabaseObject) -> int:
+         """
+         returns size of given object
+
+         Args:
+             object (DatabaseObject): table or view
+
+         Returns:
+             int: size of object
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         query_get_table_size = f"""
+             SELECT
+                 CAST(
+                     SUM(SPC.USED_PAGES * 8) / 1024.00 * 1000000 AS INTEGER
+                 ) AS BYTES
+             FROM
+                 SYS.TABLES TAB
+                 INNER JOIN SYS.INDEXES IND ON TAB.OBJECT_ID = IND.OBJECT_ID
+                 INNER JOIN SYS.PARTITIONS PART ON IND.OBJECT_ID = PART.OBJECT_ID
+                     AND IND.INDEX_ID = PART.INDEX_ID
+                 INNER JOIN SYS.ALLOCATION_UNITS SPC ON PART.PARTITION_ID = SPC.CONTAINER_ID
+             WHERE
+                 SCHEMA_NAME (TAB.SCHEMA_ID) = '{object.schema}'
+                 AND TAB.NAME = '{object.name}'
+             GROUP BY
+                 SCHEMA_NAME (TAB.SCHEMA_ID) + '.' + TAB.NAME
+             ORDER BY
+                 SUM(SPC.USED_PAGES) DESC;
+         """
+         size = self.execute_queries(query_get_table_size)[0]['BYTES']
+
+         return size
+
+     def create_checksums(
+         self,
+         object: DatabaseObject,
+         column_intersections: list,
+         where_clause: str = "",
+         exclude_columns: list = [],
+         numeric_scale: int = None,
+         enclose_column_by_double_quotes: bool = False,
+     ) -> dict:
+         """Creates checksums for the given object in compliance with the given conditions.
+
+         Args:
+             object (DatabaseObject): table or view
+             column_intersections (list): columns that are used for the checksums
+             where_clause (str, optional): optional filter criteria given as a SQL-usable string. Defaults to "".
+             exclude_columns (list, optional): columns to exclude from the calculation. Defaults to [].
+             numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.
+
+         Returns:
+             dict: checksums for the columns of the object
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         column_intersections = [f"{x.upper()}" for x in column_intersections if x not in exclude_columns]
+
+         dict_columns_datatype = self.get_data_types_from_object(object, column_intersections)
+
+         aggregates = ""
+         count_nulls = ""
+
+         for column in column_intersections:
+             column_datatype = next(x for x in dict_columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+
+             count_nulls += f", SUM(CASE WHEN {column} IS NULL THEN 1 ELSE 0 END) AS COUNTNULLS_{column}"
+
+             if column_datatype.lower() in self.sqlserver_datatype_mapping["numeric"]:
+                 if numeric_scale:
+                     aggregates += (
+                         f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS SUM_{column}"
+                     )
+                 else:
+                     aggregates += f", CAST(SUM({column}) AS DECIMAL(38)) AS SUM_{column}"
+
+             elif (
+                 column_datatype.lower() in self.sqlserver_datatype_mapping["string"]
+                 or column_datatype.lower() in self.sqlserver_datatype_mapping["date_and_time"]
+             ):
+                 aggregates += f", COUNT(DISTINCT LOWER({column})) AS COUNTDISTINCT_{column}"
+
+             elif column_datatype.lower() in self.sqlserver_datatype_mapping["binary"]:
+                 aggregates += f", COUNT(DISTINCT LOWER(TRY_CONVERT(VARCHAR,{column}))) AS COUNTDISTINCT_{column}"
+
+             elif column_datatype.lower() in self.sqlserver_datatype_mapping["boolean"]:
+                 aggregates += f", CONCAT(CONCAT(CONVERT(VARCHAR, COUNT(CASE WHEN {column} = 1 THEN 1 ELSE NULL END)), '_'), CONVERT(VARCHAR, COUNT(CASE WHEN {column} = 0 THEN 1 ELSE NULL END))) AS AGGREGATEBOOLEAN_{column}"
+
+             # else: additional data types not aggregated: image, sql_variant, uniqueidentifier, xml, cursor, table
+
+         query_checksums = (
+             f"SELECT {aggregates[1:]} FROM {object.schema}.{object.name} {where_clause};"
+         )
+
+         query_countnulls = (
+             f"SELECT {count_nulls[1:]} FROM {object.schema}.{object.name} {where_clause};"
+         )
+
+         error_list = []
+         test_list = []
+         aggregation_results = {}
+
+         try:
+             checksums_results = self.execute_queries([query_checksums, query_countnulls])
+
+             aggregation_results = checksums_results[0][0]
+
+             countnulls_results = checksums_results[1][0]
+
+             for i in range(0, len(aggregation_results)):
+                 if list(aggregation_results.values())[i] is None:
+                     agg_result = 0
+                 else:
+                     agg_result = list(aggregation_results.values())[i]
+
+                 if list(countnulls_results.values())[i] is None:
+                     cnt_result = 0
+                 else:
+                     cnt_result = list(countnulls_results.values())[i]
+
+                 test_list.append(
+                     [[item.split("_", 1)[0] for item in list(aggregation_results.keys())][i], agg_result, cnt_result]
+                 )
+
+         except Exception as err:
+             error_list.append(["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]])
+
+         checksums = dict(zip([item.split("_", 1)[1] for item in aggregation_results.keys()], test_list))
+         checksums["TESTATM_ERRORS"] = error_list
+
+         return checksums
+
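
The returned dictionary maps each column to a triple of aggregate type, aggregate value, and null count; failed checksum queries are collected under TESTATM_ERRORS. An illustrative shape with made-up columns and numbers:

# Sketch of a create_checksums() result for a table with a numeric
# AMOUNT column and a varchar NAME column (all values invented):
checksums = {
    "AMOUNT": ["SUM", 10573.25, 0],      # numeric column -> SUM aggregate, 0 NULLs
    "NAME": ["COUNTDISTINCT", 412, 3],   # string column  -> distinct count, 3 NULLs
    "TESTATM_ERRORS": [],                # failed checksum queries, if any
}
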
+     def create_pandas_df_from_group_by(
+         self,
+         object: DatabaseObject,
+         column_intersections: list,
+         group_by_columns: list,
+         group_by_aggregation_columns: list,
+         group_by_aggregation_type: str,
+         only_numeric: bool,
+         where_clause: str,
+         exclude_columns: list,
+         numeric_scale: int = None,
+         enclose_column_by_double_quotes: bool = False,
+     ) -> tuple:
+         """Executes multiple aggregations at once, grouped by the given columns.
+
+         Args:
+             object (DatabaseObject): table or view
+             column_intersections (list): columns existing in source and target
+             group_by_columns (list): columns for grouping the aggregations
+             group_by_aggregation_columns (list): list of columns that are supposed to be aggregated
+             group_by_aggregation_type (str): choice between: only_min_max, various, various_and_min_max
+             only_numeric (bool): whether to only do numeric aggregations or to also include distinct counts
+             where_clause (str): optional filter for the aggregations, given as a SQL-compatible WHERE string
+             exclude_columns (list): columns to exclude from the comparisons
+             numeric_scale (int, optional): number of decimal places for aggregations. Defaults to None.
+
+         Returns:
+             tuple: pandas dataframe with the aggregation results, the generated SQL fragments, the final grouping columns, and an error dict
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         if group_by_aggregation_columns == ["all"]:
+             aggregation_columns = [
+                 f"{column.upper()}"
+                 for column in column_intersections
+                 if (column not in group_by_columns and column not in exclude_columns)
+             ]
+         else:
+             aggregation_columns = [
+                 f"{column.upper()}"
+                 for column in column_intersections
+                 if (column in group_by_aggregation_columns and column not in exclude_columns)
+             ]
+
+         group_by_query_columns_string = " "
+         group_by_query_aggregation_string = ""
+         grouping_columns_final = []
+         error_dict = {}
+
+         try:
+             for column in group_by_columns:
+                 if column in column_intersections and column not in exclude_columns:
+                     group_by_query_columns_string += f"{column} ,"
+                     grouping_columns_final.append(column)
+
+             group_by_query_columns_string = group_by_query_columns_string[:-1]
+
+             dict_columns_datatype = self.get_data_types_from_object(object, aggregation_columns)
+
+             aggregates = ""
+             aggregates_min = ""
+
+             for column in aggregation_columns:
+                 column_datatype = next(x for x in dict_columns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
+
+                 if column_datatype.lower() in self.sqlserver_datatype_mapping["numeric"]:
+                     if numeric_scale:
+                         aggregates_min += f", CAST(ROUND(MIN({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MIN_{column}, CAST(ROUND(MAX({column}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS MAX_{column}"
+                         aggregates += f", CAST(ROUND(SUM({column}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS SUM_{column}"
+                     else:
+                         aggregates_min += f", MIN({column}) AS MIN_{column}, MAX({column}) AS MAX_{column}"
+                         aggregates += f", SUM({column}) AS SUM_{column}"
+
+                 elif not only_numeric and (
+                     column_datatype.lower() in self.sqlserver_datatype_mapping["string"]
+                     or column_datatype.lower() in self.sqlserver_datatype_mapping["date_and_time"]
+                 ):
+                     aggregates += f", COUNT(DISTINCT LOWER({column})) AS COUNTDISTINCT_{column}"
+
+                 elif not only_numeric and column_datatype.lower() in self.sqlserver_datatype_mapping["binary"]:
+                     aggregates += f", COUNT(DISTINCT LOWER(TRY_CONVERT(VARCHAR,{column}))) AS COUNTDISTINCT_{column}"
+
+                 elif not only_numeric and column_datatype.lower() in self.sqlserver_datatype_mapping["boolean"]:
+                     aggregates += f", CONCAT(CONCAT(CONVERT(VARCHAR, COUNT(CASE WHEN {column} = 1 THEN 1 ELSE NULL END)), '_'), CONVERT(VARCHAR, COUNT(CASE WHEN {column} = 0 THEN 1 ELSE NULL END))) AS AGGREGATEBOOLEAN_{column}"
+
+                 # else: additional data types: VARIANT, OBJECT, ARRAY, GEOGRAPHY
+
+             # CASE 1: min_max
+             if group_by_aggregation_type == "only_min_max":
+                 group_by_query_aggregation_string = aggregates_min[1:]
+
+             # CASE 2: sum, count_distinct, aggregate_boolean
+             elif group_by_aggregation_type == "various":
+                 group_by_query_aggregation_string = aggregates[1:]
+
+             # CASE 3: sum, count_distinct, aggregate_boolean, min_max
+             elif group_by_aggregation_type == "various_and_min_max":
+                 group_by_query_aggregation_string = f"{aggregates_min[1:]}{aggregates}"
+
+             query_group_by_aggregation = f"SELECT {group_by_query_columns_string}, COUNT(*) AS COUNT_OF_GROUP_BY_VALUE, {group_by_query_aggregation_string} FROM {object.database}.{object.schema}.{object.name} {where_clause} GROUP BY {group_by_query_columns_string} ORDER BY {group_by_query_columns_string};"
+
+             group_by_aggregation_pdf = self.execute_queries(query_group_by_aggregation, True)
+         except Exception as err:
+             group_by_aggregation_pdf = pd.DataFrame()
+             group_by_aggregation_pdf["TESTATM_ERROR"] = [1]
+             if not grouping_columns_final:
+                 error_dict = {
+                     "QUERY": "No Group-By columns found in the columns intersection. Please check that the configured Group-By columns exist in the table.",
+                     "ERROR": "No Group-By columns found in the columns intersection. Please check that the configured Group-By columns exist in the table.",
+                 }
+                 group_by_query_aggregation_string = ""
+             elif "|||" in str(err):
+                 error_dict = {"QUERY": str(err).split("|||")[0], "ERROR": str(err).split("|||")[1]}
+             else:
+                 error_dict = {
+                     "QUERY": "No query generated. Please check that the configured grouping columns exist in the table.",
+                     "ERROR": str(err),
+                 }
+                 group_by_query_aggregation_string = ""
+
+         return (
+             group_by_aggregation_pdf,
+             group_by_query_aggregation_string,
+             group_by_query_columns_string,
+             grouping_columns_final,
+             error_dict,
+         )
+
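
Callers unpack the five return values as a unit. A sketch with hypothetical table and column names; constructing my_object via icsDataValidation.core.database_objects is omitted here:

(
    group_by_pdf,       # pandas DataFrame, one row per group
    aggregation_sql,    # generated aggregation fragment of the SELECT
    grouping_sql,       # generated grouping-column fragment
    grouping_columns,   # group-by columns that survived the intersection checks
    error_dict,         # {"QUERY": ..., "ERROR": ...} on failure, else {}
) = service.create_pandas_df_from_group_by(
    object=my_object,                          # hypothetical DatabaseObject
    column_intersections=["REGION", "AMOUNT"],
    group_by_columns=["REGION"],
    group_by_aggregation_columns=["all"],
    group_by_aggregation_type="various",
    only_numeric=True,
    where_clause="",
    exclude_columns=[],
    numeric_scale=2,
)
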
+     def create_pandas_df(
+         self,
+         object: DatabaseObject,
+         intersection_columns_trgt_src: list,
+         where_clause: str = "",
+         exclude_columns: list = [],
+         enclose_column_by_double_quotes: bool = False,
+     ) -> pd.DataFrame:
+         """Creates a pandas dataframe with all data of the given object in the given columns.
+
+         Args:
+             object (DatabaseObject): table or view
+             intersection_columns_trgt_src (list): columns existing in source and target
+
+         Returns:
+             pd.DataFrame: direct result of the SQL query
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         intersection_columns_trgt_src_ = ", ".join(list(set(intersection_columns_trgt_src) - set(exclude_columns)))
+
+         df_query = f"SELECT {intersection_columns_trgt_src_} FROM {object.schema}.{object.name} {where_clause};"
+
+         pdf = self.execute_queries(df_query, True)
+
+         return pdf
+
+     def create_pandas_df_from_sample(
+         self,
+         object: DatabaseObject,
+         column_intersections: list,
+         key_columns: list,
+         where_clause: str = "",
+         exclude_columns: list = [],
+         key_filters: dict = {},
+         dedicated_columns: list = [],
+         sample_count: int = 10,
+         numeric_scale: int = None,
+         enclose_column_by_double_quotes: bool = False,
+     ) -> tuple:
+         """Creates a pandas dataframe from a random sample of the given object, restricted to the column intersections, key filters, and dedicated columns."""
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         sample_count = str(sample_count)
+         key_intersection = list((set(column_intersections) & set(key_columns)) - set(exclude_columns))
+         filter_intersection = list((set(column_intersections) & set(key_filters.keys())) - set(exclude_columns))
+         dedicated_intersection = list((set(column_intersections) & set(dedicated_columns)) - set(exclude_columns))
+
+         key_intersection.sort()
+         filter_intersection.sort()
+         dedicated_intersection.sort()
+
+         if not where_clause:
+             where_clause = "WHERE 1=1 "
+
+         if dedicated_intersection != []:
+             is_dedicated = True
+             dict_columns_datatype = self.get_data_types_from_object(object, dedicated_intersection)
+         else:
+             is_dedicated = False
+             dict_columns_datatype = self.get_data_types_from_object(object, column_intersections)
+
+         if key_intersection != [] and is_dedicated:
+             keys = str(key_intersection)[1:-1].replace("'", "")
+             column_clause, numeric_columns, used_columns = self._get_column_clause(
+                 dedicated_intersection, dict_columns_datatype, numeric_scale, key_columns,
+                 enclose_column_by_double_quotes
+             )
+             if key_filters != {} and filter_intersection != []:
+                 values = list(key_filters.values())
+                 if values[0] != []:
+                     in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale, enclose_column_by_double_quotes)
+                 else:
+                     in_clause = ""
+             else:
+                 in_clause = ""
+             sample_query = f"""
+                 SELECT TOP ({sample_count}) {column_clause}
+                 FROM {object.schema}.{object.name}
+                 {where_clause}{in_clause}
+                 ORDER BY NEWID(), {keys};
+             """
+         elif key_intersection != [] and not is_dedicated:
+             keys = str(key_intersection)[1:-1].replace("'", "")
+             column_clause, numeric_columns, used_columns = self._get_column_clause(
+                 column_intersections, dict_columns_datatype, numeric_scale, key_columns,
+                 enclose_column_by_double_quotes
+             )
+             if key_filters != {} and filter_intersection != []:
+                 values = list(key_filters.values())
+                 if values[0] != []:
+                     in_clause = self._get_in_clause(key_filters, numeric_columns, numeric_scale, enclose_column_by_double_quotes)
+                 else:
+                     in_clause = ""
+             else:
+                 in_clause = ""
+             sample_query = f"""
+                 SELECT TOP ({sample_count}) {column_clause}
+                 FROM {object.schema}.{object.name}
+                 {where_clause}{in_clause}
+                 ORDER BY NEWID(), {keys};
+             """
+         else:
+             column_intersections = list(set(column_intersections) - set(exclude_columns))
+             column_intersections.sort()
+             column_clause, numeric_columns, used_columns = self._get_column_clause(
+                 column_intersections, dict_columns_datatype, numeric_scale, key_columns,
+                 enclose_column_by_double_quotes
+             )
+             sample_query = f"""
+                 SELECT TOP ({sample_count}) {column_clause}
+                 FROM {object.schema}.{object.name}
+                 {where_clause}
+                 ORDER BY NEWID();
+             """
+
+         error_dict = {}
+         key_dict = {}
+         try:
+             sample_pdf = self.execute_queries(sample_query, return_as_pdf=True)
+             for key in key_intersection:
+                 if pd.api.types.is_datetime64_any_dtype(sample_pdf[key]):
+                     key_dict[key] = list(sample_pdf[key].astype(str))
+                 else:
+                     key_dict[key] = list(sample_pdf[key])
+
+         except Exception as err:
+             sample_pdf = pd.DataFrame()
+             sample_pdf["TESTATM_ERROR"] = [1]
+             if "|||" in str(err):
+                 error_dict = {"QUERY": str(err).split("|||")[0], "ERROR": str(err).split("|||")[1]}
+             else:
+                 error_dict = {"QUERY": "No SQL Error", "ERROR": str(err)}
+
+         return_list = []
+         return_list.append(sample_pdf)
+         return_list.append(error_dict)
+
+         return return_list, key_dict, used_columns, sample_query
+
+     def execute_queries(
+         self, query: str | list[str], return_as_pdf: bool = False, return_query_ids: bool = False
+     ) -> list[dict] | list[list[dict]]:
+         """Executes the given queries.
+
+         Args:
+             query (str | list[str]): queries to be executed
+             return_as_pdf (bool, optional): if True, query results are returned as pandas dataframes. Defaults to False.
+
+         Raises:
+             Exception: raises an exception if a single query cannot be executed
+
+         Returns:
+             list[dict] | list[list[dict]]: result rows, per query if a list of queries was given
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         if query:
+             query_list: list[str] = query if isinstance(query, list) else [query]
+         else:
+             logger.error("Query defined as null - please check input for execute_queries function.")
+             return []
+
+         results = []
+
+         cursor = self.sqlserver_connection.cursor()
+
+         for single_query in query_list:
+             try:
+                 if return_as_pdf:
+                     query_result = pandas.io.sql.read_sql(single_query, self.sqlserver_connection)
+                 else:
+                     query_result = cursor.execute(single_query).fetchall()
+                     columns = [column[0] for column in cursor.description]
+                     query_result = [dict(zip(columns, row)) for row in query_result]
+
+                 results.append(query_result)
+             except Exception as err:
+                 raise Exception(single_query + "|||" + str(err)) from err
+
+         return results[0] if not isinstance(query, list) else results
+
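
The return shape depends on whether a single query string or a list of queries is passed, per the final line above. A short sketch, reusing the hypothetical service from earlier:

# A single query string yields one result set: a list of row dicts.
rows = service.execute_queries("SELECT 1 AS X;")
# rows == [{"X": 1}]

# A list of queries yields a list of result sets, one per query.
batches = service.execute_queries(["SELECT 1 AS X;", "SELECT 2 AS Y;"])
# batches == [[{"X": 1}], [{"Y": 2}]]
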
+     def execute_statement(self, statement: str | list[str]) -> None:
+         """
+         Executes a simple statement against SQL Server.
+         Schema and database settings must be set beforehand.
+
+         Args:
+             statement (str | list[str]): a SQL statement or a list of SQL statements to execute
+         """
+         if self.sqlserver_connection is None:
+             self._connect_to_sqlserver()
+
+         statement_list: list[str] = statement if isinstance(statement, list) else [statement]
+
+         try:
+             for single_statement in statement_list:
+                 stripped_statement = single_statement.strip()
+                 _ = self.sqlserver_connection.execute(stripped_statement)
+
+         except Exception as err:
+             raise Exception(self._get_error_message(err, single_statement)) from err