icsDataValidation 1.0.430__py3-none-any.whl → 1.0.439__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. icsDataValidation/connection_setups/sqlserver_connection_setup.py +4 -3
  2. icsDataValidation/input_parameters/testing_tool_params.py +0 -1
  3. icsDataValidation/services/database_services/snowflake_service.py +170 -67
  4. icsDataValidation/services/database_services/sqlserver_service.py +196 -88
  5. {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.439.dist-info}/METADATA +1 -1
  6. {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.439.dist-info}/RECORD +22 -8
  7. {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.439.dist-info}/WHEEL +1 -1
  8. {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.439.dist-info}/top_level.txt +1 -0
  9. tests/snowflake_service/test_create_checksums.py +146 -0
  10. tests/snowflake_service/test_create_pandas_df_from_group_by.py +485 -0
  11. tests/snowflake_service/test_create_pandas_df_from_sample.py +444 -0
  12. tests/snowflake_service/test_get_checksum_statement.py +243 -0
  13. tests/snowflake_service/test_get_column_clause.py +305 -0
  14. tests/snowflake_service/test_get_countnulls_statement.py +128 -0
  15. tests/snowflake_service/test_get_in_clause.py +66 -0
  16. tests/sqlserver_service/test_create_checksums.py +153 -0
  17. tests/sqlserver_service/test_create_pandas_df_from_group_by.py +427 -0
  18. tests/sqlserver_service/test_create_pandas_df_from_sample.py +286 -0
  19. tests/sqlserver_service/test_get_checksum_statement.py +160 -0
  20. tests/sqlserver_service/test_get_column_clause.py +182 -0
  21. tests/sqlserver_service/test_get_countnulls_statement.py +121 -0
  22. tests/sqlserver_service/test_get_in_clause.py +87 -0
@@ -1,8 +1,5 @@
1
1
  import os
2
2
 
3
- from dotenv import load_dotenv
4
- from pathlib import Path
5
-
6
3
  #########################################################################################
7
4
  #########################################################################################
8
5
 
@@ -15,6 +12,10 @@ def load_sqlserver_credentials(system_configs:dict,system_selection:str)->dict:
15
12
  "Password" : os.getenv(system_configs[system_selection]["PASSWORD_NAME"]),
16
13
  "Driver" : system_configs[system_selection]["DRIVER"],
17
14
  "Port" : system_configs[system_selection]["PORT"],
15
+ "Encrypt" : system_configs[system_selection]["Encrypt"],
16
+ "TrustServerCertificate" : system_configs[system_selection]["TrustServerCertificate"]
18
17
  }
19
18
 
19
+
20
+
20
21
  return sqlserver_params
@@ -56,7 +56,6 @@ class TestingToolParams:
56
56
  max_group_by_count_distinct: int = int(os.environ.get('MAX_GROUP_BY_COUNT_DISTINCT','max_group_by_count_distinct env variable not found'))
57
57
  max_group_by_size: int = int(os.environ.get('MAX_GROUP_BY_SIZE','max_group_by_size env variable not found'))
58
58
  numeric_scale: int = int(os.environ.get('NUMERIC_SCALE','numeric_scale env variable not found'))
59
- enclose_column_by_double_quotes: bool = True if os.environ.get('ENCLOSE_COLUMN_BY_DOUBLE_QUOTES','enclose_column_by_double_quotes env variable not found') == 'True' else False
60
59
  branch_name: str = os.environ.get('BRANCH_NAME', 'branch_name env variable not found')
61
60
  source_branch:str = os.environ.get('BUILD_SOURCEBRANCH', 'build_sourcebranch env variable not found')
62
61
  azure_storage_connection_string: str = os.environ.get('AZURE_STORAGE_CONNECTION_STRING','azure_storage_connection_string env variable not found')
@@ -1,9 +1,8 @@
1
- import snowflake.connector
2
1
  import logging
3
- import pandas as pd
4
-
5
2
  from pathlib import PurePath
6
3
 
4
+ import pandas as pd
5
+ import snowflake.connector
7
6
  from cloe_util_snowflake_connector import connection_parameters
8
7
 
9
8
  from icsDataValidation.core.database_objects import DatabaseObject
@@ -69,12 +68,16 @@ class SnowflakeService:
69
68
  key_filters (list): list of given expected values
70
69
  numeric_columns (list): list of all numeric columns
71
70
  numeric_scale (int): number of decimal places after rounding
72
-
71
+ enclose_column_by_double_quotes (bool): whether to enclose column names by double quotes
73
72
  Returns:
74
73
  str: in clause as string
75
74
  """
76
75
  values = list(key_filters.values())
77
76
  in_clause_values = "('"
77
+
78
+ if len(values) == 0:
79
+ return ""
80
+
78
81
  for j in range(len(values[0])):
79
82
  for value in values:
80
83
  in_clause_values += str(value[j]) + "','"
@@ -104,6 +107,7 @@ class SnowflakeService:
104
107
  columns_datatype (list): datatypes of given columns
105
108
  numeric_scale (_type_): number of decimal places for numeric columns
106
109
  key_columns (_type_):list of columns of interest
110
+ enclose_column_by_double_quotes (bool): whether to enclose column names by double quotes
107
111
 
108
112
  Returns:
109
113
  dict: _description_
@@ -141,6 +145,109 @@ class SnowflakeService:
141
145
  column_clause = str(column_intersections)[1:-1].replace("'", "")
142
146
  return column_clause, numeric_columns, used_columns
143
147
 
148
def _get_checksum_statement(
    self,
    object: DatabaseObject,
    column_intersections: list,
    where_clause: str = "",
    exclude_columns: list = None,
    numeric_scale: int = None,
    enclose_column_by_double_quotes: bool = False,
    bool_cast_before_sum: bool = False,
) -> str:
    """
    Creates the checksum sql statement for the given object in compliance with the given conditions.

    One aggregate expression is generated per column, chosen by the column's
    data type as classified by ``self.snowflake_datatype_mapping``:
    numeric -> SUM, string / date_and_time / binary -> COUNT(DISTINCT ...),
    boolean -> "<count of true>_<count of false>". Columns whose data type is in
    none of these groups contribute nothing to the statement.

    Args:
        object (DatabaseObject): table or view
        column_intersections (list): columns that are used for checksums
        where_clause (str, optional): optional filter criteria given as sql-usable string
        exclude_columns (list, optional): columns to exclude from calculation. Defaults to None (no exclusions).
        numeric_scale (int, optional): number of decimal places for aggregations.
            A falsy value (None or 0) selects the unscaled DECIMAL(38) variant.
        enclose_column_by_double_quotes (bool, optional): whether to enclose column names by double quotes. Defaults to False.
        bool_cast_before_sum (bool, optional): whether numeric columns are cast to DECIMAL
            before (instead of after) they are summed. Defaults to False.

    Returns:
        str: checksum sql statement

    Raises:
        StopIteration: if a column has no entry in the result of ``get_data_types_from_object``.
    """
    # None instead of a mutable [] default; both mean "exclude nothing".
    excluded_columns = exclude_columns or []
    column_intersections = [f"{x}" for x in column_intersections if x not in excluded_columns]
    logger.debug(f"Column Intersections: {column_intersections}")
    dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)

    datatype_mapping = self.snowflake_datatype_mapping
    # Every fragment starts with ", "; the leading comma is cut off when the statement is assembled.
    aggregates = []

    for column in column_intersections:
        column_identifier = f'"{column}"' if enclose_column_by_double_quotes else column
        column_datatype = next(
            x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column
        )["DATA_TYPE"].lower()

        if column_datatype in datatype_mapping["numeric"]:
            if not bool_cast_before_sum:
                if numeric_scale:
                    aggregates.append(
                        f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS "SUM_{column}"'
                    )
                else:
                    aggregates.append(f', CAST(SUM({column_identifier}) AS DECIMAL(38)) AS "SUM_{column}"')
            else:
                if numeric_scale:
                    aggregates.append(
                        f', ROUND(SUM(CAST({column_identifier} AS DECIMAL(38, {numeric_scale}))), {numeric_scale}) AS "SUM_{column}"'
                    )
                else:
                    aggregates.append(f', SUM(CAST({column_identifier} AS DECIMAL(38))) AS "SUM_{column}"')

        elif column_datatype in datatype_mapping["string"] or column_datatype in datatype_mapping["date_and_time"]:
            aggregates.append(f', COUNT(DISTINCT LOWER({column_identifier})) AS "COUNTDISTINCT_{column}"')

        elif column_datatype in datatype_mapping["binary"]:
            # Snowflake has no CONVERT / TRY_CONVERT (SQL Server syntax); cast with "::VARCHAR" instead.
            aggregates.append(
                f', COUNT(DISTINCT LOWER({column_identifier}::VARCHAR)) AS "COUNTDISTINCT_{column}"'
            )

        elif column_datatype in datatype_mapping["boolean"]:
            # "<number of true rows>_<number of false rows>", written with Snowflake's
            # "::VARCHAR" cast and "||" concatenation instead of SQL Server's CONVERT / CONCAT.
            aggregates.append(
                f''', COUNT(CASE WHEN {column_identifier} = TRUE THEN 1 ELSE NULL END)::VARCHAR || '_' || COUNT(CASE WHEN {column_identifier} = FALSE THEN 1 ELSE NULL END)::VARCHAR AS "AGGREGATEBOOLEAN_{column}"'''
            )

        # else: additional data types (e.g. VARIANT, OBJECT, ARRAY, GEOGRAPHY) are not aggregated

    query_checksums = (
        f"SELECT {''.join(aggregates)[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
    )

    return query_checksums
215
+
216
def _get_countnulls_statement(
    self,
    object: DatabaseObject,
    column_intersections: list,
    where_clause: str = "",
    exclude_columns: list = None,
    enclose_column_by_double_quotes: bool = False,
) -> str:
    """
    Creates the countnulls sql statement for the given object in compliance with the given conditions.

    For every remaining column the statement contains one
    ``SUM(CASE WHEN <column> IS NULL THEN 1 ELSE 0 END)`` expression aliased as
    ``COUNTNULLS_<column>``.

    Args:
        object (DatabaseObject): table or view
        column_intersections (list): columns for which null values are counted
        where_clause (str, optional): optional filter criteria given as sql-usable string
        exclude_columns (list, optional): columns to exclude from calculation. Defaults to None (no exclusions).
        enclose_column_by_double_quotes (bool, optional): whether to enclose column names by double quotes. Defaults to False.

    Returns:
        str: countnulls sql statement
    """
    # None instead of a mutable [] default; both mean "exclude nothing".
    excluded_columns = exclude_columns or []
    column_intersections = [f"{x}" for x in column_intersections if x not in excluded_columns]
    logger.debug(f"Column Intersections: {column_intersections}")

    # Every fragment starts with ", "; the leading comma is cut off when the statement is assembled.
    count_nulls = []
    for column in column_intersections:
        column_identifier = f'"{column}"' if enclose_column_by_double_quotes else column
        count_nulls.append(
            f', SUM(CASE WHEN {column_identifier} IS NULL THEN 1 ELSE 0 END) AS "COUNTNULLS_{column}"'
        )

    query_countnulls = (
        f"SELECT {''.join(count_nulls)[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
    )

    return query_countnulls
250
+
144
251
  def get_database_objects(
145
252
  self, database: str, schema: str = None, object_type_restriction: str = "include_all"
146
253
  ) -> dict:
@@ -308,6 +415,7 @@ class SnowflakeService:
308
415
  column_intersections (list): columns that are used for distinct count
309
416
  where_clause (str, optional): optional further filter. Defaults to "".
310
417
  exclude_columns (list, optional): columns to exclude from distinct count. Defaults to [].
418
+ enclose_column_by_double_quotes (bool): whether to enclose column names by double quotes. Defaults to False.
311
419
 
312
420
  Returns:
313
421
  dict: distinct counts for columns
@@ -383,83 +491,78 @@ class SnowflakeService:
383
491
  if self.snowflake_connection is None:
384
492
  self._connect_to_snowflake()
385
493
 
386
- column_intersections = [f"{x.upper()}" for x in column_intersections if x not in exclude_columns]
387
-
388
- logger.debug(f"Column Intersections: {column_intersections}")
389
-
390
- dict_colummns_datatype = self.get_data_types_from_object(object, column_intersections)
391
-
392
- aggregates = ""
393
- count_nulls = ""
394
-
395
- for column in column_intersections:
396
- if enclose_column_by_double_quotes:
397
- column_identifier = f'"{column}"'
398
- else:
399
- column_identifier = column
400
- column_datatype = next(x for x in dict_colummns_datatype if x["COLUMN_NAME"] == column)["DATA_TYPE"]
401
-
402
- count_nulls += f', SUM(CASE WHEN {column_identifier} IS NULL THEN 1 ELSE 0 END) AS "COUNTNULLS_{column}"'
403
-
404
- if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
405
- if numeric_scale:
406
- aggregates += (
407
- f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38, {numeric_scale})) AS "SUM_{column}"'
408
- )
409
- else:
410
- aggregates += f', CAST(SUM({column_identifier}) AS DECIMAL(38)) AS "SUM_{column}"'
411
-
412
- elif (
413
- column_datatype.lower() in self.snowflake_datatype_mapping["string"]
414
- or column_datatype.lower() in self.snowflake_datatype_mapping["date_and_time"]
415
- ):
416
- aggregates += f', COUNT(DISTINCT LOWER({column_identifier})) AS "COUNTDISTINCT_{column}"'
417
-
418
- elif column_datatype.lower() in self.snowflake_datatype_mapping["binary"]:
419
- aggregates += f', COUNT(DISTINCT LOWER(TRY_TO_NUMBER({column_identifier}::VARCHAR))) AS "COUNTDISTINCT_{column}"'
420
-
421
- elif column_datatype.lower() in self.snowflake_datatype_mapping["boolean"]:
422
- aggregates += f", MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = true)::VARCHAR || '_' || MAX(SELECT COUNT(*) FROM {object.database}.{object.schema}.{object.name} WHERE {column_identifier} = false) :: VARCHAR AS \"AGGREGATEBOOLEAN_{column}\""
423
-
424
-
425
- # else: Additional Data Types: VARIANT OBJECT ARRAY GEOGRAPHY
426
-
427
- query_checksums = (
428
- f"SELECT {aggregates[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
494
+ ## get checksum query
495
+ query_checksums = self._get_checksum_statement(
496
+ object=object,
497
+ column_intersections=column_intersections,
498
+ where_clause=where_clause,
499
+ exclude_columns=exclude_columns,
500
+ numeric_scale=numeric_scale,
501
+ enclose_column_by_double_quotes=enclose_column_by_double_quotes
429
502
  )
430
503
 
431
- query_countnulls = (
432
- f"SELECT {count_nulls[1:]} FROM {object.database}.{object.schema}.{object.name} {where_clause};"
504
+ ## get countnulls query
505
+ query_countnulls = self._get_countnulls_statement(
506
+ object=object,
507
+ column_intersections=column_intersections,
508
+ where_clause=where_clause,
509
+ exclude_columns=exclude_columns,
510
+ enclose_column_by_double_quotes=enclose_column_by_double_quotes
433
511
  )
434
512
 
435
513
  error_list = []
436
514
  test_list = []
437
515
  aggregation_results = {}
516
+ countnulls_results = {}
438
517
 
439
518
  try:
440
519
  checksums_results = self.execute_queries([query_checksums, query_countnulls])
441
-
442
520
  aggregation_results = checksums_results[0][0]
443
-
444
521
  countnulls_results = checksums_results[1][0]
522
+ except Exception as err:
523
+ err_msg = ["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]]
524
+
525
+ if 'Arithmetic overflow' in err_msg[2]:
526
+ # re-calculate queries with bool_cast_before_sum=True in case of error
527
+ query_checksums = self.create_checksum_statement(
528
+ object=object,
529
+ column_intersections=column_intersections,
530
+ where_clause=where_clause,
531
+ exclude_columns=exclude_columns,
532
+ numeric_scale=numeric_scale,
533
+ enclose_column_by_double_quotes=enclose_column_by_double_quotes,
534
+ bool_cast_before_sum=True
535
+ )
536
+ try:
537
+ # if overflow then try again with cast before sum for booleans
538
+ checksums_results = self.execute_queries([query_checksums, query_countnulls])
539
+ aggregation_results = checksums_results[0][0]
540
+ countnulls_results = checksums_results[1][0]
541
+ except Exception as err:
542
+ # handle error if it still occurs
543
+ err_msg = ["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]]
544
+ error_list.append(err_msg)
545
+ else:
546
+ # handle error if it is not an overflow
547
+ error_list.append(err_msg)
548
+ checksums_results = None
549
+
550
+ # if an error occurred before, this will be skipped as aggregation_results would be empty
551
+ for i in range(0, len(aggregation_results)):
552
+ if list(aggregation_results.values())[i] is None:
553
+ agg_result = 0
554
+ else:
555
+ agg_result = list(aggregation_results.values())[i]
445
556
 
446
- for i in range(0, len(aggregation_results)):
447
- if list(aggregation_results.values())[i] is None:
448
- agg_result = 0
449
- else:
450
- agg_result = list(aggregation_results.values())[i]
451
-
452
- if list(countnulls_results.values())[i] is None:
453
- cnt_result = 0
454
- else:
455
- cnt_result = list(countnulls_results.values())[i]
557
+ if list(countnulls_results.values())[i] is None:
558
+ cnt_result = 0
559
+ else:
560
+ cnt_result = list(countnulls_results.values())[i]
456
561
 
457
- test_list.append(
458
- [[item.split("_", 1)[0] for item in list(aggregation_results.keys())][i], agg_result, cnt_result]
459
- )
562
+ test_list.append(
563
+ [[item.split("_", 1)[0] for item in list(aggregation_results.keys())][i], agg_result, cnt_result]
564
+ )
460
565
 
461
- except Exception as err:
462
- error_list.append(["ERROR", str(err).split("|||")[0], str(err).split("|||")[1]])
463
566
 
464
567
  checksums = dict(zip([item.split("_", 1)[1] for item in aggregation_results.keys()], test_list))
465
568
  checksums["TESTATM_ERRORS"] = error_list
@@ -542,7 +645,7 @@ class SnowflakeService:
542
645
 
543
646
  if column_datatype.lower() in self.snowflake_datatype_mapping["numeric"]:
544
647
  if numeric_scale:
545
- aggregates_min += f', CAST(ROUND(MIN({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MIN_{column}", CAST(ROUND(max({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MAX_{column}"'
648
+ aggregates_min += f', CAST(ROUND(MIN({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MIN_{column}", CAST(ROUND(MAX({column_identifier}),{numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "MAX_{column}"'
546
649
  aggregates += f', CAST(ROUND(SUM({column_identifier}), {numeric_scale}) AS DECIMAL(38,{numeric_scale})) AS "SUM_{column}"'
547
650
  else:
548
651
  aggregates_min += f', MIN({column_identifier}) AS "MIN_{column}", MAX({column_identifier}) AS "MAX_{column}"'