icsDataValidation 1.0.428-py3-none-any.whl → 1.0.438-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
Files changed (23)
  1. icsDataValidation/connection_setups/sqlserver_connection_setup.py +4 -3
  2. icsDataValidation/input_parameters/testing_tool_params.py +0 -1
  3. icsDataValidation/main.py +3 -4
  4. icsDataValidation/services/database_services/snowflake_service.py +170 -65
  5. icsDataValidation/services/database_services/sqlserver_service.py +196 -88
  6. {icsdatavalidation-1.0.428.dist-info → icsdatavalidation-1.0.438.dist-info}/METADATA +1 -1
  7. {icsdatavalidation-1.0.428.dist-info → icsdatavalidation-1.0.438.dist-info}/RECORD +23 -9
  8. {icsdatavalidation-1.0.428.dist-info → icsdatavalidation-1.0.438.dist-info}/WHEEL +1 -1
  9. {icsdatavalidation-1.0.428.dist-info → icsdatavalidation-1.0.438.dist-info}/top_level.txt +1 -0
  10. tests/snowflake_service/test_create_checksums.py +146 -0
  11. tests/snowflake_service/test_create_pandas_df_from_group_by.py +485 -0
  12. tests/snowflake_service/test_create_pandas_df_from_sample.py +444 -0
  13. tests/snowflake_service/test_get_checksum_statement.py +243 -0
  14. tests/snowflake_service/test_get_column_clause.py +305 -0
  15. tests/snowflake_service/test_get_countnulls_statement.py +128 -0
  16. tests/snowflake_service/test_get_in_clause.py +66 -0
  17. tests/sqlserver_service/test_create_checksums.py +153 -0
  18. tests/sqlserver_service/test_create_pandas_df_from_group_by.py +427 -0
  19. tests/sqlserver_service/test_create_pandas_df_from_sample.py +286 -0
  20. tests/sqlserver_service/test_get_checksum_statement.py +160 -0
  21. tests/sqlserver_service/test_get_column_clause.py +182 -0
  22. tests/sqlserver_service/test_get_countnulls_statement.py +121 -0
  23. tests/sqlserver_service/test_get_in_clause.py +87 -0
tests/snowflake_service/test_create_pandas_df_from_group_by.py
@@ -0,0 +1,485 @@
+ from unittest.mock import MagicMock, patch
+
+ import pandas as pd
+ import pytest
+
+ from icsDataValidation.core.database_objects import DatabaseObject, DatabaseObjectType
+ from icsDataValidation.services.database_services.snowflake_service import SnowflakeService
+
+
+ @pytest.fixture
+ def snowflake_service():
+     """Create a SnowflakeService instance with mocked connection."""
+     mock_params = MagicMock()
+     service = SnowflakeService(mock_params)
+     service.snowflake_connection = MagicMock()
+     return service
+
+
+ @pytest.fixture
+ def mock_database_object():
+     """Create a mock DatabaseObject."""
+     obj = DatabaseObject(
+         object_identifier="TestDB.dbo.TestTable",
+         object_type=DatabaseObjectType.TABLE
+     )
+     return obj
+
+
+ class TestCreatePandasDfFromGroupByParametrized:
+     """Parametrized tests for create_pandas_df_from_group_by method."""
+
+     @pytest.mark.parametrize(
+         "column_intersections,group_by_columns,group_by_aggregation_columns," \
+         "group_by_aggregation_type,only_numeric,where_clause,exclude_columns," \
+         "numeric_scale,enclose_quotes,mock_datatypes," \
+         "expected_group_by_cols,expected_in_agg_string,expected_not_in_agg_string," \
+         "expected_grouping_cols_final",
+         [
+             ( # single grouping column, no double quotes
+                 ['region', 'amount'],
+                 ['region'],
+                 ['amount'],
+                 'various',
+                 False,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [{"COLUMN_NAME": "amount", "DATA_TYPE": "number"}],
+                 " region ",
+                 ["SUM(amount)"],
+                 [],
+                 ['region']
+             ),
+             ( # single grouping column, with double quotes
+                 ['region', 'amount'],
+                 ['region'],
+                 ['amount'],
+                 'various',
+                 False,
+                 '',
+                 [],
+                 None,
+                 True,
+                 [{"COLUMN_NAME": "amount", "DATA_TYPE": "number"}],
+                 ' "region" ',
+                 ['SUM("amount")'],
+                 [],
+                 ['region']
+             ),
+             ( # multiple grouping columns, no double quotes
+                 ['region', 'department', 'amount'],
+                 ['region', 'department'],
+                 ['amount'],
+                 'various',
+                 False,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [{"COLUMN_NAME": "amount", "DATA_TYPE": "number"}],
+                 " region ,department ",
+                 ["SUM(amount)"],
+                 [],
+                 ['region', 'department']
+             ),
+             ( # grouping column excluded
+                 ['region', 'department', 'amount'],
+                 ['region', 'department'],
+                 ['amount'],
+                 'various',
+                 False,
+                 '',
+                 ['department'],
+                 None,
+                 False,
+                 [{"COLUMN_NAME": "amount", "DATA_TYPE": "number"}],
+                 " region ",
+                 ["SUM(amount)"],
+                 [],
+                 ['region']
+             ),
+             ( # grouping column not in intersections
+                 ['amount'],
+                 ['region'],
+                 ['amount'],
+                 'various',
+                 False,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "number"}],
+                 "",
+                 [],
+                 [],
+                 []
+             ),
+             ( # only_min_max type, numeric columns
+                 ['region', 'amount', 'price'],
+                 ['region'],
+                 ['amount', 'price'],
+                 'only_min_max',
+                 True,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [
+                     {"COLUMN_NAME": "amount", "DATA_TYPE": "number"},
+                     {"COLUMN_NAME": "price", "DATA_TYPE": "float"}
+                 ],
+                 " region ",
+                 ["MIN(amount)", "MAX(amount)", "MIN(price)", "MAX(price)"],
+                 ["SUM(", "COUNTDISTINCT"],
+                 ['region']
+             ),
+             ( # only_min_max with numeric_scale
+                 ['region', 'AMOUNT'],
+                 ['region'],
+                 ['AMOUNT'],
+                 'only_min_max',
+                 True,
+                 '',
+                 [],
+                 2,
+                 False,
+                 [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "float"}],
+                 " region ",
+                 ["CAST(ROUND(MIN(AMOUNT),2) AS DECIMAL(38,2))", "CAST(ROUND(MAX(AMOUNT),2) AS DECIMAL(38,2))"],
+                 [],
+                 ['region']
+             ),
+             ( # only_min_max with numeric_scale and double quotes
+                 ['region', 'amount'],
+                 ['region'],
+                 ['amount'],
+                 'only_min_max',
+                 True,
+                 '',
+                 [],
+                 2,
+                 True,
+                 [{"COLUMN_NAME": "amount", "DATA_TYPE": "float"}],
+                 ' "region" ',
+                 ['CAST(ROUND(MIN("amount"),2) AS DECIMAL(38,2))', 'CAST(ROUND(MAX("amount"),2) AS DECIMAL(38,2))'],
+                 [],
+                 ['region']
+             ),
+             ( # various type, numeric only
+                 ['REGION', 'AMOUNT'],
+                 ['REGION'],
+                 ['AMOUNT'],
+                 'various',
+                 True,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "number"}],
+                 " REGION ",
+                 ["SUM(AMOUNT)"],
+                 ["MIN(", "MAX("],
+                 ['REGION']
+             ),
+             ( # various type with string columns
+                 ['region', 'AMOUNT', 'DESCRIPTION'],
+                 ['region'],
+                 ['AMOUNT', 'DESCRIPTION'],
+                 'various',
+                 False,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [
+                     {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "number"},
+                     {"COLUMN_NAME": "DESCRIPTION", "DATA_TYPE": "text"}
+                 ],
+                 " region ",
+                 ["SUM(AMOUNT)", "COUNT(DISTINCT LOWER(DESCRIPTION))", '"COUNTDISTINCT_DESCRIPTION"'],
+                 [],
+                 ['region']
+             ),
+             ( # various type with string columns and double quotes
+                 ['REGION', 'amount', 'DEsCRIPTION'],
+                 ['REGION'],
+                 ['amount', 'DEsCRIPTION'],
+                 'various',
+                 False,
+                 '',
+                 [],
+                 None,
+                 True,
+                 [
+                     {"COLUMN_NAME": "amount", "DATA_TYPE": "number"},
+                     {"COLUMN_NAME": "DEsCRIPTION", "DATA_TYPE": "text"}
+                 ],
+                 ' "REGION" ',
+                 ['SUM("amount")', 'COUNT(DISTINCT LOWER("DEsCRIPTION"))', '"COUNTDISTINCT_DEsCRIPTION"'],
+                 [],
+                 ['REGION']
+             ),
+             ( # various type with boolean columns
+                 ['REGION', 'AMOUNT', 'IS_ACTIVE'],
+                 ['REGION'],
+                 ['AMOUNT', 'IS_ACTIVE'],
+                 'various',
+                 False,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [
+                     {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "number"},
+                     {"COLUMN_NAME": "IS_ACTIVE", "DATA_TYPE": "boolean"}
+                 ],
+                 " REGION ",
+                 ["SUM(AMOUNT)", "AGGREGATEBOOLEAN_IS_ACTIVE", "COUNT(CASE WHEN IS_ACTIVE = true THEN 1 ELSE NULL END)"],
+                 [],
+                 ['REGION']
+             ),
+             ( # various type with binary columns
+                 ['REGION', 'BINARY_DATA'],
+                 ['REGION'],
+                 ['BINARY_DATA'],
+                 'various',
+                 False,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [{"COLUMN_NAME": "BINARY_DATA", "DATA_TYPE": "binary"}],
+                 " REGION ",
+                 ["COUNT(DISTINCT LOWER(TRY_TO_NUMBER(BINARY_DATA::VARCHAR)))", "COUNTDISTINCT_BINARY_DATA"],
+                 [],
+                 ['REGION']
+             ),
+             ( # various type with datetime columns
+                 ['REGION', 'CREATED_DATE'],
+                 ['REGION'],
+                 ['CREATED_DATE'],
+                 'various',
+                 False,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [{"COLUMN_NAME": "CREATED_DATE", "DATA_TYPE": "timestamp_ntz"}],
+                 " REGION ",
+                 ["COUNT(DISTINCT LOWER(CREATED_DATE))", "COUNTDISTINCT_CREATED_DATE"],
+                 [],
+                 ['REGION']
+             ),
+             ( # various_and_min_max type
+                 ['REGION', 'AMOUNT', 'DESCRIPTION'],
+                 ['REGION'],
+                 ['AMOUNT', 'DESCRIPTION'],
+                 'various_and_min_max',
+                 False,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [
+                     {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "number"},
+                     {"COLUMN_NAME": "DESCRIPTION", "DATA_TYPE": "text"}
+                 ],
+                 " REGION ",
+                 ["MIN(AMOUNT)", "MAX(AMOUNT)", "SUM(AMOUNT)", "COUNT(DISTINCT LOWER(DESCRIPTION))"],
+                 ["SUM(REGION)", "MAX(REGION)", "MIN(REGION)"],
+                 ['REGION']
+             ),
+             ( # aggregation columns 'all'
+                 ['REGION', 'AMOUNT', 'PRICE'],
+                 ['REGION'],
+                 ['all'],
+                 'various',
+                 True,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [
+                     {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "number"},
+                     {"COLUMN_NAME": "PRICE", "DATA_TYPE": "float"}
+                 ],
+                 " REGION ",
+                 ["SUM(AMOUNT)", "SUM(PRICE)"],
+                 ["SUM(REGION)"],
+                 ['REGION']
+             ),
+             ( # aggregation with exclude_columns
+                 ['REGION', 'AMOUNT', 'PRICE'],
+                 ['REGION'],
+                 ['AMOUNT', 'PRICE'],
+                 'various',
+                 True,
+                 '',
+                 ['PRICE'],
+                 None,
+                 False,
+                 [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "number"}],
+                 " REGION ",
+                 ["SUM(AMOUNT)"],
+                 ["SUM(PRICE)"],
+                 ['REGION']
+             ),
+             ( # aggregation with exclude_columns and double quotes
+                 ['REGION', 'AMO/NT', 'prIce'],
+                 ['REGION'],
+                 ['AMO/NT', 'prIce'],
+                 'various',
+                 True,
+                 '',
+                 ['prIce'],
+                 None,
+                 True,
+                 [{"COLUMN_NAME": "AMO/NT", "DATA_TYPE": "number"}],
+                 ' "REGION" ',
+                 ['SUM("AMO/NT")'],
+                 ["prIce"],
+                 ['REGION']
+             ),
+             ( # empty aggregation string - no matching columns
+                 ['region'],
+                 ['region'],
+                 ['amount'],
+                 'various',
+                 True,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [],
+                 " region ",
+                 [],
+                 [],
+                 ['region']
+             ),
+             ( # numeric_scale with various type
+                 ['REGION', 'AMOUNT'],
+                 ['REGION'],
+                 ['AMOUNT'],
+                 'various',
+                 True,
+                 '',
+                 [],
+                 3,
+                 False,
+                 [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "float"}],
+                 " REGION ",
+                 ["CAST(ROUND(SUM(AMOUNT), 3) AS DECIMAL(38,3))"],
+                 [],
+                 ['REGION']
+             ),
+             ( # mixed datatype aggregations
+                 ['REGION', 'AMOUNT', 'PRICE', 'NAME', 'IS_ACTIVE', 'CREATED_DATE'],
+                 ['REGION'],
+                 ['AMOUNT', 'PRICE', 'NAME', 'IS_ACTIVE', 'CREATED_DATE'],
+                 'various',
+                 False,
+                 '',
+                 [],
+                 2,
+                 False,
+                 [
+                     {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "number"},
+                     {"COLUMN_NAME": "PRICE", "DATA_TYPE": "float"},
+                     {"COLUMN_NAME": "NAME", "DATA_TYPE": "text"},
+                     {"COLUMN_NAME": "IS_ACTIVE", "DATA_TYPE": "boolean"},
+                     {"COLUMN_NAME": "CREATED_DATE", "DATA_TYPE": "timestamp_ntz"}
+                 ],
+                 " REGION ",
+                 [
+                     "CAST(ROUND(SUM(AMOUNT), 2) AS DECIMAL(38,2))",
+                     "CAST(ROUND(SUM(PRICE), 2) AS DECIMAL(38,2))",
+                     "COUNT(DISTINCT LOWER(NAME))",
+                     "AGGREGATEBOOLEAN_IS_ACTIVE",
+                     "COUNT(DISTINCT LOWER(CREATED_DATE))"
+                 ],
+                 [],
+                 ['REGION']
+             ),
+             ( # only_numeric flag excludes string aggregations
+                 ['REGION', 'AMOUNT', 'NAME', 'IS_ACTIVE'],
+                 ['REGION'],
+                 ['AMOUNT', 'NAME', 'IS_ACTIVE'],
+                 'various',
+                 True,
+                 '',
+                 [],
+                 None,
+                 False,
+                 [
+                     {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "number"},
+                     {"COLUMN_NAME": "NAME", "DATA_TYPE": "text"},
+                     {"COLUMN_NAME": "IS_ACTIVE", "DATA_TYPE": "boolean"}
+                 ],
+                 " REGION ",
+                 ["SUM(AMOUNT)"],
+                 ["COUNTDISTINCT", "AGGREGATEBOOLEAN"],
+                 ['REGION']
+             ),
+             ( # special character column names with double quotes
+                 ['region', '/ISDFPS/amount'],
+                 ['region'],
+                 ['/ISDFPS/amount'],
+                 'various',
+                 True,
+                 '',
+                 [],
+                 None,
+                 True,
+                 [{"COLUMN_NAME": "/ISDFPS/amount", "DATA_TYPE": "number"}],
+                 ' "region" ',
+                 ['"/ISDFPS/amount"'],
+                 [],
+                 ['region']
+             ),
+         ],
+     )
+     def test_create_pandas_df_from_group_by(
+         self, snowflake_service, mock_database_object,
+         column_intersections, group_by_columns, group_by_aggregation_columns,
+         group_by_aggregation_type, only_numeric, where_clause, exclude_columns,
+         numeric_scale, enclose_quotes, mock_datatypes,
+         expected_group_by_cols, expected_in_agg_string, expected_not_in_agg_string,
+         expected_grouping_cols_final
+     ):
+         """Test create_pandas_df_from_group_by with various configurations."""
+         with patch.object(snowflake_service, 'get_data_types_from_object') as mock_get_datatypes, \
+              patch.object(snowflake_service, 'execute_queries') as mock_execute:
+
+             mock_get_datatypes.return_value = mock_datatypes
+             mock_execute.return_value = pd.DataFrame()
+
+             result = snowflake_service.create_pandas_df_from_group_by(
+                 object=mock_database_object,
+                 column_intersections=column_intersections,
+                 group_by_columns=group_by_columns,
+                 group_by_aggregation_columns=group_by_aggregation_columns,
+                 group_by_aggregation_type=group_by_aggregation_type,
+                 only_numeric=only_numeric,
+                 where_clause=where_clause,
+                 exclude_columns=exclude_columns,
+                 numeric_scale=numeric_scale,
+                 enclose_column_by_double_quotes=enclose_quotes
+             )
+
+             _, group_by_query_aggregation_string, group_by_query_columns_string, grouping_columns_final, _ = result
+
+             # Check group_by_query_columns_string
+             assert group_by_query_columns_string == expected_group_by_cols
+
+             # Check grouping_columns_final
+             assert grouping_columns_final == expected_grouping_cols_final
+
+             # Check expected strings in aggregation string
+             for expected in expected_in_agg_string:
+                 assert expected in group_by_query_aggregation_string
+
+             # Check strings that should NOT be in aggregation string
+             for expected in expected_not_in_agg_string:
+                 assert expected not in group_by_query_aggregation_string