icsDataValidation 1.0.430-py3-none-any.whl → 1.0.438-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. icsDataValidation/connection_setups/sqlserver_connection_setup.py +4 -3
  2. icsDataValidation/input_parameters/testing_tool_params.py +0 -1
  3. icsDataValidation/services/database_services/snowflake_service.py +170 -67
  4. icsDataValidation/services/database_services/sqlserver_service.py +196 -88
  5. {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.438.dist-info}/METADATA +1 -1
  6. {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.438.dist-info}/RECORD +22 -8
  7. {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.438.dist-info}/WHEEL +1 -1
  8. {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.438.dist-info}/top_level.txt +1 -0
  9. tests/snowflake_service/test_create_checksums.py +146 -0
  10. tests/snowflake_service/test_create_pandas_df_from_group_by.py +485 -0
  11. tests/snowflake_service/test_create_pandas_df_from_sample.py +444 -0
  12. tests/snowflake_service/test_get_checksum_statement.py +243 -0
  13. tests/snowflake_service/test_get_column_clause.py +305 -0
  14. tests/snowflake_service/test_get_countnulls_statement.py +128 -0
  15. tests/snowflake_service/test_get_in_clause.py +66 -0
  16. tests/sqlserver_service/test_create_checksums.py +153 -0
  17. tests/sqlserver_service/test_create_pandas_df_from_group_by.py +427 -0
  18. tests/sqlserver_service/test_create_pandas_df_from_sample.py +286 -0
  19. tests/sqlserver_service/test_get_checksum_statement.py +160 -0
  20. tests/sqlserver_service/test_get_column_clause.py +182 -0
  21. tests/sqlserver_service/test_get_countnulls_statement.py +121 -0
  22. tests/sqlserver_service/test_get_in_clause.py +87 -0
@@ -0,0 +1,427 @@
1
+ from unittest.mock import MagicMock, patch
2
+
3
+ import pandas as pd
4
+ import pytest
5
+
6
+ from icsDataValidation.core.database_objects import DatabaseObject, DatabaseObjectType
7
+ from icsDataValidation.services.database_services.sqlserver_service import SQLServerService
8
+
9
+
10
+ @pytest.fixture
11
+ def sqlserver_service():
12
+ """Create a SQLServerService instance with mocked connection."""
13
+ connection_params = {
14
+ 'Driver': 'ODBC Driver 18 for SQL Server',
15
+ 'Server': 'localhost',
16
+ 'Port': '1433',
17
+ 'Database': 'testdb',
18
+ 'User': 'sa',
19
+ 'Password': 'password',
20
+ 'Encrypt': True,
21
+ 'TrustServerCertificate': True
22
+ }
23
+ service = SQLServerService(connection_params=connection_params)
24
+ service.sqlserver_connection = MagicMock()
25
+ return service
26
+
27
+
28
+ @pytest.fixture
29
+ def mock_database_object():
30
+ """Create a mock DatabaseObject."""
31
+ obj = DatabaseObject(
32
+ object_identifier="TestDB.dbo.TestTable",
33
+ object_type=DatabaseObjectType.TABLE
34
+ )
35
+ return obj
36
+
37
+
38
+ class TestCreatePandasDfFromGroupByParametrized:
39
+ """Parametrized tests for create_pandas_df_from_group_by method."""
40
+
41
+ @pytest.mark.parametrize(
42
+ "column_intersections,group_by_columns,group_by_aggregation_columns," \
43
+ "group_by_aggregation_type,only_numeric,where_clause,exclude_columns," \
44
+ "numeric_scale,enclose_quotes,mock_datatypes," \
45
+ "expected_group_by_cols,expected_in_agg_string,expected_not_in_agg_string," \
46
+ "expected_grouping_cols_final",
47
+ [
48
+ ( # single grouping column, no double quotes
49
+ ['region', 'amount'],
50
+ ['region'],
51
+ ['amount'],
52
+ 'various',
53
+ False,
54
+ '',
55
+ [],
56
+ None,
57
+ False,
58
+ [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
59
+ " [region] ",
60
+ ["SUM([AMOUNT])"],
61
+ [],
62
+ ['region']
63
+ ),
64
+ ( # single grouping column, with double quotes (ignored/treated same in SQL Server impl)
65
+ ['region', 'amount'],
66
+ ['region'],
67
+ ['amount'],
68
+ 'various',
69
+ False,
70
+ '',
71
+ [],
72
+ None,
73
+ True,
74
+ [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
75
+ " [region] ",
76
+ ["SUM([AMOUNT])"],
77
+ [],
78
+ ['region']
79
+ ),
80
+ ( # multiple grouping columns
81
+ ['region', 'department', 'amount'],
82
+ ['region', 'department'],
83
+ ['amount'],
84
+ 'various',
85
+ False,
86
+ '',
87
+ [],
88
+ None,
89
+ False,
90
+ [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
91
+ " [region] ,[department] ",
92
+ ["SUM([AMOUNT])"],
93
+ [],
94
+ ['region', 'department']
95
+ ),
96
+ ( # grouping column excluded
97
+ ['region', 'department', 'amount'],
98
+ ['region', 'department'],
99
+ ['amount'],
100
+ 'various',
101
+ False,
102
+ '',
103
+ ['department'],
104
+ None,
105
+ False,
106
+ [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
107
+ " [region] ",
108
+ ["SUM([AMOUNT])"],
109
+ [],
110
+ ['region']
111
+ ),
112
+ ( # grouping column not in intersections
113
+ ['amount'],
114
+ ['region'],
115
+ ['amount'],
116
+ 'various',
117
+ False,
118
+ '',
119
+ [],
120
+ None,
121
+ False,
122
+ [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
123
+ "",
124
+ [],
125
+ [],
126
+ []
127
+ ),
128
+ ( # only_min_max type, numeric columns
129
+ ['region', 'amount', 'price'],
130
+ ['region'],
131
+ ['amount', 'price'],
132
+ 'only_min_max',
133
+ True,
134
+ '',
135
+ [],
136
+ None,
137
+ False,
138
+ [
139
+ {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
140
+ {"COLUMN_NAME": "PRICE", "DATA_TYPE": "decimal"}
141
+ ],
142
+ " [region] ",
143
+ ["MIN([AMOUNT])", "MAX([AMOUNT])", "MIN([PRICE])", "MAX([PRICE])"],
144
+ ["SUM(", "COUNTDISTINCT"],
145
+ ['region']
146
+ ),
147
+ ( # only_min_max with numeric_scale
148
+ ['region', 'AMOUNT'],
149
+ ['region'],
150
+ ['AMOUNT'],
151
+ 'only_min_max',
152
+ True,
153
+ '',
154
+ [],
155
+ 2,
156
+ False,
157
+ [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "decimal"}],
158
+ " [region] ",
159
+ ["CAST(ROUND(MIN([AMOUNT]),2) AS DECIMAL(38,2))", "CAST(ROUND(MAX([AMOUNT]),2) AS DECIMAL(38,2))"],
160
+ [],
161
+ ['region']
162
+ ),
163
+ ( # various type, numeric only
164
+ ['REGION', 'AMOUNT'],
165
+ ['REGION'],
166
+ ['AMOUNT'],
167
+ 'various',
168
+ True,
169
+ '',
170
+ [],
171
+ None,
172
+ False,
173
+ [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
174
+ " [REGION] ",
175
+ ["SUM([AMOUNT])"],
176
+ ["MIN(", "MAX("],
177
+ ['REGION']
178
+ ),
179
+ ( # various type with string columns
180
+ ['region', 'AMOUNT', 'DESCRIPTION'],
181
+ ['region'],
182
+ ['AMOUNT', 'DESCRIPTION'],
183
+ 'various',
184
+ False,
185
+ '',
186
+ [],
187
+ None,
188
+ False,
189
+ [
190
+ {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
191
+ {"COLUMN_NAME": "DESCRIPTION", "DATA_TYPE": "varchar"}
192
+ ],
193
+ " [region] ",
194
+ ["SUM([AMOUNT])", "COUNT(DISTINCT LOWER([DESCRIPTION]))", "[COUNTDISTINCT_DESCRIPTION]"],
195
+ [],
196
+ ['region']
197
+ ),
198
+ ( # various type with boolean columns
199
+ ['REGION', 'AMOUNT', 'IS_ACTIVE'],
200
+ ['REGION'],
201
+ ['AMOUNT', 'IS_ACTIVE'],
202
+ 'various',
203
+ False,
204
+ '',
205
+ [],
206
+ None,
207
+ False,
208
+ [
209
+ {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
210
+ {"COLUMN_NAME": "IS_ACTIVE", "DATA_TYPE": "bit"}
211
+ ],
212
+ " [REGION] ",
213
+ ["SUM([AMOUNT])", "AGGREGATEBOOLEAN_IS_ACTIVE", "CASE WHEN [IS_ACTIVE] = 1"],
214
+ [],
215
+ ['REGION']
216
+ ),
217
+ ( # various type with binary columns
218
+ ['REGION', 'BINARY_DATA'],
219
+ ['REGION'],
220
+ ['BINARY_DATA'],
221
+ 'various',
222
+ False,
223
+ '',
224
+ [],
225
+ None,
226
+ False,
227
+ [{"COLUMN_NAME": "BINARY_DATA", "DATA_TYPE": "varbinary"}],
228
+ " [REGION] ",
229
+ ["COUNT(DISTINCT LOWER(TRY_CONVERT(VARCHAR,[BINARY_DATA])))", "COUNTDISTINCT_BINARY_DATA"],
230
+ [],
231
+ ['REGION']
232
+ ),
233
+ ( # various type with datetime columns
234
+ ['REGION', 'CREATED_DATE'],
235
+ ['REGION'],
236
+ ['CREATED_DATE'],
237
+ 'various',
238
+ False,
239
+ '',
240
+ [],
241
+ None,
242
+ False,
243
+ [{"COLUMN_NAME": "CREATED_DATE", "DATA_TYPE": "datetime"}],
244
+ " [REGION] ",
245
+ ["COUNT(DISTINCT LOWER([CREATED_DATE]))", "COUNTDISTINCT_CREATED_DATE"],
246
+ [],
247
+ ['REGION']
248
+ ),
249
+ ( # various_and_min_max type
250
+ ['REGION', 'AMOUNT', 'DESCRIPTION'],
251
+ ['REGION'],
252
+ ['AMOUNT', 'DESCRIPTION'],
253
+ 'various_and_min_max',
254
+ False,
255
+ '',
256
+ [],
257
+ None,
258
+ False,
259
+ [
260
+ {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
261
+ {"COLUMN_NAME": "DESCRIPTION", "DATA_TYPE": "varchar"}
262
+ ],
263
+ " [REGION] ",
264
+ ["MIN([AMOUNT])", "MAX([AMOUNT])", "SUM([AMOUNT])", "COUNT(DISTINCT LOWER([DESCRIPTION]))"],
265
+ [],
266
+ ['REGION']
267
+ ),
268
+ ( # aggregation columns 'all'
269
+ ['REGION', 'AMOUNT', 'PRICE'],
270
+ ['REGION'],
271
+ ['all'],
272
+ 'various',
273
+ True,
274
+ '',
275
+ [],
276
+ None,
277
+ False,
278
+ [
279
+ {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
280
+ {"COLUMN_NAME": "PRICE", "DATA_TYPE": "decimal"}
281
+ ],
282
+ " [REGION] ",
283
+ ["SUM([AMOUNT])", "SUM([PRICE])"],
284
+ ["SUM([REGION])"],
285
+ ['REGION']
286
+ ),
287
+ ( # aggregation with exclude_columns
288
+ ['REGION', 'AMOUNT', 'PRICE'],
289
+ ['REGION'],
290
+ ['AMOUNT', 'PRICE'],
291
+ 'various',
292
+ True,
293
+ '',
294
+ ['PRICE'],
295
+ None,
296
+ False,
297
+ [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
298
+ " [REGION] ",
299
+ ["SUM([AMOUNT])"],
300
+ ["SUM([PRICE])"],
301
+ ['REGION']
302
+ ),
303
+ ( # numeric_scale with various type
304
+ ['REGION', 'AMOUNT'],
305
+ ['REGION'],
306
+ ['AMOUNT'],
307
+ 'various',
308
+ True,
309
+ '',
310
+ [],
311
+ 3,
312
+ False,
313
+ [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "decimal"}],
314
+ " [REGION] ",
315
+ ["CAST(ROUND(SUM([AMOUNT]), 3) AS DECIMAL(38,3))"],
316
+ [],
317
+ ['REGION']
318
+ ),
319
+ ( # mixed datatype aggregations
320
+ ['REGION', 'AMOUNT', 'PRICE', 'NAME', 'IS_ACTIVE', 'CREATED_DATE'],
321
+ ['REGION'],
322
+ ['AMOUNT', 'PRICE', 'NAME', 'IS_ACTIVE', 'CREATED_DATE'],
323
+ 'various',
324
+ False,
325
+ '',
326
+ [],
327
+ 2,
328
+ False,
329
+ [
330
+ {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
331
+ {"COLUMN_NAME": "PRICE", "DATA_TYPE": "decimal"},
332
+ {"COLUMN_NAME": "NAME", "DATA_TYPE": "varchar"},
333
+ {"COLUMN_NAME": "IS_ACTIVE", "DATA_TYPE": "bit"},
334
+ {"COLUMN_NAME": "CREATED_DATE", "DATA_TYPE": "datetime"}
335
+ ],
336
+ " [REGION] ",
337
+ [
338
+ "CAST(ROUND(SUM([AMOUNT]), 2) AS DECIMAL(38,2))",
339
+ "CAST(ROUND(SUM([PRICE]), 2) AS DECIMAL(38,2))",
340
+ "COUNT(DISTINCT LOWER([NAME]))",
341
+ "AGGREGATEBOOLEAN_IS_ACTIVE",
342
+ "COUNT(DISTINCT LOWER([CREATED_DATE]))"
343
+ ],
344
+ [],
345
+ ['REGION']
346
+ ),
347
+ ( # only_numeric flag excludes string aggregations
348
+ ['REGION', 'AMOUNT', 'NAME', 'IS_ACTIVE'],
349
+ ['REGION'],
350
+ ['AMOUNT', 'NAME', 'IS_ACTIVE'],
351
+ 'various',
352
+ True,
353
+ '',
354
+ [],
355
+ None,
356
+ False,
357
+ [
358
+ {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
359
+ {"COLUMN_NAME": "NAME", "DATA_TYPE": "varchar"},
360
+ {"COLUMN_NAME": "IS_ACTIVE", "DATA_TYPE": "bit"}
361
+ ],
362
+ " [REGION] ",
363
+ ["SUM([AMOUNT])"],
364
+ ["COUNTDISTINCT", "AGGREGATEBOOLEAN"],
365
+ ['REGION']
366
+ ),
367
+ ( # special character column names
368
+ ['region', '/ISDFPS/amount'],
369
+ ['region'],
370
+ ['/ISDFPS/amount'],
371
+ 'various',
372
+ True,
373
+ '',
374
+ [],
375
+ None,
376
+ True,
377
+ [{"COLUMN_NAME": "/ISDFPS/AMOUNT", "DATA_TYPE": "int"}],
378
+ " [region] ",
379
+ ["[/ISDFPS/AMOUNT]"],
380
+ [],
381
+ ['region']
382
+ ),
383
+ ],
384
+ )
385
+ def test_create_pandas_df_from_group_by(
386
+ self, sqlserver_service, mock_database_object,
387
+ column_intersections, group_by_columns, group_by_aggregation_columns,
388
+ group_by_aggregation_type, only_numeric, where_clause, exclude_columns,
389
+ numeric_scale, enclose_quotes, mock_datatypes,
390
+ expected_group_by_cols, expected_in_agg_string, expected_not_in_agg_string,
391
+ expected_grouping_cols_final
392
+ ):
393
+ """Test create_pandas_df_from_group_by with various configurations."""
394
+ with patch.object(sqlserver_service, 'get_data_types_from_object') as mock_get_datatypes, \
395
+ patch.object(sqlserver_service, 'execute_queries') as mock_execute:
396
+
397
+ mock_get_datatypes.return_value = mock_datatypes
398
+ mock_execute.return_value = pd.DataFrame()
399
+
400
+ result = sqlserver_service.create_pandas_df_from_group_by(
401
+ object=mock_database_object,
402
+ column_intersections=column_intersections,
403
+ group_by_columns=group_by_columns,
404
+ group_by_aggregation_columns=group_by_aggregation_columns,
405
+ group_by_aggregation_type=group_by_aggregation_type,
406
+ only_numeric=only_numeric,
407
+ where_clause=where_clause,
408
+ exclude_columns=exclude_columns,
409
+ numeric_scale=numeric_scale,
410
+ enclose_column_by_double_quotes=enclose_quotes
411
+ )
412
+
413
+ _, group_by_query_aggregation_string, group_by_query_columns_string, grouping_columns_final, _ = result
414
+
415
+ # Check group_by_query_columns_string
416
+ assert group_by_query_columns_string == expected_group_by_cols
417
+
418
+ # Check grouping_columns_final
419
+ assert grouping_columns_final == expected_grouping_cols_final
420
+
421
+ # Check expected strings in aggregation string
422
+ for expected in expected_in_agg_string:
423
+ assert expected in group_by_query_aggregation_string
424
+
425
+ # Check strings that should NOT be in aggregation string
426
+ for expected in expected_not_in_agg_string:
427
+ assert expected not in group_by_query_aggregation_string