icsDataValidation 1.0.430__py3-none-any.whl → 1.0.438__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- icsDataValidation/connection_setups/sqlserver_connection_setup.py +4 -3
- icsDataValidation/input_parameters/testing_tool_params.py +0 -1
- icsDataValidation/services/database_services/snowflake_service.py +170 -67
- icsDataValidation/services/database_services/sqlserver_service.py +196 -88
- {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.438.dist-info}/METADATA +1 -1
- {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.438.dist-info}/RECORD +22 -8
- {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.438.dist-info}/WHEEL +1 -1
- {icsdatavalidation-1.0.430.dist-info → icsdatavalidation-1.0.438.dist-info}/top_level.txt +1 -0
- tests/snowflake_service/test_create_checksums.py +146 -0
- tests/snowflake_service/test_create_pandas_df_from_group_by.py +485 -0
- tests/snowflake_service/test_create_pandas_df_from_sample.py +444 -0
- tests/snowflake_service/test_get_checksum_statement.py +243 -0
- tests/snowflake_service/test_get_column_clause.py +305 -0
- tests/snowflake_service/test_get_countnulls_statement.py +128 -0
- tests/snowflake_service/test_get_in_clause.py +66 -0
- tests/sqlserver_service/test_create_checksums.py +153 -0
- tests/sqlserver_service/test_create_pandas_df_from_group_by.py +427 -0
- tests/sqlserver_service/test_create_pandas_df_from_sample.py +286 -0
- tests/sqlserver_service/test_get_checksum_statement.py +160 -0
- tests/sqlserver_service/test_get_column_clause.py +182 -0
- tests/sqlserver_service/test_get_countnulls_statement.py +121 -0
- tests/sqlserver_service/test_get_in_clause.py +87 -0
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
from unittest.mock import MagicMock, patch
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from icsDataValidation.core.database_objects import DatabaseObject, DatabaseObjectType
|
|
7
|
+
from icsDataValidation.services.database_services.sqlserver_service import SQLServerService
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@pytest.fixture
def sqlserver_service():
    """Provide a SQLServerService whose connection attribute is a MagicMock.

    The service is constructed from placeholder connection parameters, after
    which its ``sqlserver_connection`` attribute is overwritten with a mock.
    """
    # Keyword form of the same mapping; insertion order of the keys is kept.
    placeholder_params = dict(
        Driver='ODBC Driver 18 for SQL Server',
        Server='localhost',
        Port='1433',
        Database='testdb',
        User='sa',
        Password='password',
        Encrypt=True,
        TrustServerCertificate=True,
    )
    svc = SQLServerService(connection_params=placeholder_params)
    svc.sqlserver_connection = MagicMock()
    return svc
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@pytest.fixture
def mock_database_object():
    """Return a TABLE-typed DatabaseObject identified as ``TestDB.dbo.TestTable``."""
    return DatabaseObject(
        object_identifier="TestDB.dbo.TestTable",
        object_type=DatabaseObjectType.TABLE,
    )
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class TestCreatePandasDfFromGroupByParametrized:
    """Table-driven tests for ``create_pandas_df_from_group_by``.

    Each case tuple supplies, in order: the ten inputs forwarded to the
    method (the last one being the mocked datatype rows), followed by the
    four expectations checked against the method's return value.
    """

    @pytest.mark.parametrize(
        (
            "column_intersections",
            "group_by_columns",
            "group_by_aggregation_columns",
            "group_by_aggregation_type",
            "only_numeric",
            "where_clause",
            "exclude_columns",
            "numeric_scale",
            "enclose_quotes",
            "mock_datatypes",
            "expected_group_by_cols",
            "expected_in_agg_string",
            "expected_not_in_agg_string",
            "expected_grouping_cols_final",
        ),
        [
            (
                # single grouping column, no double quotes
                ['region', 'amount'], ['region'], ['amount'],
                'various', False, '', [], None, False,
                [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
                " [region] ",
                ["SUM([AMOUNT])"],
                [],
                ['region'],
            ),
            (
                # single grouping column, with double quotes
                # (ignored/treated same in SQL Server impl)
                ['region', 'amount'], ['region'], ['amount'],
                'various', False, '', [], None, True,
                [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
                " [region] ",
                ["SUM([AMOUNT])"],
                [],
                ['region'],
            ),
            (
                # multiple grouping columns
                ['region', 'department', 'amount'], ['region', 'department'], ['amount'],
                'various', False, '', [], None, False,
                [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
                " [region] ,[department] ",
                ["SUM([AMOUNT])"],
                [],
                ['region', 'department'],
            ),
            (
                # grouping column excluded
                ['region', 'department', 'amount'], ['region', 'department'], ['amount'],
                'various', False, '', ['department'], None, False,
                [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
                " [region] ",
                ["SUM([AMOUNT])"],
                [],
                ['region'],
            ),
            (
                # grouping column not in intersections
                ['amount'], ['region'], ['amount'],
                'various', False, '', [], None, False,
                [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
                "",
                [],
                [],
                [],
            ),
            (
                # only_min_max type, numeric columns
                ['region', 'amount', 'price'], ['region'], ['amount', 'price'],
                'only_min_max', True, '', [], None, False,
                [
                    {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
                    {"COLUMN_NAME": "PRICE", "DATA_TYPE": "decimal"},
                ],
                " [region] ",
                ["MIN([AMOUNT])", "MAX([AMOUNT])", "MIN([PRICE])", "MAX([PRICE])"],
                ["SUM(", "COUNTDISTINCT"],
                ['region'],
            ),
            (
                # only_min_max with numeric_scale
                ['region', 'AMOUNT'], ['region'], ['AMOUNT'],
                'only_min_max', True, '', [], 2, False,
                [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "decimal"}],
                " [region] ",
                [
                    "CAST(ROUND(MIN([AMOUNT]),2) AS DECIMAL(38,2))",
                    "CAST(ROUND(MAX([AMOUNT]),2) AS DECIMAL(38,2))",
                ],
                [],
                ['region'],
            ),
            (
                # various type, numeric only
                ['REGION', 'AMOUNT'], ['REGION'], ['AMOUNT'],
                'various', True, '', [], None, False,
                [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
                " [REGION] ",
                ["SUM([AMOUNT])"],
                ["MIN(", "MAX("],
                ['REGION'],
            ),
            (
                # various type with string columns
                ['region', 'AMOUNT', 'DESCRIPTION'], ['region'], ['AMOUNT', 'DESCRIPTION'],
                'various', False, '', [], None, False,
                [
                    {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
                    {"COLUMN_NAME": "DESCRIPTION", "DATA_TYPE": "varchar"},
                ],
                " [region] ",
                [
                    "SUM([AMOUNT])",
                    "COUNT(DISTINCT LOWER([DESCRIPTION]))",
                    "[COUNTDISTINCT_DESCRIPTION]",
                ],
                [],
                ['region'],
            ),
            (
                # various type with boolean columns
                ['REGION', 'AMOUNT', 'IS_ACTIVE'], ['REGION'], ['AMOUNT', 'IS_ACTIVE'],
                'various', False, '', [], None, False,
                [
                    {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
                    {"COLUMN_NAME": "IS_ACTIVE", "DATA_TYPE": "bit"},
                ],
                " [REGION] ",
                [
                    "SUM([AMOUNT])",
                    "AGGREGATEBOOLEAN_IS_ACTIVE",
                    "CASE WHEN [IS_ACTIVE] = 1",
                ],
                [],
                ['REGION'],
            ),
            (
                # various type with binary columns
                ['REGION', 'BINARY_DATA'], ['REGION'], ['BINARY_DATA'],
                'various', False, '', [], None, False,
                [{"COLUMN_NAME": "BINARY_DATA", "DATA_TYPE": "varbinary"}],
                " [REGION] ",
                [
                    "COUNT(DISTINCT LOWER(TRY_CONVERT(VARCHAR,[BINARY_DATA])))",
                    "COUNTDISTINCT_BINARY_DATA",
                ],
                [],
                ['REGION'],
            ),
            (
                # various type with datetime columns
                ['REGION', 'CREATED_DATE'], ['REGION'], ['CREATED_DATE'],
                'various', False, '', [], None, False,
                [{"COLUMN_NAME": "CREATED_DATE", "DATA_TYPE": "datetime"}],
                " [REGION] ",
                [
                    "COUNT(DISTINCT LOWER([CREATED_DATE]))",
                    "COUNTDISTINCT_CREATED_DATE",
                ],
                [],
                ['REGION'],
            ),
            (
                # various_and_min_max type
                ['REGION', 'AMOUNT', 'DESCRIPTION'], ['REGION'], ['AMOUNT', 'DESCRIPTION'],
                'various_and_min_max', False, '', [], None, False,
                [
                    {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
                    {"COLUMN_NAME": "DESCRIPTION", "DATA_TYPE": "varchar"},
                ],
                " [REGION] ",
                [
                    "MIN([AMOUNT])",
                    "MAX([AMOUNT])",
                    "SUM([AMOUNT])",
                    "COUNT(DISTINCT LOWER([DESCRIPTION]))",
                ],
                [],
                ['REGION'],
            ),
            (
                # aggregation columns 'all'
                ['REGION', 'AMOUNT', 'PRICE'], ['REGION'], ['all'],
                'various', True, '', [], None, False,
                [
                    {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
                    {"COLUMN_NAME": "PRICE", "DATA_TYPE": "decimal"},
                ],
                " [REGION] ",
                ["SUM([AMOUNT])", "SUM([PRICE])"],
                ["SUM([REGION])"],
                ['REGION'],
            ),
            (
                # aggregation with exclude_columns
                ['REGION', 'AMOUNT', 'PRICE'], ['REGION'], ['AMOUNT', 'PRICE'],
                'various', True, '', ['PRICE'], None, False,
                [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"}],
                " [REGION] ",
                ["SUM([AMOUNT])"],
                ["SUM([PRICE])"],
                ['REGION'],
            ),
            (
                # numeric_scale with various type
                ['REGION', 'AMOUNT'], ['REGION'], ['AMOUNT'],
                'various', True, '', [], 3, False,
                [{"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "decimal"}],
                " [REGION] ",
                ["CAST(ROUND(SUM([AMOUNT]), 3) AS DECIMAL(38,3))"],
                [],
                ['REGION'],
            ),
            (
                # mixed datatype aggregations
                ['REGION', 'AMOUNT', 'PRICE', 'NAME', 'IS_ACTIVE', 'CREATED_DATE'],
                ['REGION'],
                ['AMOUNT', 'PRICE', 'NAME', 'IS_ACTIVE', 'CREATED_DATE'],
                'various', False, '', [], 2, False,
                [
                    {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
                    {"COLUMN_NAME": "PRICE", "DATA_TYPE": "decimal"},
                    {"COLUMN_NAME": "NAME", "DATA_TYPE": "varchar"},
                    {"COLUMN_NAME": "IS_ACTIVE", "DATA_TYPE": "bit"},
                    {"COLUMN_NAME": "CREATED_DATE", "DATA_TYPE": "datetime"},
                ],
                " [REGION] ",
                [
                    "CAST(ROUND(SUM([AMOUNT]), 2) AS DECIMAL(38,2))",
                    "CAST(ROUND(SUM([PRICE]), 2) AS DECIMAL(38,2))",
                    "COUNT(DISTINCT LOWER([NAME]))",
                    "AGGREGATEBOOLEAN_IS_ACTIVE",
                    "COUNT(DISTINCT LOWER([CREATED_DATE]))",
                ],
                [],
                ['REGION'],
            ),
            (
                # only_numeric flag excludes string aggregations
                ['REGION', 'AMOUNT', 'NAME', 'IS_ACTIVE'], ['REGION'], ['AMOUNT', 'NAME', 'IS_ACTIVE'],
                'various', True, '', [], None, False,
                [
                    {"COLUMN_NAME": "AMOUNT", "DATA_TYPE": "int"},
                    {"COLUMN_NAME": "NAME", "DATA_TYPE": "varchar"},
                    {"COLUMN_NAME": "IS_ACTIVE", "DATA_TYPE": "bit"},
                ],
                " [REGION] ",
                ["SUM([AMOUNT])"],
                ["COUNTDISTINCT", "AGGREGATEBOOLEAN"],
                ['REGION'],
            ),
            (
                # special character column names
                ['region', '/ISDFPS/amount'], ['region'], ['/ISDFPS/amount'],
                'various', True, '', [], None, True,
                [{"COLUMN_NAME": "/ISDFPS/AMOUNT", "DATA_TYPE": "int"}],
                " [region] ",
                ["[/ISDFPS/AMOUNT]"],
                [],
                ['region'],
            ),
        ],
    )
    def test_create_pandas_df_from_group_by(
        self, sqlserver_service, mock_database_object,
        column_intersections, group_by_columns, group_by_aggregation_columns,
        group_by_aggregation_type, only_numeric, where_clause, exclude_columns,
        numeric_scale, enclose_quotes, mock_datatypes,
        expected_group_by_cols, expected_in_agg_string, expected_not_in_agg_string,
        expected_grouping_cols_final
    ):
        """Run one parametrized case against ``create_pandas_df_from_group_by``.

        ``get_data_types_from_object`` and ``execute_queries`` are patched on
        the service instance; the former returns the case's ``mock_datatypes``
        and the latter an empty DataFrame. The method's result is unpacked as
        a five-element sequence, of which the second (aggregation string),
        third (group-by column string) and fourth (final grouping columns)
        elements are checked.
        """
        datatype_patch = patch.object(
            sqlserver_service, 'get_data_types_from_object',
            return_value=mock_datatypes,
        )
        execute_patch = patch.object(
            sqlserver_service, 'execute_queries',
            return_value=pd.DataFrame(),
        )

        with datatype_patch:
            with execute_patch:
                outcome = sqlserver_service.create_pandas_df_from_group_by(
                    object=mock_database_object,
                    column_intersections=column_intersections,
                    group_by_columns=group_by_columns,
                    group_by_aggregation_columns=group_by_aggregation_columns,
                    group_by_aggregation_type=group_by_aggregation_type,
                    only_numeric=only_numeric,
                    where_clause=where_clause,
                    exclude_columns=exclude_columns,
                    numeric_scale=numeric_scale,
                    enclose_column_by_double_quotes=enclose_quotes,
                )

        # First and last elements of the result are not inspected here.
        _, agg_sql, columns_sql, final_grouping, _ = outcome

        # Group-by column string and the surviving grouping columns must match exactly.
        assert columns_sql == expected_group_by_cols
        assert final_grouping == expected_grouping_cols_final

        # Every required fragment has to occur in the aggregation string.
        absent = [fragment for fragment in expected_in_agg_string if fragment not in agg_sql]
        assert not absent

        # No forbidden fragment may occur in the aggregation string.
        unwanted = [fragment for fragment in expected_not_in_agg_string if fragment in agg_sql]
        assert not unwanted
|