snowpark-checkpoints-validators 0.2.0rc1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. snowflake/snowpark_checkpoints/__init__.py +44 -0
  2. snowflake/snowpark_checkpoints/__version__.py +16 -0
  3. snowflake/snowpark_checkpoints/checkpoint.py +580 -0
  4. snowflake/snowpark_checkpoints/errors.py +60 -0
  5. snowflake/snowpark_checkpoints/io_utils/__init__.py +26 -0
  6. snowflake/snowpark_checkpoints/io_utils/io_default_strategy.py +57 -0
  7. snowflake/snowpark_checkpoints/io_utils/io_env_strategy.py +133 -0
  8. snowflake/snowpark_checkpoints/io_utils/io_file_manager.py +76 -0
  9. snowflake/snowpark_checkpoints/job_context.py +128 -0
  10. snowflake/snowpark_checkpoints/singleton.py +23 -0
  11. snowflake/snowpark_checkpoints/snowpark_sampler.py +124 -0
  12. snowflake/snowpark_checkpoints/spark_migration.py +255 -0
  13. snowflake/snowpark_checkpoints/utils/__init__.py +14 -0
  14. snowflake/snowpark_checkpoints/utils/constants.py +134 -0
  15. snowflake/snowpark_checkpoints/utils/extra_config.py +132 -0
  16. snowflake/snowpark_checkpoints/utils/logging_utils.py +67 -0
  17. snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +399 -0
  18. snowflake/snowpark_checkpoints/utils/supported_types.py +65 -0
  19. snowflake/snowpark_checkpoints/utils/telemetry.py +939 -0
  20. snowflake/snowpark_checkpoints/utils/utils_checks.py +398 -0
  21. snowflake/snowpark_checkpoints/validation_result_metadata.py +159 -0
  22. snowflake/snowpark_checkpoints/validation_results.py +49 -0
  23. snowpark_checkpoints_validators-0.3.0.dist-info/METADATA +325 -0
  24. snowpark_checkpoints_validators-0.3.0.dist-info/RECORD +26 -0
  25. snowpark_checkpoints_validators-0.2.0rc1.dist-info/METADATA +0 -514
  26. snowpark_checkpoints_validators-0.2.0rc1.dist-info/RECORD +0 -4
  27. {snowpark_checkpoints_validators-0.2.0rc1.dist-info → snowpark_checkpoints_validators-0.3.0.dist-info}/WHEEL +0 -0
  28. {snowpark_checkpoints_validators-0.2.0rc1.dist-info → snowpark_checkpoints_validators-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,399 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
import logging

from datetime import datetime
from typing import Any, Optional

from pandera import Check, DataFrameSchema

from snowflake.snowpark_checkpoints.utils.constants import (
    COLUMNS_KEY,
    DECIMAL_PRECISION_KEY,
    DEFAULT_DATE_FORMAT,
    FALSE_COUNT_KEY,
    FORMAT_KEY,
    MARGIN_ERROR_KEY,
    MAX_KEY,
    MEAN_KEY,
    MIN_KEY,
    NAME_KEY,
    NULL_COUNT_KEY,
    NULLABLE_KEY,
    ROWS_COUNT_KEY,
    SKIP_ALL,
    TRUE_COUNT_KEY,
    TYPE_KEY,
)
from snowflake.snowpark_checkpoints.utils.supported_types import (
    BooleanTypes,
    NumericTypes,
)
45
+
46
+
47
+ LOGGER = logging.getLogger(__name__)
48
+
49
+
50
class PanderaCheckManager:
    """Augment a Pandera ``DataFrameSchema`` with statistical checks for a checkpoint.

    The manager mutates the wrapped schema in place: each ``_add_*`` helper
    appends ``Check`` objects to the corresponding column's check list, and the
    public methods return the (same) schema object for convenient chaining.
    """

    def __init__(self, checkpoint_name: str, schema: DataFrameSchema):
        # Checkpoint name is used only for logging and error messages.
        self.checkpoint_name = checkpoint_name
        # Schema is mutated in place by the methods below.
        self.schema = schema

    def _add_numeric_checks(self, col: str, additional_check: dict[str, Any]):
        """Add numeric checks to a specified column in the schema.

        This method adds two types of checks to the specified column:
        1. A "mean" check that ensures the mean of the column values is within
           the specified margin of error.
        2. An optional "decimal_precision" check that ensures the number of
           decimal places in the column values does not exceed the recorded
           precision.

        Args:
            col (str): The name of the column to which the checks will be added.
            additional_check (dict[str, Any]): A dictionary containing:
                - MEAN_KEY: The expected mean value for the column (default 0).
                - MARGIN_ERROR_KEY: The acceptable margin of error (default 0).
                - DECIMAL_PRECISION_KEY (optional): The maximum number of
                  decimal places allowed for the column values.

        """
        mean = additional_check.get(MEAN_KEY, 0)
        std = additional_check.get(MARGIN_ERROR_KEY, 0)

        def check_mean(series):
            series_mean = series.mean()
            return mean - std <= series_mean <= mean + std

        self.schema.columns[col].checks.append(
            Check(check_mean, element_wise=False, name="mean")
        )

        if DECIMAL_PRECISION_KEY in additional_check:
            # Hoist the limit so the closure does not re-read the dict per call.
            max_precision = additional_check[DECIMAL_PRECISION_KEY]

            def check_decimal_precision(series):
                # Count digits after the decimal point; integer-looking values count as 0.
                return (
                    series.apply(
                        lambda x: len(str(x).split(".")[1]) if "." in str(x) else 0
                    )
                    <= max_precision
                )

            self.schema.columns[col].checks.append(
                Check(check_decimal_precision, name="decimal_precision")
            )

    def _add_boolean_checks(self, col: str, additional_check: dict[str, Any]):
        """Add boolean checks to the schema for a specified column.

        Calculates the expected percentage of True and False values in the
        column and appends checks ensuring the observed percentages do not fall
        below the expected values minus the margin of error.

        Args:
            col (str): The name of the column to which the checks will be added.
            additional_check (dict[str, Any]): A dictionary containing:
                - TRUE_COUNT_KEY: The count of True values in the column.
                - FALSE_COUNT_KEY: The count of False values in the column.
                - ROWS_COUNT_KEY: The total number of rows in the column.
                - MARGIN_ERROR_KEY: The acceptable margin of error.

        Returns:
            None

        """
        count_of_true = additional_check.get(TRUE_COUNT_KEY, 0)
        count_of_false = additional_check.get(FALSE_COUNT_KEY, 0)
        rows_count = additional_check.get(ROWS_COUNT_KEY, 0)
        std = additional_check.get(MARGIN_ERROR_KEY, 0)
        # Guard: a missing/zero row count would raise ZeroDivisionError;
        # treat it as "no expectation" (0%) instead of crashing.
        percentage_true = count_of_true / rows_count if rows_count else 0.0
        percentage_false = count_of_false / rows_count if rows_count else 0.0

        # NOTE(review): only a lower bound is enforced on each ratio —
        # presumably intentional since true% and false% constrain each other;
        # confirm before tightening. The empty-series branch compares
        # 1 <= expected + std, i.e. it passes only when the expectation
        # already covers 100%.
        self.schema.columns[col].checks.extend(
            [
                Check(
                    lambda series: (
                        percentage_true - std
                        <= series.value_counts().get(True, 0) / series.count()
                        if series.count() > 0
                        else 1 <= percentage_true + std
                    ),
                ),
                Check(
                    lambda series: (
                        percentage_false - std
                        <= series.value_counts().get(False, 0) / series.count()
                        if series.count() > 0
                        else 1 <= percentage_false + std
                    ),
                ),
            ]
        )

    def _add_null_checks(self, col: str, additional_check: dict[str, Any]):
        """Add null checks to the schema for a specified column.

        Calculates the expected percentage of null values in the column and
        appends a check ensuring the observed null ratio does not fall below
        the expected value minus the margin of error.

        Args:
            col (str): The name of the column to add null checks for.
            additional_check (dict[str, Any]): A dictionary containing:
                - NULL_COUNT_KEY: The count of null values (default 0).
                - ROWS_COUNT_KEY: The total number of rows (default 0).
                - MARGIN_ERROR_KEY: The acceptable margin of error (default 0).

        """
        count_of_null = additional_check.get(NULL_COUNT_KEY, 0)
        rows_count = additional_check.get(ROWS_COUNT_KEY, 0)
        std = additional_check.get(MARGIN_ERROR_KEY, 0)
        # Guard: a missing/zero row count would raise ZeroDivisionError;
        # treat it as "no expectation" (0%) instead of crashing.
        percentage_null = count_of_null / rows_count if rows_count else 0.0

        self.schema.columns[col].checks.append(
            Check(
                lambda series: (
                    percentage_null - std <= series.isnull().sum() / series.count()
                    if series.count() > 0
                    else 1 <= percentage_null + std
                ),
            ),
        )

    def _add_date_time_checks(self, col: str, additional_check: dict[str, Any]):
        """Add datetime range checks to a specified column in the schema.

        Args:
            col (str): The name of the column to which the checks will be applied.
            additional_check (dict[str, Any]): A dictionary containing:
                - FORMAT_KEY: The strptime format string (default DEFAULT_DATE_FORMAT).
                - MIN_KEY (optional): The minimum datetime value, as a string.
                - MAX_KEY (optional): The maximum datetime value, as a string.

        Depending on which bounds are present:
        - Both min and max: an inclusive between check is added.
        - Only min: a greater_than_or_equal_to check is added.
        - Only max: a less_than_or_equal_to check is added.

        """
        date_format = additional_check.get(FORMAT_KEY, DEFAULT_DATE_FORMAT)

        min_str = additional_check.get(MIN_KEY, None)
        min_date = datetime.strptime(min_str, date_format) if min_str else None

        max_str = additional_check.get(MAX_KEY, None)
        max_date = datetime.strptime(max_str, date_format) if max_str else None

        if min_date and max_date:
            self.schema.columns[col].checks.append(
                Check.between(
                    min_date,
                    max_date,
                    include_max=True,
                    include_min=True,
                )
            )
        elif min_date:
            self.schema.columns[col].checks.append(
                Check.greater_than_or_equal_to(min_date)
            )
        elif max_date:
            self.schema.columns[col].checks.append(
                Check.less_than_or_equal_to(max_date)
            )

    def _add_date_checks(self, col: str, additional_check: dict[str, Any]):
        """Add date range checks to a specified column in the schema.

        Identical to :meth:`_add_date_time_checks` except the parsed bounds are
        reduced to ``date`` objects (time component dropped).

        Args:
            col (str): The name of the column to which the checks will be applied.
            additional_check (dict[str, Any]): A dictionary containing:
                - FORMAT_KEY: The strptime format string (default DEFAULT_DATE_FORMAT).
                - MIN_KEY (optional): The minimum date value, as a string.
                - MAX_KEY (optional): The maximum date value, as a string.

        Depending on which bounds are present:
        - Both min and max: an inclusive between check is added.
        - Only min: a greater_than_or_equal_to check is added.
        - Only max: a less_than_or_equal_to check is added.

        """
        date_format = additional_check.get(FORMAT_KEY, DEFAULT_DATE_FORMAT)

        min_str = additional_check.get(MIN_KEY, None)
        min_date = datetime.strptime(min_str, date_format).date() if min_str else None

        max_str = additional_check.get(MAX_KEY, None)
        max_date = datetime.strptime(max_str, date_format).date() if max_str else None

        if min_date and max_date:
            self.schema.columns[col].checks.append(
                Check.between(
                    min_date,
                    max_date,
                    include_max=True,
                    include_min=True,
                )
            )
        elif min_date:
            self.schema.columns[col].checks.append(
                Check.greater_than_or_equal_to(min_date)
            )
        elif max_date:
            self.schema.columns[col].checks.append(
                Check.less_than_or_equal_to(max_date)
            )

    def proccess_checks(self, custom_data: dict) -> DataFrameSchema:
        """Process the checks defined in custom_data and apply them to the schema.

        Note: the misspelled name is preserved for backward compatibility;
        prefer :meth:`process_checks`.

        Args:
            custom_data (dict): A dictionary containing the custom checks to be
                applied. The COLUMNS_KEY entry maps to a list of column check
                definitions, each of which includes:
                - TYPE_KEY: The type of the column (e.g., numeric, boolean, date, datetime).
                - NAME_KEY: The name of the column.
                - NULLABLE_KEY: A boolean indicating if the column is nullable.

        Returns:
            DataFrameSchema: The updated schema with the applied checks.

        Raises:
            ValueError: If the column name or type is not defined in the schema.

        """
        LOGGER.info("Adding checks for the checkpoint '%s'", self.checkpoint_name)

        # Default to an empty list so a payload without COLUMNS_KEY is a no-op
        # instead of a TypeError when iterating None.
        for additional_check in custom_data.get(COLUMNS_KEY, []):
            name = additional_check.get(NAME_KEY, None)
            if name is None:
                raise ValueError(
                    f"Column name not defined in the schema {self.checkpoint_name}"
                )

            col_type = additional_check.get(TYPE_KEY, None)
            if col_type is None:
                raise ValueError(f"Type not defined for column {name}")

            if self.schema.columns.get(name) is None:
                LOGGER.warning(
                    "Column '%s' was not found in the Pandera schema. Skipping checks for this column.",
                    name,
                )
                continue

            LOGGER.debug("Adding checks for column '%s' of type '%s'", name, col_type)

            if col_type in NumericTypes:
                self._add_numeric_checks(name, additional_check)

            elif col_type in BooleanTypes:
                self._add_boolean_checks(name, additional_check)

            elif col_type == "date":
                self._add_date_checks(name, additional_check)

            elif col_type == "datetime":
                self._add_date_time_checks(name, additional_check)

            is_nullable = additional_check.get(NULLABLE_KEY, False)
            if is_nullable:
                LOGGER.debug("Column '%s' is nullable. Adding null checks.", name)
                self._add_null_checks(name, additional_check)

        return self.schema

    def process_checks(self, custom_data: dict) -> DataFrameSchema:
        """Correctly spelled alias for :meth:`proccess_checks`."""
        return self.proccess_checks(custom_data)

    def skip_checks_on_schema(
        self,
        skip_checks: Optional[dict[str, list[str]]] = None,
    ) -> DataFrameSchema:
        """Modify the schema by skipping specified checks on columns.

        Args:
            skip_checks (Optional[dict[str, list[str]]]): A dictionary where
                keys are column names and values are lists of check names to
                skip. If the special SKIP_ALL entry is present in the list for
                a column, all checks for that column are skipped. If None, no
                checks are skipped.

        Returns:
            DataFrameSchema: The modified schema with specified checks skipped.

        """
        if not skip_checks:
            return self.schema

        for col, checks_to_skip in skip_checks.items():

            if col in self.schema.columns:

                if SKIP_ALL in checks_to_skip:
                    LOGGER.info(
                        "Skipping all checks for column '%s' in checkpoint '%s'",
                        col,
                        self.checkpoint_name,
                    )
                    # Column checks are a list; the original assigned {} here.
                    self.schema.columns[col].checks = []
                else:
                    LOGGER.info(
                        "Skipping checks %s for column '%s' in checkpoint '%s'",
                        checks_to_skip,
                        col,
                        self.checkpoint_name,
                    )
                    self.schema.columns[col].checks = [
                        check
                        for check in self.schema.columns[col].checks
                        if check.name not in checks_to_skip
                    ]

        return self.schema

    def add_custom_checks(
        self,
        custom_checks: Optional[dict[str, list[Check]]] = None,
    ):
        """Add custom checks to the wrapped Pandera DataFrameSchema.

        Args:
            custom_checks (Optional[dict[str, list[Check]]]): A dictionary
                where keys are column names and values are lists of checks to
                add for those columns.

        Returns:
            DataFrameSchema: The schema with the custom checks appended.

        Raises:
            ValueError: If a given column is not present in the schema.

        """
        if not custom_checks:
            return self.schema

        for col, checks in custom_checks.items():

            if col in self.schema.columns:
                LOGGER.info(
                    "Adding %s custom checks to column '%s' in checkpoint '%s'",
                    len(checks),
                    col,
                    self.checkpoint_name,
                )
                col_schema = self.schema.columns[col]
                col_schema.checks.extend(checks)
            else:
                raise ValueError(f"Column {col} not found in schema")

        return self.schema
@@ -0,0 +1,65 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from snowflake.snowpark_checkpoints.utils.constants import (
17
+ BINARY_TYPE,
18
+ BOOLEAN_TYPE,
19
+ BYTE_TYPE,
20
+ DATE_TYPE,
21
+ DECIMAL_TYPE,
22
+ DOUBLE_TYPE,
23
+ FLOAT_TYPE,
24
+ INTEGER_TYPE,
25
+ LONG_TYPE,
26
+ SHORT_TYPE,
27
+ STRING_TYPE,
28
+ TIMESTAMP_NTZ_TYPE,
29
+ TIMESTAMP_TYPE,
30
+ )
31
+
32
+
33
# Snowpark type-name groupings used when deciding which Pandera checks apply
# to a column (see PanderaCheckManager).

NumericTypes = [
    BYTE_TYPE,
    SHORT_TYPE,
    INTEGER_TYPE,
    LONG_TYPE,
    FLOAT_TYPE,
    DOUBLE_TYPE,
    DECIMAL_TYPE,
]

# Textual columns.
StringTypes = [STRING_TYPE]

# Raw binary columns.
BinaryTypes = [BINARY_TYPE]

# Boolean columns.
BooleanTypes = [BOOLEAN_TYPE]

# Date-like columns, including timestamps with and without time zone.
DateTypes = [DATE_TYPE, TIMESTAMP_TYPE, TIMESTAMP_NTZ_TYPE]

# Every type the validators understand — the concatenation of the groups
# above, preserving their order.
SupportedTypes = NumericTypes + StringTypes + BinaryTypes + BooleanTypes + DateTypes