snowpark-checkpoints-validators 0.1.3__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/PKG-INFO +16 -4
  2. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/README.md +11 -0
  3. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/pyproject.toml +5 -3
  4. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/__init__.py +11 -1
  5. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/__version__.py +1 -1
  6. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/checkpoint.py +195 -97
  7. snowpark_checkpoints_validators-0.2.0/src/snowflake/snowpark_checkpoints/job_context.py +128 -0
  8. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/snowpark_sampler.py +26 -1
  9. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/spark_migration.py +39 -6
  10. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/extra_config.py +10 -5
  11. snowpark_checkpoints_validators-0.2.0/src/snowflake/snowpark_checkpoints/utils/logging_utils.py +67 -0
  12. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +48 -7
  13. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/utils_checks.py +23 -2
  14. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/validation_result_metadata.py +30 -0
  15. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/test_pandera.py +47 -18
  16. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/test_parquet.py +84 -25
  17. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/test_spark_checkpoint.py +40 -21
  18. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_extra_config.py +7 -1
  19. snowpark_checkpoints_validators-0.2.0/test/unit/test_job_context.py +49 -0
  20. snowpark_checkpoints_validators-0.2.0/test/unit/test_logger.py +134 -0
  21. snowpark_checkpoints_validators-0.2.0/test/unit/test_logging_utils.py +132 -0
  22. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_validation_result_metadata.py +40 -0
  23. snowpark_checkpoints_validators-0.1.3/src/snowflake/snowpark_checkpoints/job_context.py +0 -85
  24. snowpark_checkpoints_validators-0.1.3/src/snowflake/snowpark_checkpoints/utils/checkpoint_logger.py +0 -52
  25. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/.gitignore +0 -0
  26. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/CHANGELOG.md +0 -0
  27. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/LICENSE +0 -0
  28. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/errors.py +0 -0
  29. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/singleton.py +0 -0
  30. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/__init__.py +0 -0
  31. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/constants.py +0 -0
  32. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/supported_types.py +0 -0
  33. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/telemetry.py +0 -0
  34. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/validation_results.py +0 -0
  35. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/.coveragerc +0 -0
  36. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/e2eexample.py +0 -0
  37. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_compare_utils.py +0 -0
  38. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/df_mode_dataframe_mismatch_telemetry.json +0 -0
  39. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/df_mode_dataframe_telemetry.json +0 -0
  40. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +0 -0
  41. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +0 -0
  42. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +0 -0
  43. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +0 -0
  44. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_scalar_fail_telemetry.json +0 -0
  45. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_scalar_passing_telemetry.json +0 -0
  46. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_df_check_custom_check_telemetry.json +0 -0
  47. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_df_check_fail_telemetry.json +0 -0
  48. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_df_check_from_file_telemetry.json +0 -0
  49. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_df_check_skip_check_telemetry.json +0 -0
  50. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_df_check_telemetry.json +0 -0
  51. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_input_fail_telemetry.json +0 -0
  52. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_input_telemetry.json +0 -0
  53. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_output_fail_telemetry.json +0 -0
  54. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_output_telemetry.json +0 -0
  55. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_pandera_check_manager.py +0 -0
  56. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_spark_migration.py +0 -0
  57. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_telemetry.py +0 -0
  58. {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_utils_checks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: snowpark-checkpoints-validators
3
- Version: 0.1.3
3
+ Version: 0.2.0
4
4
  Summary: Migration tools for Snowpark
5
5
  Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
6
6
  Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -26,11 +26,9 @@ Classifier: Topic :: Software Development :: Libraries
26
26
  Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
27
27
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
28
  Requires-Python: <3.12,>=3.9
29
- Requires-Dist: pandera-report==0.1.2
30
29
  Requires-Dist: pandera[io]==0.20.4
31
- Requires-Dist: pyspark
32
30
  Requires-Dist: snowflake-connector-python[pandas]
33
- Requires-Dist: snowflake-snowpark-python==1.26.0
31
+ Requires-Dist: snowflake-snowpark-python>=1.23.0
34
32
  Provides-Extra: development
35
33
  Requires-Dist: coverage>=7.6.7; extra == 'development'
36
34
  Requires-Dist: deepdiff==8.1.1; extra == 'development'
@@ -38,10 +36,13 @@ Requires-Dist: deepdiff>=8.0.0; extra == 'development'
38
36
  Requires-Dist: hatchling==1.25.0; extra == 'development'
39
37
  Requires-Dist: pre-commit>=4.0.1; extra == 'development'
40
38
  Requires-Dist: pyarrow>=18.0.0; extra == 'development'
39
+ Requires-Dist: pyspark>=3.5.0; extra == 'development'
41
40
  Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
42
41
  Requires-Dist: pytest>=8.3.3; extra == 'development'
43
42
  Requires-Dist: setuptools>=70.0.0; extra == 'development'
44
43
  Requires-Dist: twine==5.1.1; extra == 'development'
44
+ Provides-Extra: pyspark
45
+ Requires-Dist: pyspark>=3.5.0; extra == 'pyspark'
45
46
  Description-Content-Type: text/markdown
46
47
 
47
48
  # snowpark-checkpoints-validators
@@ -52,6 +53,17 @@ Description-Content-Type: text/markdown
52
53
 
53
54
  **snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
54
55
 
56
+ ---
57
+ ## Install the library
58
+ ```bash
59
+ pip install snowpark-checkpoints-validators
60
+ ```
61
+ This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
62
+ ```bash
63
+ pip install "snowpark-checkpoints-validators[pyspark]"
64
+ ```
65
+ ---
66
+
55
67
  ## Features
56
68
 
57
69
  - Validate Snowpark DataFrames against predefined Pandera schemas.
@@ -6,6 +6,17 @@
6
6
 
7
7
  **snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
8
8
 
9
+ ---
10
+ ## Install the library
11
+ ```bash
12
+ pip install snowpark-checkpoints-validators
13
+ ```
14
+ This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
15
+ ```bash
16
+ pip install "snowpark-checkpoints-validators[pyspark]"
17
+ ```
18
+ ---
19
+
9
20
  ## Features
10
21
 
11
22
  - Validate Snowpark DataFrames against predefined Pandera schemas.
@@ -26,11 +26,9 @@ classifiers = [
26
26
  "Topic :: Scientific/Engineering :: Information Analysis",
27
27
  ]
28
28
  dependencies = [
29
- "snowflake-snowpark-python==1.26.0",
29
+ "snowflake-snowpark-python>=1.23.0",
30
30
  "snowflake-connector-python[pandas]",
31
- "pyspark",
32
31
  "pandera[io]==0.20.4",
33
- "pandera-report==0.1.2",
34
32
  ]
35
33
  description = "Migration tools for Snowpark"
36
34
  dynamic = ['version']
@@ -48,6 +46,9 @@ readme = "README.md"
48
46
  requires-python = '>=3.9,<3.12'
49
47
 
50
48
  [project.optional-dependencies]
49
+ pyspark = [
50
+ "pyspark>=3.5.0",
51
+ ]
51
52
  development = [
52
53
  "deepdiff==8.1.1",
53
54
  "pytest>=8.3.3",
@@ -59,6 +60,7 @@ development = [
59
60
  "setuptools>=70.0.0",
60
61
  "pyarrow>=18.0.0",
61
62
  "deepdiff>=8.0.0",
63
+ "pyspark>=3.5.0",
62
64
  ]
63
65
 
64
66
  [project.urls]
@@ -13,16 +13,26 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
+ import logging
17
+
18
+
19
+ # Add a NullHandler to prevent logging messages from being output to
20
+ # sys.stderr if no logging configuration is provided.
21
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
22
+
23
+ # ruff: noqa: E402
24
+
16
25
  from snowflake.snowpark_checkpoints.checkpoint import (
17
26
  check_dataframe_schema,
18
- check_output_schema,
19
27
  check_input_schema,
28
+ check_output_schema,
20
29
  validate_dataframe_checkpoint,
21
30
  )
22
31
  from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
23
32
  from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
24
33
  from snowflake.snowpark_checkpoints.utils.constants import CheckpointMode
25
34
 
35
+
26
36
  __all__ = [
27
37
  "check_with_spark",
28
38
  "SnowparkJobContext",
@@ -13,4 +13,4 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- __version__ = "0.1.3"
16
+ __version__ = "0.2.0"
@@ -14,11 +14,14 @@
14
14
  # limitations under the License.
15
15
 
16
16
  # Wrapper around pandera which logs to snowflake
17
- from typing import Any, Optional, Union
17
+
18
+ import logging
19
+
20
+ from typing import Any, Optional, Union, cast
18
21
 
19
22
  from pandas import DataFrame as PandasDataFrame
20
- from pandera import Check, DataFrameSchema
21
- from pandera_report import DataFrameValidator
23
+ from pandera import Check, DataFrameModel, DataFrameSchema
24
+ from pandera.errors import SchemaError, SchemaErrors
22
25
 
23
26
  from snowflake.snowpark import DataFrame as SnowparkDataFrame
24
27
  from snowflake.snowpark_checkpoints.errors import SchemaValidationError
@@ -27,13 +30,13 @@ from snowflake.snowpark_checkpoints.snowpark_sampler import (
27
30
  SamplingAdapter,
28
31
  SamplingStrategy,
29
32
  )
30
- from snowflake.snowpark_checkpoints.utils.checkpoint_logger import CheckpointLogger
31
33
  from snowflake.snowpark_checkpoints.utils.constants import (
32
34
  FAIL_STATUS,
33
35
  PASS_STATUS,
34
36
  CheckpointMode,
35
37
  )
36
38
  from snowflake.snowpark_checkpoints.utils.extra_config import is_checkpoint_enabled
39
+ from snowflake.snowpark_checkpoints.utils.logging_utils import log
37
40
  from snowflake.snowpark_checkpoints.utils.pandera_check_manager import (
38
41
  PanderaCheckManager,
39
42
  )
@@ -47,6 +50,10 @@ from snowflake.snowpark_checkpoints.utils.utils_checks import (
47
50
  )
48
51
 
49
52
 
53
+ LOGGER = logging.getLogger(__name__)
54
+
55
+
56
+ @log
50
57
  def validate_dataframe_checkpoint(
51
58
  df: SnowparkDataFrame,
52
59
  checkpoint_name: str,
@@ -84,31 +91,45 @@ def validate_dataframe_checkpoint(
84
91
  """
85
92
  checkpoint_name = _replace_special_characters(checkpoint_name)
86
93
 
87
- if is_checkpoint_enabled(checkpoint_name):
88
-
89
- if mode == CheckpointMode.SCHEMA:
90
- return _check_dataframe_schema_file(
91
- df,
92
- checkpoint_name,
93
- job_context,
94
- custom_checks,
95
- skip_checks,
96
- sample_frac,
97
- sample_number,
98
- sampling_strategy,
99
- output_path,
100
- )
101
- elif mode == CheckpointMode.DATAFRAME:
102
- if job_context is None:
103
- raise ValueError(
104
- "Connectionless mode is not supported for Parquet validation"
105
- )
106
- _check_compare_data(df, job_context, checkpoint_name, output_path)
107
- else:
94
+ if not is_checkpoint_enabled(checkpoint_name):
95
+ LOGGER.warning(
96
+ "Checkpoint '%s' is disabled. Skipping DataFrame checkpoint validation.",
97
+ checkpoint_name,
98
+ )
99
+ return None
100
+
101
+ LOGGER.info(
102
+ "Starting DataFrame checkpoint validation for checkpoint '%s'", checkpoint_name
103
+ )
104
+
105
+ if mode == CheckpointMode.SCHEMA:
106
+ result = _check_dataframe_schema_file(
107
+ df,
108
+ checkpoint_name,
109
+ job_context,
110
+ custom_checks,
111
+ skip_checks,
112
+ sample_frac,
113
+ sample_number,
114
+ sampling_strategy,
115
+ output_path,
116
+ )
117
+ return result
118
+
119
+ if mode == CheckpointMode.DATAFRAME:
120
+ if job_context is None:
108
121
  raise ValueError(
109
- """Invalid validation mode.
110
- Please use for schema validation use a 1 or for a full data validation use a 2 for schema validation."""
122
+ "No job context provided. Please provide one when using DataFrame mode validation."
111
123
  )
124
+ _check_compare_data(df, job_context, checkpoint_name, output_path)
125
+ return None
126
+
127
+ raise ValueError(
128
+ (
129
+ "Invalid validation mode. "
130
+ "Please use 1 for schema validation or 2 for full data validation."
131
+ ),
132
+ )
112
133
 
113
134
 
114
135
  def _check_dataframe_schema_file(
@@ -156,7 +177,7 @@ def _check_dataframe_schema_file(
156
177
 
157
178
  schema = _generate_schema(checkpoint_name, output_path)
158
179
 
159
- return check_dataframe_schema(
180
+ return _check_dataframe_schema(
160
181
  df,
161
182
  schema,
162
183
  checkpoint_name,
@@ -170,6 +191,7 @@ def _check_dataframe_schema_file(
170
191
  )
171
192
 
172
193
 
194
+ @log
173
195
  def check_dataframe_schema(
174
196
  df: SnowparkDataFrame,
175
197
  pandera_schema: DataFrameSchema,
@@ -212,6 +234,9 @@ def check_dataframe_schema(
212
234
 
213
235
  """
214
236
  checkpoint_name = _replace_special_characters(checkpoint_name)
237
+ LOGGER.info(
238
+ "Starting DataFrame schema validation for checkpoint '%s'", checkpoint_name
239
+ )
215
240
 
216
241
  if df is None:
217
242
  raise ValueError("DataFrame is required")
@@ -219,19 +244,25 @@ def check_dataframe_schema(
219
244
  if pandera_schema is None:
220
245
  raise ValueError("Schema is required")
221
246
 
222
- if is_checkpoint_enabled(checkpoint_name):
223
- return _check_dataframe_schema(
224
- df,
225
- pandera_schema,
247
+ if not is_checkpoint_enabled(checkpoint_name):
248
+ LOGGER.warning(
249
+ "Checkpoint '%s' is disabled. Skipping DataFrame schema validation.",
226
250
  checkpoint_name,
227
- job_context,
228
- custom_checks,
229
- skip_checks,
230
- sample_frac,
231
- sample_number,
232
- sampling_strategy,
233
- output_path,
234
251
  )
252
+ return None
253
+
254
+ return _check_dataframe_schema(
255
+ df,
256
+ pandera_schema,
257
+ checkpoint_name,
258
+ job_context,
259
+ custom_checks,
260
+ skip_checks,
261
+ sample_frac,
262
+ sample_number,
263
+ sampling_strategy,
264
+ output_path,
265
+ )
235
266
 
236
267
 
237
268
  @report_telemetry(
@@ -259,17 +290,24 @@ def _check_dataframe_schema(
259
290
  pandera_schema_upper, sample_df = _process_sampling(
260
291
  df, pandera_schema, job_context, sample_frac, sample_number, sampling_strategy
261
292
  )
262
-
263
- # Raises SchemaError on validation issues
264
- validator = DataFrameValidator()
265
- is_valid, validation_result = validator.validate(
266
- pandera_schema_upper, sample_df, validity_flag=True
267
- )
293
+ is_valid, validation_result = _validate(pandera_schema_upper, sample_df)
268
294
  if is_valid:
295
+ LOGGER.info(
296
+ "DataFrame schema validation passed for checkpoint '%s'",
297
+ checkpoint_name,
298
+ )
269
299
  if job_context is not None:
270
300
  job_context._mark_pass(checkpoint_name)
301
+ else:
302
+ LOGGER.warning(
303
+ "No job context provided. Skipping result recording into Snowflake.",
304
+ )
271
305
  _update_validation_result(checkpoint_name, PASS_STATUS, output_path)
272
306
  else:
307
+ LOGGER.error(
308
+ "DataFrame schema validation failed for checkpoint '%s'",
309
+ checkpoint_name,
310
+ )
273
311
  _update_validation_result(checkpoint_name, FAIL_STATUS, output_path)
274
312
  raise SchemaValidationError(
275
313
  "Snowpark DataFrame schema validation error",
@@ -282,6 +320,7 @@ def _check_dataframe_schema(
282
320
 
283
321
 
284
322
  @report_telemetry(params_list=["pandera_schema"])
323
+ @log
285
324
  def check_output_schema(
286
325
  pandera_schema: DataFrameSchema,
287
326
  checkpoint_name: str,
@@ -318,11 +357,8 @@ def check_output_schema(
318
357
  function: The decorated function.
319
358
 
320
359
  """
321
- _checkpoint_name = checkpoint_name
322
- if checkpoint_name is None:
323
- _checkpoint_name = snowpark_fn.__name__
324
- _checkpoint_name = _replace_special_characters(_checkpoint_name)
325
360
 
361
+ @log(log_args=False)
326
362
  def wrapper(*args, **kwargs):
327
363
  """Wrapp a function to validate the schema of the output of a Snowpark function.
328
364
 
@@ -334,7 +370,25 @@ def check_output_schema(
334
370
  Any: The result of the Snowpark function.
335
371
 
336
372
  """
373
+ _checkpoint_name = checkpoint_name
374
+ if checkpoint_name is None:
375
+ LOGGER.warning(
376
+ (
377
+ "No checkpoint name provided for output schema validation. "
378
+ "Using '%s' as the checkpoint name.",
379
+ ),
380
+ snowpark_fn.__name__,
381
+ )
382
+ _checkpoint_name = snowpark_fn.__name__
383
+ _checkpoint_name = _replace_special_characters(_checkpoint_name)
384
+ LOGGER.info(
385
+ "Starting output schema validation for Snowpark function '%s' and checkpoint '%s'",
386
+ snowpark_fn.__name__,
387
+ _checkpoint_name,
388
+ )
389
+
337
390
  # Run the sampled data in snowpark
391
+ LOGGER.info("Running the Snowpark function '%s'", snowpark_fn.__name__)
338
392
  snowpark_results = snowpark_fn(*args, **kwargs)
339
393
  sampler = SamplingAdapter(
340
394
  job_context, sample_frac, sample_number, sampling_strategy
@@ -342,22 +396,28 @@ def check_output_schema(
342
396
  sampler.process_args([snowpark_results])
343
397
  pandas_sample_args = sampler.get_sampled_pandas_args()
344
398
 
345
- # Raises SchemaError on validation issues
346
- validator = DataFrameValidator()
347
- is_valid, validation_result = validator.validate(
348
- pandera_schema, pandas_sample_args[0], validity_flag=True
399
+ is_valid, validation_result = _validate(
400
+ pandera_schema, pandas_sample_args[0]
349
401
  )
350
- logger = CheckpointLogger().get_logger()
351
- logger.info(
352
- f"Checkpoint {_checkpoint_name} validation result:\n{validation_result}"
353
- )
354
-
355
402
  if is_valid:
403
+ LOGGER.info(
404
+ "Output schema validation passed for Snowpark function '%s' and checkpoint '%s'",
405
+ snowpark_fn.__name__,
406
+ _checkpoint_name,
407
+ )
356
408
  if job_context is not None:
357
409
  job_context._mark_pass(_checkpoint_name)
358
-
410
+ else:
411
+ LOGGER.warning(
412
+ "No job context provided. Skipping result recording into Snowflake.",
413
+ )
359
414
  _update_validation_result(_checkpoint_name, PASS_STATUS, output_path)
360
415
  else:
416
+ LOGGER.error(
417
+ "Output schema validation failed for Snowpark function '%s' and checkpoint '%s'",
418
+ snowpark_fn.__name__,
419
+ _checkpoint_name,
420
+ )
361
421
  _update_validation_result(_checkpoint_name, FAIL_STATUS, output_path)
362
422
  raise SchemaValidationError(
363
423
  "Snowpark output schema validation error",
@@ -365,7 +425,6 @@ def check_output_schema(
365
425
  _checkpoint_name,
366
426
  validation_result,
367
427
  )
368
-
369
428
  return snowpark_results
370
429
 
371
430
  return wrapper
@@ -374,6 +433,7 @@ def check_output_schema(
374
433
 
375
434
 
376
435
  @report_telemetry(params_list=["pandera_schema"])
436
+ @log
377
437
  def check_input_schema(
378
438
  pandera_schema: DataFrameSchema,
379
439
  checkpoint_name: str,
@@ -414,11 +474,8 @@ def check_input_schema(
414
474
  Callable: A wrapper function that performs schema validation before executing the original function.
415
475
 
416
476
  """
417
- _checkpoint_name = checkpoint_name
418
- if checkpoint_name is None:
419
- _checkpoint_name = snowpark_fn.__name__
420
- _checkpoint_name = _replace_special_characters(_checkpoint_name)
421
477
 
478
+ @log(log_args=False)
422
479
  def wrapper(*args, **kwargs):
423
480
  """Wrapp a function to validate the schema of the input of a Snowpark function.
424
481
 
@@ -429,6 +486,23 @@ def check_input_schema(
429
486
  Any: The result of the original function after input validation.
430
487
 
431
488
  """
489
+ _checkpoint_name = checkpoint_name
490
+ if checkpoint_name is None:
491
+ LOGGER.warning(
492
+ (
493
+ "No checkpoint name provided for input schema validation. "
494
+ "Using '%s' as the checkpoint name."
495
+ ),
496
+ snowpark_fn.__name__,
497
+ )
498
+ _checkpoint_name = snowpark_fn.__name__
499
+ _checkpoint_name = _replace_special_characters(_checkpoint_name)
500
+ LOGGER.info(
501
+ "Starting input schema validation for Snowpark function '%s' and checkpoint '%s'",
502
+ snowpark_fn.__name__,
503
+ _checkpoint_name,
504
+ )
505
+
432
506
  # Run the sampled data in snowpark
433
507
  sampler = SamplingAdapter(
434
508
  job_context, sample_frac, sample_number, sampling_strategy
@@ -436,47 +510,71 @@ def check_input_schema(
436
510
  sampler.process_args(args)
437
511
  pandas_sample_args = sampler.get_sampled_pandas_args()
438
512
 
513
+ LOGGER.info(
514
+ "Validating %s input argument(s) against a Pandera schema",
515
+ len(pandas_sample_args),
516
+ )
439
517
  # Raises SchemaError on validation issues
440
- for arg in pandas_sample_args:
441
- if isinstance(arg, PandasDataFrame):
442
-
443
- validator = DataFrameValidator()
444
- is_valid, validation_result = validator.validate(
445
- pandera_schema,
446
- arg,
447
- validity_flag=True,
518
+ for index, arg in enumerate(pandas_sample_args, start=1):
519
+ if not isinstance(arg, PandasDataFrame):
520
+ LOGGER.info(
521
+ "Arg %s: Skipping schema validation for non-DataFrame argument",
522
+ index,
448
523
  )
524
+ continue
449
525
 
450
- logger = CheckpointLogger().get_logger()
451
- logger.info(
452
- f"Checkpoint {checkpoint_name} validation result:\n{validation_result}"
526
+ is_valid, validation_result = _validate(
527
+ pandera_schema,
528
+ arg,
529
+ )
530
+ if is_valid:
531
+ LOGGER.info(
532
+ "Arg %s: Input schema validation passed",
533
+ index,
453
534
  )
454
-
455
- if is_valid:
456
- if job_context is not None:
457
- job_context._mark_pass(
458
- _checkpoint_name,
459
- )
460
-
461
- _update_validation_result(
462
- _checkpoint_name,
463
- PASS_STATUS,
464
- output_path,
465
- )
466
- else:
467
- _update_validation_result(
468
- _checkpoint_name,
469
- FAIL_STATUS,
470
- output_path,
471
- )
472
- raise SchemaValidationError(
473
- "Snowpark input schema validation error",
474
- job_context,
535
+ if job_context is not None:
536
+ job_context._mark_pass(
475
537
  _checkpoint_name,
476
- validation_result,
477
538
  )
539
+ _update_validation_result(
540
+ _checkpoint_name,
541
+ PASS_STATUS,
542
+ output_path,
543
+ )
544
+ else:
545
+ LOGGER.error(
546
+ "Arg %s: Input schema validation failed",
547
+ index,
548
+ )
549
+ _update_validation_result(
550
+ _checkpoint_name,
551
+ FAIL_STATUS,
552
+ output_path,
553
+ )
554
+ raise SchemaValidationError(
555
+ "Snowpark input schema validation error",
556
+ job_context,
557
+ _checkpoint_name,
558
+ validation_result,
559
+ )
478
560
  return snowpark_fn(*args, **kwargs)
479
561
 
480
562
  return wrapper
481
563
 
482
564
  return check_input_with_decorator
565
+
566
+
567
+ def _validate(
568
+ schema: Union[type[DataFrameModel], DataFrameSchema],
569
+ df: PandasDataFrame,
570
+ lazy: bool = True,
571
+ ) -> tuple[bool, PandasDataFrame]:
572
+ if not isinstance(schema, DataFrameSchema):
573
+ schema = schema.to_schema()
574
+ is_valid = True
575
+ try:
576
+ df = schema.validate(df, lazy=lazy)
577
+ except (SchemaErrors, SchemaError) as schema_errors:
578
+ df = cast(PandasDataFrame, schema_errors.failure_cases)
579
+ is_valid = False
580
+ return is_valid, df