snowpark-checkpoints-validators 0.1.3__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/PKG-INFO +16 -4
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/README.md +11 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/pyproject.toml +5 -3
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/__init__.py +11 -1
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/__version__.py +1 -1
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/checkpoint.py +195 -97
- snowpark_checkpoints_validators-0.2.0/src/snowflake/snowpark_checkpoints/job_context.py +128 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/snowpark_sampler.py +26 -1
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/spark_migration.py +39 -6
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/extra_config.py +10 -5
- snowpark_checkpoints_validators-0.2.0/src/snowflake/snowpark_checkpoints/utils/logging_utils.py +67 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +48 -7
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/utils_checks.py +23 -2
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/validation_result_metadata.py +30 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/test_pandera.py +47 -18
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/test_parquet.py +84 -25
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/test_spark_checkpoint.py +40 -21
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_extra_config.py +7 -1
- snowpark_checkpoints_validators-0.2.0/test/unit/test_job_context.py +49 -0
- snowpark_checkpoints_validators-0.2.0/test/unit/test_logger.py +134 -0
- snowpark_checkpoints_validators-0.2.0/test/unit/test_logging_utils.py +132 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_validation_result_metadata.py +40 -0
- snowpark_checkpoints_validators-0.1.3/src/snowflake/snowpark_checkpoints/job_context.py +0 -85
- snowpark_checkpoints_validators-0.1.3/src/snowflake/snowpark_checkpoints/utils/checkpoint_logger.py +0 -52
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/.gitignore +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/LICENSE +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/errors.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/singleton.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/__init__.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/constants.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/supported_types.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/utils/telemetry.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/src/snowflake/snowpark_checkpoints/validation_results.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/.coveragerc +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/e2eexample.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_compare_utils.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/df_mode_dataframe_mismatch_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/df_mode_dataframe_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_scalar_fail_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/spark_checkpoint_scalar_passing_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_df_check_custom_check_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_df_check_fail_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_df_check_from_file_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_df_check_skip_check_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_df_check_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_input_fail_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_input_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_output_fail_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/integ/telemetry_expected/test_output_telemetry.json +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_pandera_check_manager.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_spark_migration.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_telemetry.py +0 -0
- {snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/test/unit/test_utils_checks.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: snowpark-checkpoints-validators
|
3
|
-
Version: 0.1.3
|
3
|
+
Version: 0.2.0
|
4
4
|
Summary: Migration tools for Snowpark
|
5
5
|
Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
|
6
6
|
Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
|
@@ -26,11 +26,9 @@ Classifier: Topic :: Software Development :: Libraries
|
|
26
26
|
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
27
27
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
28
|
Requires-Python: <3.12,>=3.9
|
29
|
-
Requires-Dist: pandera-report==0.1.2
|
30
29
|
Requires-Dist: pandera[io]==0.20.4
|
31
|
-
Requires-Dist: pyspark
|
32
30
|
Requires-Dist: snowflake-connector-python[pandas]
|
33
|
-
Requires-Dist: snowflake-snowpark-python
|
31
|
+
Requires-Dist: snowflake-snowpark-python>=1.23.0
|
34
32
|
Provides-Extra: development
|
35
33
|
Requires-Dist: coverage>=7.6.7; extra == 'development'
|
36
34
|
Requires-Dist: deepdiff==8.1.1; extra == 'development'
|
@@ -38,10 +36,13 @@ Requires-Dist: deepdiff>=8.0.0; extra == 'development'
|
|
38
36
|
Requires-Dist: hatchling==1.25.0; extra == 'development'
|
39
37
|
Requires-Dist: pre-commit>=4.0.1; extra == 'development'
|
40
38
|
Requires-Dist: pyarrow>=18.0.0; extra == 'development'
|
39
|
+
Requires-Dist: pyspark>=3.5.0; extra == 'development'
|
41
40
|
Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
|
42
41
|
Requires-Dist: pytest>=8.3.3; extra == 'development'
|
43
42
|
Requires-Dist: setuptools>=70.0.0; extra == 'development'
|
44
43
|
Requires-Dist: twine==5.1.1; extra == 'development'
|
44
|
+
Provides-Extra: pyspark
|
45
|
+
Requires-Dist: pyspark>=3.5.0; extra == 'pyspark'
|
45
46
|
Description-Content-Type: text/markdown
|
46
47
|
|
47
48
|
# snowpark-checkpoints-validators
|
@@ -52,6 +53,17 @@ Description-Content-Type: text/markdown
|
|
52
53
|
|
53
54
|
**snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
|
54
55
|
|
56
|
+
---
|
57
|
+
## Install the library
|
58
|
+
```bash
|
59
|
+
pip install snowpark-checkpoints-validators
|
60
|
+
```
|
61
|
+
This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
|
62
|
+
```bash
|
63
|
+
pip install "snowpark-checkpoints-validators[pyspark]"
|
64
|
+
```
|
65
|
+
---
|
66
|
+
|
55
67
|
## Features
|
56
68
|
|
57
69
|
- Validate Snowpark DataFrames against predefined Pandera schemas.
|
@@ -6,6 +6,17 @@
|
|
6
6
|
|
7
7
|
**snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
|
8
8
|
|
9
|
+
---
|
10
|
+
## Install the library
|
11
|
+
```bash
|
12
|
+
pip install snowpark-checkpoints-validators
|
13
|
+
```
|
14
|
+
This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
|
15
|
+
```bash
|
16
|
+
pip install "snowpark-checkpoints-validators[pyspark]"
|
17
|
+
```
|
18
|
+
---
|
19
|
+
|
9
20
|
## Features
|
10
21
|
|
11
22
|
- Validate Snowpark DataFrames against predefined Pandera schemas.
|
{snowpark_checkpoints_validators-0.1.3 → snowpark_checkpoints_validators-0.2.0}/pyproject.toml
RENAMED
@@ -26,11 +26,9 @@ classifiers = [
|
|
26
26
|
"Topic :: Scientific/Engineering :: Information Analysis",
|
27
27
|
]
|
28
28
|
dependencies = [
|
29
|
-
"snowflake-snowpark-python",
|
29
|
+
"snowflake-snowpark-python>=1.23.0",
|
30
30
|
"snowflake-connector-python[pandas]",
|
31
|
-
"pyspark",
|
32
31
|
"pandera[io]==0.20.4",
|
33
|
-
"pandera-report==0.1.2",
|
34
32
|
]
|
35
33
|
description = "Migration tools for Snowpark"
|
36
34
|
dynamic = ['version']
|
@@ -48,6 +46,9 @@ readme = "README.md"
|
|
48
46
|
requires-python = '>=3.9,<3.12'
|
49
47
|
|
50
48
|
[project.optional-dependencies]
|
49
|
+
pyspark = [
|
50
|
+
"pyspark>=3.5.0",
|
51
|
+
]
|
51
52
|
development = [
|
52
53
|
"deepdiff==8.1.1",
|
53
54
|
"pytest>=8.3.3",
|
@@ -59,6 +60,7 @@ development = [
|
|
59
60
|
"setuptools>=70.0.0",
|
60
61
|
"pyarrow>=18.0.0",
|
61
62
|
"deepdiff>=8.0.0",
|
63
|
+
"pyspark>=3.5.0",
|
62
64
|
]
|
63
65
|
|
64
66
|
[project.urls]
|
@@ -13,16 +13,26 @@
|
|
13
13
|
# See the License for the specific language governing permissions and
|
14
14
|
# limitations under the License.
|
15
15
|
|
16
|
+
import logging
|
17
|
+
|
18
|
+
|
19
|
+
# Add a NullHandler to prevent logging messages from being output to
|
20
|
+
# sys.stderr if no logging configuration is provided.
|
21
|
+
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
22
|
+
|
23
|
+
# ruff: noqa: E402
|
24
|
+
|
16
25
|
from snowflake.snowpark_checkpoints.checkpoint import (
|
17
26
|
check_dataframe_schema,
|
18
|
-
check_output_schema,
|
19
27
|
check_input_schema,
|
28
|
+
check_output_schema,
|
20
29
|
validate_dataframe_checkpoint,
|
21
30
|
)
|
22
31
|
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
23
32
|
from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
|
24
33
|
from snowflake.snowpark_checkpoints.utils.constants import CheckpointMode
|
25
34
|
|
35
|
+
|
26
36
|
__all__ = [
|
27
37
|
"check_with_spark",
|
28
38
|
"SnowparkJobContext",
|
@@ -14,11 +14,14 @@
|
|
14
14
|
# limitations under the License.
|
15
15
|
|
16
16
|
# Wrapper around pandera which logs to snowflake
|
17
|
-
|
17
|
+
|
18
|
+
import logging
|
19
|
+
|
20
|
+
from typing import Any, Optional, Union, cast
|
18
21
|
|
19
22
|
from pandas import DataFrame as PandasDataFrame
|
20
|
-
from pandera import Check, DataFrameSchema
|
21
|
-
from pandera_report import DataFrameValidator
|
23
|
+
from pandera import Check, DataFrameModel, DataFrameSchema
|
24
|
+
from pandera.errors import SchemaError, SchemaErrors
|
22
25
|
|
23
26
|
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
24
27
|
from snowflake.snowpark_checkpoints.errors import SchemaValidationError
|
@@ -27,13 +30,13 @@ from snowflake.snowpark_checkpoints.snowpark_sampler import (
|
|
27
30
|
SamplingAdapter,
|
28
31
|
SamplingStrategy,
|
29
32
|
)
|
30
|
-
from snowflake.snowpark_checkpoints.utils.checkpoint_logger import CheckpointLogger
|
31
33
|
from snowflake.snowpark_checkpoints.utils.constants import (
|
32
34
|
FAIL_STATUS,
|
33
35
|
PASS_STATUS,
|
34
36
|
CheckpointMode,
|
35
37
|
)
|
36
38
|
from snowflake.snowpark_checkpoints.utils.extra_config import is_checkpoint_enabled
|
39
|
+
from snowflake.snowpark_checkpoints.utils.logging_utils import log
|
37
40
|
from snowflake.snowpark_checkpoints.utils.pandera_check_manager import (
|
38
41
|
PanderaCheckManager,
|
39
42
|
)
|
@@ -47,6 +50,10 @@ from snowflake.snowpark_checkpoints.utils.utils_checks import (
|
|
47
50
|
)
|
48
51
|
|
49
52
|
|
53
|
+
LOGGER = logging.getLogger(__name__)
|
54
|
+
|
55
|
+
|
56
|
+
@log
|
50
57
|
def validate_dataframe_checkpoint(
|
51
58
|
df: SnowparkDataFrame,
|
52
59
|
checkpoint_name: str,
|
@@ -84,31 +91,45 @@ def validate_dataframe_checkpoint(
|
|
84
91
|
"""
|
85
92
|
checkpoint_name = _replace_special_characters(checkpoint_name)
|
86
93
|
|
87
|
-
if is_checkpoint_enabled(checkpoint_name):
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
94
|
+
if not is_checkpoint_enabled(checkpoint_name):
|
95
|
+
LOGGER.warning(
|
96
|
+
"Checkpoint '%s' is disabled. Skipping DataFrame checkpoint validation.",
|
97
|
+
checkpoint_name,
|
98
|
+
)
|
99
|
+
return None
|
100
|
+
|
101
|
+
LOGGER.info(
|
102
|
+
"Starting DataFrame checkpoint validation for checkpoint '%s'", checkpoint_name
|
103
|
+
)
|
104
|
+
|
105
|
+
if mode == CheckpointMode.SCHEMA:
|
106
|
+
result = _check_dataframe_schema_file(
|
107
|
+
df,
|
108
|
+
checkpoint_name,
|
109
|
+
job_context,
|
110
|
+
custom_checks,
|
111
|
+
skip_checks,
|
112
|
+
sample_frac,
|
113
|
+
sample_number,
|
114
|
+
sampling_strategy,
|
115
|
+
output_path,
|
116
|
+
)
|
117
|
+
return result
|
118
|
+
|
119
|
+
if mode == CheckpointMode.DATAFRAME:
|
120
|
+
if job_context is None:
|
108
121
|
raise ValueError(
|
109
|
-
"
|
110
|
-
Please use for schema validation use a 1 or for a full data validation use a 2 for schema validation."""
|
122
|
+
"No job context provided. Please provide one when using DataFrame mode validation."
|
111
123
|
)
|
124
|
+
_check_compare_data(df, job_context, checkpoint_name, output_path)
|
125
|
+
return None
|
126
|
+
|
127
|
+
raise ValueError(
|
128
|
+
(
|
129
|
+
"Invalid validation mode. "
|
130
|
+
"Please use 1 for schema validation or 2 for full data validation."
|
131
|
+
),
|
132
|
+
)
|
112
133
|
|
113
134
|
|
114
135
|
def _check_dataframe_schema_file(
|
@@ -156,7 +177,7 @@ def _check_dataframe_schema_file(
|
|
156
177
|
|
157
178
|
schema = _generate_schema(checkpoint_name, output_path)
|
158
179
|
|
159
|
-
return
|
180
|
+
return _check_dataframe_schema(
|
160
181
|
df,
|
161
182
|
schema,
|
162
183
|
checkpoint_name,
|
@@ -170,6 +191,7 @@ def _check_dataframe_schema_file(
|
|
170
191
|
)
|
171
192
|
|
172
193
|
|
194
|
+
@log
|
173
195
|
def check_dataframe_schema(
|
174
196
|
df: SnowparkDataFrame,
|
175
197
|
pandera_schema: DataFrameSchema,
|
@@ -212,6 +234,9 @@ def check_dataframe_schema(
|
|
212
234
|
|
213
235
|
"""
|
214
236
|
checkpoint_name = _replace_special_characters(checkpoint_name)
|
237
|
+
LOGGER.info(
|
238
|
+
"Starting DataFrame schema validation for checkpoint '%s'", checkpoint_name
|
239
|
+
)
|
215
240
|
|
216
241
|
if df is None:
|
217
242
|
raise ValueError("DataFrame is required")
|
@@ -219,19 +244,25 @@ def check_dataframe_schema(
|
|
219
244
|
if pandera_schema is None:
|
220
245
|
raise ValueError("Schema is required")
|
221
246
|
|
222
|
-
if is_checkpoint_enabled(checkpoint_name):
|
223
|
-
|
224
|
-
|
225
|
-
pandera_schema,
|
247
|
+
if not is_checkpoint_enabled(checkpoint_name):
|
248
|
+
LOGGER.warning(
|
249
|
+
"Checkpoint '%s' is disabled. Skipping DataFrame schema validation.",
|
226
250
|
checkpoint_name,
|
227
|
-
job_context,
|
228
|
-
custom_checks,
|
229
|
-
skip_checks,
|
230
|
-
sample_frac,
|
231
|
-
sample_number,
|
232
|
-
sampling_strategy,
|
233
|
-
output_path,
|
234
251
|
)
|
252
|
+
return None
|
253
|
+
|
254
|
+
return _check_dataframe_schema(
|
255
|
+
df,
|
256
|
+
pandera_schema,
|
257
|
+
checkpoint_name,
|
258
|
+
job_context,
|
259
|
+
custom_checks,
|
260
|
+
skip_checks,
|
261
|
+
sample_frac,
|
262
|
+
sample_number,
|
263
|
+
sampling_strategy,
|
264
|
+
output_path,
|
265
|
+
)
|
235
266
|
|
236
267
|
|
237
268
|
@report_telemetry(
|
@@ -259,17 +290,24 @@ def _check_dataframe_schema(
|
|
259
290
|
pandera_schema_upper, sample_df = _process_sampling(
|
260
291
|
df, pandera_schema, job_context, sample_frac, sample_number, sampling_strategy
|
261
292
|
)
|
262
|
-
|
263
|
-
# Raises SchemaError on validation issues
|
264
|
-
validator = DataFrameValidator()
|
265
|
-
is_valid, validation_result = validator.validate(
|
266
|
-
pandera_schema_upper, sample_df, validity_flag=True
|
267
|
-
)
|
293
|
+
is_valid, validation_result = _validate(pandera_schema_upper, sample_df)
|
268
294
|
if is_valid:
|
295
|
+
LOGGER.info(
|
296
|
+
"DataFrame schema validation passed for checkpoint '%s'",
|
297
|
+
checkpoint_name,
|
298
|
+
)
|
269
299
|
if job_context is not None:
|
270
300
|
job_context._mark_pass(checkpoint_name)
|
301
|
+
else:
|
302
|
+
LOGGER.warning(
|
303
|
+
"No job context provided. Skipping result recording into Snowflake.",
|
304
|
+
)
|
271
305
|
_update_validation_result(checkpoint_name, PASS_STATUS, output_path)
|
272
306
|
else:
|
307
|
+
LOGGER.error(
|
308
|
+
"DataFrame schema validation failed for checkpoint '%s'",
|
309
|
+
checkpoint_name,
|
310
|
+
)
|
273
311
|
_update_validation_result(checkpoint_name, FAIL_STATUS, output_path)
|
274
312
|
raise SchemaValidationError(
|
275
313
|
"Snowpark DataFrame schema validation error",
|
@@ -282,6 +320,7 @@ def _check_dataframe_schema(
|
|
282
320
|
|
283
321
|
|
284
322
|
@report_telemetry(params_list=["pandera_schema"])
|
323
|
+
@log
|
285
324
|
def check_output_schema(
|
286
325
|
pandera_schema: DataFrameSchema,
|
287
326
|
checkpoint_name: str,
|
@@ -318,11 +357,8 @@ def check_output_schema(
|
|
318
357
|
function: The decorated function.
|
319
358
|
|
320
359
|
"""
|
321
|
-
_checkpoint_name = checkpoint_name
|
322
|
-
if checkpoint_name is None:
|
323
|
-
_checkpoint_name = snowpark_fn.__name__
|
324
|
-
_checkpoint_name = _replace_special_characters(_checkpoint_name)
|
325
360
|
|
361
|
+
@log(log_args=False)
|
326
362
|
def wrapper(*args, **kwargs):
|
327
363
|
"""Wrapp a function to validate the schema of the output of a Snowpark function.
|
328
364
|
|
@@ -334,7 +370,25 @@ def check_output_schema(
|
|
334
370
|
Any: The result of the Snowpark function.
|
335
371
|
|
336
372
|
"""
|
373
|
+
_checkpoint_name = checkpoint_name
|
374
|
+
if checkpoint_name is None:
|
375
|
+
LOGGER.warning(
|
376
|
+
(
|
377
|
+
"No checkpoint name provided for output schema validation. "
|
378
|
+
"Using '%s' as the checkpoint name.",
|
379
|
+
),
|
380
|
+
snowpark_fn.__name__,
|
381
|
+
)
|
382
|
+
_checkpoint_name = snowpark_fn.__name__
|
383
|
+
_checkpoint_name = _replace_special_characters(_checkpoint_name)
|
384
|
+
LOGGER.info(
|
385
|
+
"Starting output schema validation for Snowpark function '%s' and checkpoint '%s'",
|
386
|
+
snowpark_fn.__name__,
|
387
|
+
_checkpoint_name,
|
388
|
+
)
|
389
|
+
|
337
390
|
# Run the sampled data in snowpark
|
391
|
+
LOGGER.info("Running the Snowpark function '%s'", snowpark_fn.__name__)
|
338
392
|
snowpark_results = snowpark_fn(*args, **kwargs)
|
339
393
|
sampler = SamplingAdapter(
|
340
394
|
job_context, sample_frac, sample_number, sampling_strategy
|
@@ -342,22 +396,28 @@ def check_output_schema(
|
|
342
396
|
sampler.process_args([snowpark_results])
|
343
397
|
pandas_sample_args = sampler.get_sampled_pandas_args()
|
344
398
|
|
345
|
-
|
346
|
-
|
347
|
-
is_valid, validation_result = validator.validate(
|
348
|
-
pandera_schema, pandas_sample_args[0], validity_flag=True
|
399
|
+
is_valid, validation_result = _validate(
|
400
|
+
pandera_schema, pandas_sample_args[0]
|
349
401
|
)
|
350
|
-
logger = CheckpointLogger().get_logger()
|
351
|
-
logger.info(
|
352
|
-
f"Checkpoint {_checkpoint_name} validation result:\n{validation_result}"
|
353
|
-
)
|
354
|
-
|
355
402
|
if is_valid:
|
403
|
+
LOGGER.info(
|
404
|
+
"Output schema validation passed for Snowpark function '%s' and checkpoint '%s'",
|
405
|
+
snowpark_fn.__name__,
|
406
|
+
_checkpoint_name,
|
407
|
+
)
|
356
408
|
if job_context is not None:
|
357
409
|
job_context._mark_pass(_checkpoint_name)
|
358
|
-
|
410
|
+
else:
|
411
|
+
LOGGER.warning(
|
412
|
+
"No job context provided. Skipping result recording into Snowflake.",
|
413
|
+
)
|
359
414
|
_update_validation_result(_checkpoint_name, PASS_STATUS, output_path)
|
360
415
|
else:
|
416
|
+
LOGGER.error(
|
417
|
+
"Output schema validation failed for Snowpark function '%s' and checkpoint '%s'",
|
418
|
+
snowpark_fn.__name__,
|
419
|
+
_checkpoint_name,
|
420
|
+
)
|
361
421
|
_update_validation_result(_checkpoint_name, FAIL_STATUS, output_path)
|
362
422
|
raise SchemaValidationError(
|
363
423
|
"Snowpark output schema validation error",
|
@@ -365,7 +425,6 @@ def check_output_schema(
|
|
365
425
|
_checkpoint_name,
|
366
426
|
validation_result,
|
367
427
|
)
|
368
|
-
|
369
428
|
return snowpark_results
|
370
429
|
|
371
430
|
return wrapper
|
@@ -374,6 +433,7 @@ def check_output_schema(
|
|
374
433
|
|
375
434
|
|
376
435
|
@report_telemetry(params_list=["pandera_schema"])
|
436
|
+
@log
|
377
437
|
def check_input_schema(
|
378
438
|
pandera_schema: DataFrameSchema,
|
379
439
|
checkpoint_name: str,
|
@@ -414,11 +474,8 @@ def check_input_schema(
|
|
414
474
|
Callable: A wrapper function that performs schema validation before executing the original function.
|
415
475
|
|
416
476
|
"""
|
417
|
-
_checkpoint_name = checkpoint_name
|
418
|
-
if checkpoint_name is None:
|
419
|
-
_checkpoint_name = snowpark_fn.__name__
|
420
|
-
_checkpoint_name = _replace_special_characters(_checkpoint_name)
|
421
477
|
|
478
|
+
@log(log_args=False)
|
422
479
|
def wrapper(*args, **kwargs):
|
423
480
|
"""Wrapp a function to validate the schema of the input of a Snowpark function.
|
424
481
|
|
@@ -429,6 +486,23 @@ def check_input_schema(
|
|
429
486
|
Any: The result of the original function after input validation.
|
430
487
|
|
431
488
|
"""
|
489
|
+
_checkpoint_name = checkpoint_name
|
490
|
+
if checkpoint_name is None:
|
491
|
+
LOGGER.warning(
|
492
|
+
(
|
493
|
+
"No checkpoint name provided for input schema validation. "
|
494
|
+
"Using '%s' as the checkpoint name."
|
495
|
+
),
|
496
|
+
snowpark_fn.__name__,
|
497
|
+
)
|
498
|
+
_checkpoint_name = snowpark_fn.__name__
|
499
|
+
_checkpoint_name = _replace_special_characters(_checkpoint_name)
|
500
|
+
LOGGER.info(
|
501
|
+
"Starting input schema validation for Snowpark function '%s' and checkpoint '%s'",
|
502
|
+
snowpark_fn.__name__,
|
503
|
+
_checkpoint_name,
|
504
|
+
)
|
505
|
+
|
432
506
|
# Run the sampled data in snowpark
|
433
507
|
sampler = SamplingAdapter(
|
434
508
|
job_context, sample_frac, sample_number, sampling_strategy
|
@@ -436,47 +510,71 @@ def check_input_schema(
|
|
436
510
|
sampler.process_args(args)
|
437
511
|
pandas_sample_args = sampler.get_sampled_pandas_args()
|
438
512
|
|
513
|
+
LOGGER.info(
|
514
|
+
"Validating %s input argument(s) against a Pandera schema",
|
515
|
+
len(pandas_sample_args),
|
516
|
+
)
|
439
517
|
# Raises SchemaError on validation issues
|
440
|
-
for arg in pandas_sample_args:
|
441
|
-
if isinstance(arg, PandasDataFrame):
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
pandera_schema,
|
446
|
-
arg,
|
447
|
-
validity_flag=True,
|
518
|
+
for index, arg in enumerate(pandas_sample_args, start=1):
|
519
|
+
if not isinstance(arg, PandasDataFrame):
|
520
|
+
LOGGER.info(
|
521
|
+
"Arg %s: Skipping schema validation for non-DataFrame argument",
|
522
|
+
index,
|
448
523
|
)
|
524
|
+
continue
|
449
525
|
|
450
|
-
|
451
|
-
|
452
|
-
|
526
|
+
is_valid, validation_result = _validate(
|
527
|
+
pandera_schema,
|
528
|
+
arg,
|
529
|
+
)
|
530
|
+
if is_valid:
|
531
|
+
LOGGER.info(
|
532
|
+
"Arg %s: Input schema validation passed",
|
533
|
+
index,
|
453
534
|
)
|
454
|
-
|
455
|
-
|
456
|
-
if job_context is not None:
|
457
|
-
job_context._mark_pass(
|
458
|
-
_checkpoint_name,
|
459
|
-
)
|
460
|
-
|
461
|
-
_update_validation_result(
|
462
|
-
_checkpoint_name,
|
463
|
-
PASS_STATUS,
|
464
|
-
output_path,
|
465
|
-
)
|
466
|
-
else:
|
467
|
-
_update_validation_result(
|
468
|
-
_checkpoint_name,
|
469
|
-
FAIL_STATUS,
|
470
|
-
output_path,
|
471
|
-
)
|
472
|
-
raise SchemaValidationError(
|
473
|
-
"Snowpark input schema validation error",
|
474
|
-
job_context,
|
535
|
+
if job_context is not None:
|
536
|
+
job_context._mark_pass(
|
475
537
|
_checkpoint_name,
|
476
|
-
validation_result,
|
477
538
|
)
|
539
|
+
_update_validation_result(
|
540
|
+
_checkpoint_name,
|
541
|
+
PASS_STATUS,
|
542
|
+
output_path,
|
543
|
+
)
|
544
|
+
else:
|
545
|
+
LOGGER.error(
|
546
|
+
"Arg %s: Input schema validation failed",
|
547
|
+
index,
|
548
|
+
)
|
549
|
+
_update_validation_result(
|
550
|
+
_checkpoint_name,
|
551
|
+
FAIL_STATUS,
|
552
|
+
output_path,
|
553
|
+
)
|
554
|
+
raise SchemaValidationError(
|
555
|
+
"Snowpark input schema validation error",
|
556
|
+
job_context,
|
557
|
+
_checkpoint_name,
|
558
|
+
validation_result,
|
559
|
+
)
|
478
560
|
return snowpark_fn(*args, **kwargs)
|
479
561
|
|
480
562
|
return wrapper
|
481
563
|
|
482
564
|
return check_input_with_decorator
|
565
|
+
|
566
|
+
|
567
|
+
def _validate(
|
568
|
+
schema: Union[type[DataFrameModel], DataFrameSchema],
|
569
|
+
df: PandasDataFrame,
|
570
|
+
lazy: bool = True,
|
571
|
+
) -> tuple[bool, PandasDataFrame]:
|
572
|
+
if not isinstance(schema, DataFrameSchema):
|
573
|
+
schema = schema.to_schema()
|
574
|
+
is_valid = True
|
575
|
+
try:
|
576
|
+
df = schema.validate(df, lazy=lazy)
|
577
|
+
except (SchemaErrors, SchemaError) as schema_errors:
|
578
|
+
df = cast(PandasDataFrame, schema_errors.failure_cases)
|
579
|
+
is_valid = False
|
580
|
+
return is_valid, df
|