snowpark-checkpoints-validators 0.2.0rc1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/.gitignore +4 -0
- snowpark_checkpoints_validators-0.3.0/PKG-INFO +325 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/README.md +12 -4
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/pyproject.toml +28 -12
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/__init__.py +44 -0
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/__version__.py +16 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/src/snowflake/snowpark_checkpoints/checkpoint.py +210 -101
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/src/snowflake/snowpark_checkpoints/errors.py +14 -3
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/io_utils/__init__.py +26 -0
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/io_utils/io_default_strategy.py +57 -0
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/io_utils/io_env_strategy.py +133 -0
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/io_utils/io_file_manager.py +76 -0
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/job_context.py +128 -0
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/singleton.py +23 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/src/snowflake/snowpark_checkpoints/snowpark_sampler.py +40 -4
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/src/snowflake/snowpark_checkpoints/spark_migration.py +53 -9
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/utils/__init__.py +14 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/src/snowflake/snowpark_checkpoints/utils/constants.py +14 -3
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/utils/extra_config.py +132 -0
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/utils/logging_utils.py +67 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/src/snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +48 -7
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/src/snowflake/snowpark_checkpoints/utils/supported_types.py +14 -3
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/src/snowflake/snowpark_checkpoints/utils/telemetry.py +241 -83
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/src/snowflake/snowpark_checkpoints/utils/utils_checks.py +89 -16
- snowpark_checkpoints_validators-0.3.0/src/snowflake/snowpark_checkpoints/validation_result_metadata.py +159 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/src/snowflake/snowpark_checkpoints/validation_results.py +14 -3
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/integ/e2eexample.py +14 -3
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/integ/telemetry_compare_utils.py +33 -2
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/df_mode_dataframe_mismatch_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/df_mode_dataframe_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/spark_checkpoint_scalar_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/spark_checkpoint_scalar_passing_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/test_df_check_custom_check_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/test_df_check_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/test_df_check_from_file_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/test_df_check_skip_check_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/test_df_check_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/test_input_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/test_input_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/test_output_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.3.0/test/integ/telemetry_expected/test_output_telemetry.json +18 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/integ/test_pandera.py +63 -23
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/integ/test_parquet.py +195 -22
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/integ/test_spark_checkpoint.py +56 -26
- snowpark_checkpoints_validators-0.3.0/test/unit/io_utils/test_default_strategy.py +292 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/unit/test_extra_config.py +57 -4
- snowpark_checkpoints_validators-0.3.0/test/unit/test_job_context.py +49 -0
- snowpark_checkpoints_validators-0.3.0/test/unit/test_logger.py +134 -0
- snowpark_checkpoints_validators-0.3.0/test/unit/test_logging_utils.py +132 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/unit/test_spark_migration.py +14 -3
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/unit/test_telemetry.py +416 -165
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/unit/test_utils_checks.py +17 -6
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/unit/test_validation_result_metadata.py +98 -3
- snowpark_checkpoints_validators-0.2.0rc1/PKG-INFO +0 -514
- snowpark_checkpoints_validators-0.2.0rc1/src/snowflake/snowpark_checkpoints/__init__.py +0 -23
- snowpark_checkpoints_validators-0.2.0rc1/src/snowflake/snowpark_checkpoints/job_context.py +0 -74
- snowpark_checkpoints_validators-0.2.0rc1/src/snowflake/snowpark_checkpoints/utils/__init__.py +0 -3
- snowpark_checkpoints_validators-0.2.0rc1/src/snowflake/snowpark_checkpoints/utils/checkpoint_logger.py +0 -41
- snowpark_checkpoints_validators-0.2.0rc1/src/snowflake/snowpark_checkpoints/utils/extra_config.py +0 -73
- snowpark_checkpoints_validators-0.2.0rc1/src/snowflake/snowpark_checkpoints/validation_result_metadata.py +0 -104
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/spark_checkpoint_scalar_fail_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/spark_checkpoint_scalar_passing_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/test_df_check_custom_check_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/test_df_check_fail_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/test_df_check_from_file_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/test_df_check_skip_check_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/test_df_check_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/test_input_fail_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/test_input_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/test_output_fail_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.2.0rc1/test/integ/telemetry_expected/test_output_telemetry.json +0 -17
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/LICENSE +0 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/.coveragerc +0 -0
- {snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/test/unit/test_pandera_check_manager.py +0 -0
@@ -0,0 +1,325 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: snowpark-checkpoints-validators
|
3
|
+
Version: 0.3.0
|
4
|
+
Summary: Migration tools for Snowpark
|
5
|
+
Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
|
6
|
+
Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
|
7
|
+
Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
|
8
|
+
License: Apache License, Version 2.0
|
9
|
+
License-File: LICENSE
|
10
|
+
Keywords: Snowflake,Snowpark,analytics,cloud,database,db
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
12
|
+
Classifier: Environment :: Console
|
13
|
+
Classifier: Environment :: Other Environment
|
14
|
+
Classifier: Intended Audience :: Developers
|
15
|
+
Classifier: Intended Audience :: Education
|
16
|
+
Classifier: Intended Audience :: Information Technology
|
17
|
+
Classifier: Intended Audience :: System Administrators
|
18
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
19
|
+
Classifier: Operating System :: OS Independent
|
20
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
21
|
+
Classifier: Programming Language :: SQL
|
22
|
+
Classifier: Topic :: Database
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
24
|
+
Classifier: Topic :: Software Development
|
25
|
+
Classifier: Topic :: Software Development :: Libraries
|
26
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
|
+
Requires-Python: <3.12,>=3.9
|
29
|
+
Requires-Dist: pandera[io]==0.20.4
|
30
|
+
Requires-Dist: pydantic>=2.0
|
31
|
+
Requires-Dist: snowflake-connector-python[pandas]
|
32
|
+
Requires-Dist: snowflake-snowpark-python>=1.23.0
|
33
|
+
Provides-Extra: development
|
34
|
+
Requires-Dist: certifi==2025.1.31; extra == 'development'
|
35
|
+
Requires-Dist: coverage>=7.6.7; extra == 'development'
|
36
|
+
Requires-Dist: deepdiff==8.1.1; extra == 'development'
|
37
|
+
Requires-Dist: deepdiff>=8.0.0; extra == 'development'
|
38
|
+
Requires-Dist: hatchling==1.25.0; extra == 'development'
|
39
|
+
Requires-Dist: pre-commit>=4.0.1; extra == 'development'
|
40
|
+
Requires-Dist: pyarrow>=18.0.0; extra == 'development'
|
41
|
+
Requires-Dist: pyspark>=3.5.0; extra == 'development'
|
42
|
+
Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
|
43
|
+
Requires-Dist: pytest>=8.3.3; extra == 'development'
|
44
|
+
Requires-Dist: setuptools>=70.0.0; extra == 'development'
|
45
|
+
Requires-Dist: twine==5.1.1; extra == 'development'
|
46
|
+
Provides-Extra: pyspark
|
47
|
+
Requires-Dist: pyspark>=3.5.0; extra == 'pyspark'
|
48
|
+
Description-Content-Type: text/markdown
|
49
|
+
|
50
|
+
# snowpark-checkpoints-validators
|
51
|
+
|
52
|
+
---
|
53
|
+
##### This package is in Public Preview.
|
54
|
+
---
|
55
|
+
|
56
|
+
**snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
|
57
|
+
|
58
|
+
---
|
59
|
+
## Install the library
|
60
|
+
```bash
|
61
|
+
pip install snowpark-checkpoints-validators
|
62
|
+
```
|
63
|
+
This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
|
64
|
+
```bash
|
65
|
+
pip install "snowpark-checkpoints-validators[pyspark]"
|
66
|
+
```
|
67
|
+
---
|
68
|
+
|
69
|
+
## Features
|
70
|
+
|
71
|
+
- Validate Snowpark DataFrames against predefined Pandera schemas.
|
72
|
+
- Perform custom checks and skip specific checks as needed.
|
73
|
+
- Generate validation results and log them for further analysis.
|
74
|
+
- Support for sampling strategies to validate large datasets efficiently.
|
75
|
+
- Integration with PySpark for cross-validation between Snowpark and PySpark DataFrames.
|
76
|
+
|
77
|
+
## Functionalities
|
78
|
+
|
79
|
+
### Validate DataFrame Schema from File
|
80
|
+
|
81
|
+
The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or dataframe.
|
82
|
+
|
83
|
+
```python
|
84
|
+
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
85
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
86
|
+
from snowflake.snowpark_checkpoints.utils.constants import (
|
87
|
+
CheckpointMode,
|
88
|
+
)
|
89
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
90
|
+
from typing import Any, Optional
|
91
|
+
|
92
|
+
# Signature of the function
|
93
|
+
def validate_dataframe_checkpoint(
|
94
|
+
df: SnowparkDataFrame,
|
95
|
+
checkpoint_name: str,
|
96
|
+
job_context: Optional[SnowparkJobContext] = None,
|
97
|
+
mode: Optional[CheckpointMode] = CheckpointMode.SCHEMA,
|
98
|
+
custom_checks: Optional[dict[Any, Any]] = None,
|
99
|
+
skip_checks: Optional[dict[Any, Any]] = None,
|
100
|
+
sample_frac: Optional[float] = 1.0,
|
101
|
+
sample_number: Optional[int] = None,
|
102
|
+
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
103
|
+
output_path: Optional[str] = None,
|
104
|
+
):
|
105
|
+
...
|
106
|
+
```
|
107
|
+
|
108
|
+
- `df`: Snowpark dataframe to validate.
|
109
|
+
- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
|
110
|
+
- `job_context`: Snowpark job context.
|
111
|
+
- `mode`: Checkpoint mode (schema or data).
|
112
|
+
- `custom_checks`: Custom checks to perform.
|
113
|
+
- `skip_checks`: Checks to skip.
|
114
|
+
- `sample_frac`: Fraction of the dataframe to sample.
|
115
|
+
- `sample_number`: Number of rows to sample.
|
116
|
+
- `sampling_strategy`: Sampling strategy to use.
|
117
|
+
- `output_path`: Output path for the checkpoint report.
|
118
|
+
|
119
|
+
### Usage Example
|
120
|
+
|
121
|
+
```python
|
122
|
+
from snowflake.snowpark import Session
|
123
|
+
from snowflake.snowpark_checkpoints.utils.constants import (
|
124
|
+
CheckpointMode,
|
125
|
+
)
|
126
|
+
from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
|
127
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
128
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
129
|
+
from pyspark.sql import SparkSession
|
130
|
+
|
131
|
+
session = Session.builder.getOrCreate()
|
132
|
+
job_context = SnowparkJobContext(
|
133
|
+
session, SparkSession.builder.getOrCreate(), "job_context", True
|
134
|
+
)
|
135
|
+
df = session.read.format("csv").load("data.csv")
|
136
|
+
|
137
|
+
validate_dataframe_checkpoint(
|
138
|
+
df,
|
139
|
+
"schema_checkpoint",
|
140
|
+
job_context=job_context,
|
141
|
+
mode=CheckpointMode.SCHEMA,
|
142
|
+
sample_frac=0.1,
|
143
|
+
sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
|
144
|
+
)
|
145
|
+
```
|
146
|
+
|
147
|
+
### Check with Spark Decorator
|
148
|
+
|
149
|
+
The `check_with_spark` decorator takes any Snowpark DataFrame arguments passed to a function, samples them, and converts them to PySpark DataFrames. It then executes the provided Spark function and compares the outputs of the two implementations.
|
150
|
+
|
151
|
+
```python
|
152
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
153
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
154
|
+
from typing import Callable, Optional, TypeVar
|
155
|
+
|
156
|
+
fn = TypeVar("fn", bound=Callable)
|
157
|
+
|
158
|
+
# Signature of the decorator
|
159
|
+
def check_with_spark(
|
160
|
+
job_context: Optional[SnowparkJobContext],
|
161
|
+
spark_function: fn,
|
162
|
+
checkpoint_name: str,
|
163
|
+
sample_number: Optional[int] = 100,
|
164
|
+
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
165
|
+
output_path: Optional[str] = None,
|
166
|
+
) -> Callable[[fn], fn]:
|
167
|
+
...
|
168
|
+
```
|
169
|
+
|
170
|
+
- `job_context`: Snowpark job context.
|
171
|
+
- `spark_function`: PySpark function to execute.
|
172
|
+
- `checkpoint_name`: Name of the check.
|
173
|
+
- `sample_number`: Number of rows to sample.
|
174
|
+
- `sampling_strategy`: Sampling strategy to use.
|
175
|
+
- `output_path`: Output path for the checkpoint report.
|
176
|
+
|
177
|
+
### Usage Example
|
178
|
+
|
179
|
+
```python
|
180
|
+
from snowflake.snowpark import Session
|
181
|
+
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
182
|
+
from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
|
183
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
184
|
+
from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
|
185
|
+
|
186
|
+
session = Session.builder.getOrCreate()
|
187
|
+
job_context = SnowparkJobContext(
|
188
|
+
session, SparkSession.builder.getOrCreate(), "job_context", True
|
189
|
+
)
|
190
|
+
|
191
|
+
def my_spark_scalar_fn(df: SparkDataFrame):
|
192
|
+
return df.count()
|
193
|
+
|
194
|
+
@check_with_spark(
|
195
|
+
job_context=job_context,
|
196
|
+
spark_function=my_spark_scalar_fn,
|
197
|
+
checkpoint_name="count_checkpoint",
|
198
|
+
)
|
199
|
+
def my_snowpark_scalar_fn(df: SnowparkDataFrame):
|
200
|
+
return df.count()
|
201
|
+
|
202
|
+
df = job_context.snowpark_session.create_dataframe(
|
203
|
+
[[1, 2], [3, 4]], schema=["a", "b"]
|
204
|
+
)
|
205
|
+
count = my_snowpark_scalar_fn(df)
|
206
|
+
```
|
207
|
+
|
208
|
+
### Pandera Snowpark Decorators
|
209
|
+
|
210
|
+
The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark dataframes in the input arguments or in the return value.
|
211
|
+
|
212
|
+
```python
|
213
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
214
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
215
|
+
from pandera import DataFrameSchema
|
216
|
+
from typing import Optional
|
217
|
+
|
218
|
+
# Signature of the decorator
|
219
|
+
def check_input_schema(
|
220
|
+
pandera_schema: DataFrameSchema,
|
221
|
+
checkpoint_name: str,
|
222
|
+
sample_frac: Optional[float] = 1.0,
|
223
|
+
sample_number: Optional[int] = None,
|
224
|
+
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
225
|
+
job_context: Optional[SnowparkJobContext] = None,
|
226
|
+
output_path: Optional[str] = None,
|
227
|
+
):
|
228
|
+
...
|
229
|
+
|
230
|
+
# Signature of the decorator
|
231
|
+
def check_output_schema(
|
232
|
+
pandera_schema: DataFrameSchema,
|
233
|
+
checkpoint_name: str,
|
234
|
+
sample_frac: Optional[float] = 1.0,
|
235
|
+
sample_number: Optional[int] = None,
|
236
|
+
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
237
|
+
job_context: Optional[SnowparkJobContext] = None,
|
238
|
+
output_path: Optional[str] = None,
|
239
|
+
):
|
240
|
+
...
|
241
|
+
```
|
242
|
+
|
243
|
+
- `pandera_schema`: Pandera schema to validate.
|
244
|
+
- `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
|
245
|
+
- `sample_frac`: Fraction of the DataFrame to sample.
|
246
|
+
- `sample_number`: Number of rows to sample.
|
247
|
+
- `sampling_strategy`: Sampling strategy to use.
|
248
|
+
- `job_context`: Snowpark job context.
|
249
|
+
- `output_path`: Output path for the checkpoint report.
|
250
|
+
|
251
|
+
### Usage Example
|
252
|
+
|
253
|
+
#### Check Input Schema Example
|
254
|
+
```python
|
255
|
+
from pandas import DataFrame as PandasDataFrame
|
256
|
+
from pandera import DataFrameSchema, Column, Check
|
257
|
+
from snowflake.snowpark import Session
|
258
|
+
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
259
|
+
from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
|
260
|
+
from numpy import int8
|
261
|
+
|
262
|
+
df = PandasDataFrame(
|
263
|
+
{
|
264
|
+
"COLUMN1": [1, 4, 0, 10, 9],
|
265
|
+
"COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
|
266
|
+
}
|
267
|
+
)
|
268
|
+
|
269
|
+
in_schema = DataFrameSchema(
|
270
|
+
{
|
271
|
+
"COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
|
272
|
+
"COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
|
273
|
+
}
|
274
|
+
)
|
275
|
+
|
276
|
+
@check_input_schema(in_schema, "input_schema_checkpoint")
|
277
|
+
def preprocessor(dataframe: SnowparkDataFrame):
|
278
|
+
dataframe = dataframe.withColumn(
|
279
|
+
"COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
|
280
|
+
)
|
281
|
+
return dataframe
|
282
|
+
|
283
|
+
session = Session.builder.getOrCreate()
|
284
|
+
sp_dataframe = session.create_dataframe(df)
|
285
|
+
|
286
|
+
preprocessed_dataframe = preprocessor(sp_dataframe)
|
287
|
+
```
|
288
|
+
|
289
|
+
#### Check Output Schema Example
|
290
|
+
```python
|
291
|
+
from pandas import DataFrame as PandasDataFrame
|
292
|
+
from pandera import DataFrameSchema, Column, Check
|
293
|
+
from snowflake.snowpark import Session
|
294
|
+
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
295
|
+
from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
|
296
|
+
from numpy import int8
|
297
|
+
|
298
|
+
df = PandasDataFrame(
|
299
|
+
{
|
300
|
+
"COLUMN1": [1, 4, 0, 10, 9],
|
301
|
+
"COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
|
302
|
+
}
|
303
|
+
)
|
304
|
+
|
305
|
+
out_schema = DataFrameSchema(
|
306
|
+
{
|
307
|
+
"COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
|
308
|
+
"COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
|
309
|
+
"COLUMN3": Column(float, Check.less_than(10)),
|
310
|
+
}
|
311
|
+
)
|
312
|
+
|
313
|
+
@check_output_schema(out_schema, "output_schema_checkpoint")
|
314
|
+
def preprocessor(dataframe: SnowparkDataFrame):
|
315
|
+
return dataframe.with_column(
|
316
|
+
"COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
|
317
|
+
)
|
318
|
+
|
319
|
+
session = Session.builder.getOrCreate()
|
320
|
+
sp_dataframe = session.create_dataframe(df)
|
321
|
+
|
322
|
+
preprocessed_dataframe = preprocessor(sp_dataframe)
|
323
|
+
```
|
324
|
+
|
325
|
+
------
|
{snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/README.md
RENAMED
@@ -1,14 +1,22 @@
|
|
1
1
|
# snowpark-checkpoints-validators
|
2
2
|
|
3
3
|
---
|
4
|
-
|
5
|
-
|
6
|
-
This package is on Private Preview.
|
7
|
-
|
4
|
+
##### This package is in Public Preview.
|
8
5
|
---
|
9
6
|
|
10
7
|
**snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
|
11
8
|
|
9
|
+
---
|
10
|
+
## Install the library
|
11
|
+
```bash
|
12
|
+
pip install snowpark-checkpoints-validators
|
13
|
+
```
|
14
|
+
This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
|
15
|
+
```bash
|
16
|
+
pip install "snowpark-checkpoints-validators[pyspark]"
|
17
|
+
```
|
18
|
+
---
|
19
|
+
|
12
20
|
## Features
|
13
21
|
|
14
22
|
- Validate Snowpark DataFrames against predefined Pandera schemas.
|
{snowpark_checkpoints_validators-0.2.0rc1 → snowpark_checkpoints_validators-0.3.0}/pyproject.toml
RENAMED
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
|
|
3
3
|
requires = ["hatchling"]
|
4
4
|
|
5
5
|
[project]
|
6
|
-
authors = [
|
6
|
+
authors = [
|
7
|
+
{name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
|
8
|
+
]
|
7
9
|
classifiers = [
|
8
10
|
"Development Status :: 4 - Beta",
|
9
11
|
"Environment :: Console",
|
@@ -24,13 +26,13 @@ classifiers = [
|
|
24
26
|
"Topic :: Scientific/Engineering :: Information Analysis",
|
25
27
|
]
|
26
28
|
dependencies = [
|
27
|
-
"snowflake-snowpark-python",
|
28
|
-
"snowflake-connector-python",
|
29
|
-
"pyspark",
|
29
|
+
"snowflake-snowpark-python>=1.23.0",
|
30
|
+
"snowflake-connector-python[pandas]",
|
30
31
|
"pandera[io]==0.20.4",
|
31
|
-
"
|
32
|
+
"pydantic>=2.0"
|
32
33
|
]
|
33
34
|
description = "Migration tools for Snowpark"
|
35
|
+
dynamic = ['version']
|
34
36
|
keywords = [
|
35
37
|
'Snowflake',
|
36
38
|
'analytics',
|
@@ -39,14 +41,17 @@ keywords = [
|
|
39
41
|
'db',
|
40
42
|
'Snowpark',
|
41
43
|
]
|
42
|
-
license = {
|
44
|
+
license = {text = "Apache License, Version 2.0"}
|
43
45
|
name = "snowpark-checkpoints-validators"
|
44
46
|
readme = "README.md"
|
45
47
|
requires-python = '>=3.9,<3.12'
|
46
|
-
dynamic = ['version']
|
47
48
|
|
48
49
|
[project.optional-dependencies]
|
50
|
+
pyspark = [
|
51
|
+
"pyspark>=3.5.0",
|
52
|
+
]
|
49
53
|
development = [
|
54
|
+
"deepdiff==8.1.1",
|
50
55
|
"pytest>=8.3.3",
|
51
56
|
"pytest-cov>=6.0.0",
|
52
57
|
"coverage>=7.6.7",
|
@@ -56,6 +61,8 @@ development = [
|
|
56
61
|
"setuptools>=70.0.0",
|
57
62
|
"pyarrow>=18.0.0",
|
58
63
|
"deepdiff>=8.0.0",
|
64
|
+
"pyspark>=3.5.0",
|
65
|
+
"certifi==2025.1.31",
|
59
66
|
]
|
60
67
|
|
61
68
|
[project.urls]
|
@@ -63,7 +70,7 @@ development = [
|
|
63
70
|
"Source code" = "https://github.com/snowflakedb/snowpark-checkpoints/"
|
64
71
|
|
65
72
|
[tool.hatch.version]
|
66
|
-
path = "__version__.py"
|
73
|
+
path = "src/snowflake/snowpark_checkpoints/__version__.py"
|
67
74
|
pattern = '^__version__ = "(?P<version>.*)"'
|
68
75
|
source = "regex"
|
69
76
|
|
@@ -75,15 +82,22 @@ where = ["src/"]
|
|
75
82
|
dev-mode-dirs = ['src']
|
76
83
|
directory = 'snowpark-checkpoints-validators'
|
77
84
|
|
85
|
+
[[tool.hatch.sources]]
|
86
|
+
dir = "src/snowflake/snowpark_checkpoints"
|
87
|
+
name = "snowpark-checkpoints-validators"
|
88
|
+
type = "package"
|
89
|
+
|
78
90
|
[tool.hatch.build.targets.wheel]
|
79
91
|
directory = "dist"
|
80
|
-
packages = [
|
92
|
+
packages = [
|
93
|
+
"src/snowflake",
|
94
|
+
]
|
81
95
|
|
82
96
|
[tool.hatch.build.targets.sdist]
|
83
97
|
directory = "dist"
|
84
98
|
exclude = ["/.github", "/.idea"]
|
85
99
|
include = [
|
86
|
-
'src
|
100
|
+
'src/**',
|
87
101
|
'README.md',
|
88
102
|
'LICENSE',
|
89
103
|
'test/',
|
@@ -118,8 +132,10 @@ check = [
|
|
118
132
|
|
119
133
|
[tool.hatch.envs.test.scripts]
|
120
134
|
check = [
|
121
|
-
|
122
|
-
'
|
135
|
+
'python -m pip install --upgrade pip -q',
|
136
|
+
'pip install -q -e ../snowpark-checkpoints-configuration',
|
137
|
+
'pip list',
|
138
|
+
'pytest -vvv --junitxml=test/outcome/test-results.xml --cov=. --cov-branch --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=html:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.html --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
|
123
139
|
]
|
124
140
|
|
125
141
|
coverage = [
|
@@ -0,0 +1,44 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
import logging
|
17
|
+
|
18
|
+
|
19
|
+
# Add a NullHandler to prevent logging messages from being output to
|
20
|
+
# sys.stderr if no logging configuration is provided.
|
21
|
+
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
22
|
+
|
23
|
+
# ruff: noqa: E402
|
24
|
+
|
25
|
+
from snowflake.snowpark_checkpoints.checkpoint import (
|
26
|
+
check_dataframe_schema,
|
27
|
+
check_input_schema,
|
28
|
+
check_output_schema,
|
29
|
+
validate_dataframe_checkpoint,
|
30
|
+
)
|
31
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
32
|
+
from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
|
33
|
+
from snowflake.snowpark_checkpoints.utils.constants import CheckpointMode
|
34
|
+
|
35
|
+
|
36
|
+
__all__ = [
|
37
|
+
"check_with_spark",
|
38
|
+
"SnowparkJobContext",
|
39
|
+
"check_dataframe_schema",
|
40
|
+
"check_output_schema",
|
41
|
+
"check_input_schema",
|
42
|
+
"validate_dataframe_checkpoint",
|
43
|
+
"CheckpointMode",
|
44
|
+
]
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
__version__ = "0.3.0"
|