snowpark-checkpoints-validators 0.1.0rc1__tar.gz → 0.1.0rc3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/.gitignore +3 -0
- snowpark_checkpoints_validators-0.1.0rc3/PKG-INFO +313 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/README.md +119 -51
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/pyproject.toml +6 -4
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/__init__.py +2 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/checkpoint.py +90 -89
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/errors.py +1 -1
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/job_context.py +14 -3
- snowpark_checkpoints_validators-0.1.0rc3/src/snowflake/snowpark_checkpoints/singleton.py +12 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/spark_migration.py +5 -11
- snowpark_checkpoints_validators-0.1.0rc1/src/snowflake/snowpark_checkpoints/utils/constant.py → snowpark_checkpoints_validators-0.1.0rc3/src/snowflake/snowpark_checkpoints/utils/constants.py +9 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/extra_config.py +1 -1
- snowpark_checkpoints_validators-0.1.0rc3/src/snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +358 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/supported_types.py +1 -1
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/telemetry.py +355 -112
- snowpark_checkpoints_validators-0.1.0rc3/src/snowflake/snowpark_checkpoints/utils/utils_checks.py +361 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/validation_result_metadata.py +16 -12
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_compare_utils.py +60 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/df_mode_dataframe_mismatch_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/df_mode_dataframe_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_scalar_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_scalar_passing_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_df_check_custom_check_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_df_check_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_df_check_from_file_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_df_check_skip_check_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_df_check_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_input_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_input_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_output_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_output_telemetry.json +18 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_pandera.py +185 -22
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_parquet.py +34 -5
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_spark_checkpoint.py +45 -6
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_extra_config.py +1 -1
- snowpark_checkpoints_validators-0.1.0rc3/test/unit/test_pandera_check_manager.py +785 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_telemetry.py +97 -18
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_utils_checks.py +33 -376
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_validation_result_metadata.py +44 -1
- snowpark_checkpoints_validators-0.1.0rc1/PKG-INFO +0 -446
- snowpark_checkpoints_validators-0.1.0rc1/src/snowflake/snowpark_checkpoints/utils/utils_checks.py +0 -560
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/LICENSE +0 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/snowpark_sampler.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/__init__.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/checkpoint_logger.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/validation_results.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/.coveragerc +0 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/e2eexample.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_spark_migration.py +0 -0
@@ -0,0 +1,313 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: snowpark-checkpoints-validators
|
3
|
+
Version: 0.1.0rc3
|
4
|
+
Summary: Migration tools for Snowpark
|
5
|
+
Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
|
6
|
+
Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
|
7
|
+
Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
|
8
|
+
License: Apache License, Version 2.0
|
9
|
+
License-File: LICENSE
|
10
|
+
Keywords: Snowflake,Snowpark,analytics,cloud,database,db
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
12
|
+
Classifier: Environment :: Console
|
13
|
+
Classifier: Environment :: Other Environment
|
14
|
+
Classifier: Intended Audience :: Developers
|
15
|
+
Classifier: Intended Audience :: Education
|
16
|
+
Classifier: Intended Audience :: Information Technology
|
17
|
+
Classifier: Intended Audience :: System Administrators
|
18
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
19
|
+
Classifier: Operating System :: OS Independent
|
20
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
21
|
+
Classifier: Programming Language :: SQL
|
22
|
+
Classifier: Topic :: Database
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
24
|
+
Classifier: Topic :: Software Development
|
25
|
+
Classifier: Topic :: Software Development :: Libraries
|
26
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
|
+
Requires-Python: <3.12,>=3.9
|
29
|
+
Requires-Dist: pandera-report==0.1.2
|
30
|
+
Requires-Dist: pandera[io]==0.20.4
|
31
|
+
Requires-Dist: pyspark
|
32
|
+
Requires-Dist: snowflake-connector-python
|
33
|
+
Requires-Dist: snowflake-snowpark-python
|
34
|
+
Provides-Extra: development
|
35
|
+
Requires-Dist: coverage>=7.6.7; extra == 'development'
|
36
|
+
Requires-Dist: deepdiff>=8.0.0; extra == 'development'
|
37
|
+
Requires-Dist: hatchling==1.25.0; extra == 'development'
|
38
|
+
Requires-Dist: pre-commit>=4.0.1; extra == 'development'
|
39
|
+
Requires-Dist: pyarrow>=18.0.0; extra == 'development'
|
40
|
+
Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
|
41
|
+
Requires-Dist: pytest>=8.3.3; extra == 'development'
|
42
|
+
Requires-Dist: setuptools>=70.0.0; extra == 'development'
|
43
|
+
Requires-Dist: twine==5.1.1; extra == 'development'
|
44
|
+
Description-Content-Type: text/markdown
|
45
|
+
|
46
|
+
# snowpark-checkpoints-validators
|
47
|
+
|
48
|
+
---
|
49
|
+
**NOTE**
|
50
|
+
|
51
|
+
This package is on Private Preview.
|
52
|
+
|
53
|
+
---
|
54
|
+
|
55
|
+
**snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
|
56
|
+
|
57
|
+
## Features
|
58
|
+
|
59
|
+
- Validate Snowpark DataFrames against predefined Pandera schemas.
|
60
|
+
- Perform custom checks and skip specific checks as needed.
|
61
|
+
- Generate validation results and log them for further analysis.
|
62
|
+
- Support for sampling strategies to validate large datasets efficiently.
|
63
|
+
- Integration with PySpark for cross-validation between Snowpark and PySpark DataFrames.
|
64
|
+
|
65
|
+
## Functionalities
|
66
|
+
|
67
|
+
### Validate DataFrame Schema from File
|
68
|
+
|
69
|
+
The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or dataframe.
|
70
|
+
|
71
|
+
```python
|
72
|
+
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
73
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
74
|
+
from snowflake.snowpark_checkpoints.utils.constant import (
|
75
|
+
CheckpointMode,
|
76
|
+
)
|
77
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
78
|
+
from typing import Any, Optional
|
79
|
+
|
80
|
+
# Signature of the function
|
81
|
+
def validate_dataframe_checkpoint(
|
82
|
+
df: SnowparkDataFrame,
|
83
|
+
checkpoint_name: str,
|
84
|
+
job_context: Optional[SnowparkJobContext] = None,
|
85
|
+
mode: Optional[CheckpointMode] = CheckpointMode.SCHEMA,
|
86
|
+
custom_checks: Optional[dict[Any, Any]] = None,
|
87
|
+
skip_checks: Optional[dict[Any, Any]] = None,
|
88
|
+
sample_frac: Optional[float] = 1.0,
|
89
|
+
sample_number: Optional[int] = None,
|
90
|
+
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
91
|
+
output_path: Optional[str] = None,
|
92
|
+
):
|
93
|
+
...
|
94
|
+
```
|
95
|
+
|
96
|
+
- `df`: Snowpark dataframe to validate.
|
97
|
+
- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
|
98
|
+
- `job_context`: Snowpark job context.
|
99
|
+
- `mode`: Checkpoint mode (schema or data).
|
100
|
+
- `custom_checks`: Custom checks to perform.
|
101
|
+
- `skip_checks`: Checks to skip.
|
102
|
+
- `sample_frac`: Fraction of the dataframe to sample.
|
103
|
+
- `sample_number`: Number of rows to sample.
|
104
|
+
- `sampling_strategy`: Sampling strategy to use.
|
105
|
+
- `output_path`: Output path for the checkpoint report.
|
106
|
+
|
107
|
+
### Usage Example
|
108
|
+
|
109
|
+
```python
|
110
|
+
from snowflake.snowpark import Session
|
111
|
+
from snowflake.snowpark_checkpoints.utils.constant import (
|
112
|
+
CheckpointMode,
|
113
|
+
)
|
114
|
+
from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
|
115
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
116
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
117
|
+
from pyspark.sql import SparkSession
|
118
|
+
|
119
|
+
session = Session.builder.getOrCreate()
|
120
|
+
job_context = SnowparkJobContext(
|
121
|
+
session, SparkSession.builder.getOrCreate(), "job_context", True
|
122
|
+
)
|
123
|
+
df = session.read.format("csv").load("data.csv")
|
124
|
+
|
125
|
+
validate_dataframe_checkpoint(
|
126
|
+
df,
|
127
|
+
"schema_checkpoint",
|
128
|
+
job_context=job_context,
|
129
|
+
mode=CheckpointMode.SCHEMA,
|
130
|
+
sample_frac=0.1,
|
131
|
+
sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
|
132
|
+
)
|
133
|
+
```
|
134
|
+
|
135
|
+
### Check with Spark Decorator
|
136
|
+
|
137
|
+
The `check_with_spark` decorator converts any Snowpark dataframe arguments to a function, samples them, and converts them to PySpark dataframe. It then executes a provided Spark function and compares the outputs between the two implementations.
|
138
|
+
|
139
|
+
```python
|
140
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
141
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
142
|
+
from typing import Callable, Optional, TypeVar
|
143
|
+
|
144
|
+
fn = TypeVar("F", bound=Callable)
|
145
|
+
|
146
|
+
# Signature of the decorator
|
147
|
+
def check_with_spark(
|
148
|
+
job_context: Optional[SnowparkJobContext],
|
149
|
+
spark_function: fn,
|
150
|
+
checkpoint_name: str,
|
151
|
+
sample_number: Optional[int] = 100,
|
152
|
+
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
153
|
+
output_path: Optional[str] = None,
|
154
|
+
) -> Callable[[fn], fn]:
|
155
|
+
...
|
156
|
+
```
|
157
|
+
|
158
|
+
- `job_context`: Snowpark job context.
|
159
|
+
- `spark_function`: PySpark function to execute.
|
160
|
+
- `checkpoint_name`: Name of the check.
|
161
|
+
- `sample_number`: Number of rows to sample.
|
162
|
+
- `sampling_strategy`: Sampling strategy to use.
|
163
|
+
- `output_path`: Output path for the checkpoint report.
|
164
|
+
|
165
|
+
### Usage Example
|
166
|
+
|
167
|
+
```python
|
168
|
+
from snowflake.snowpark import Session
|
169
|
+
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
170
|
+
from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
|
171
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
172
|
+
from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
|
173
|
+
|
174
|
+
session = Session.builder.getOrCreate()
|
175
|
+
job_context = SnowparkJobContext(
|
176
|
+
session, SparkSession.builder.getOrCreate(), "job_context", True
|
177
|
+
)
|
178
|
+
|
179
|
+
def my_spark_scalar_fn(df: SparkDataFrame):
|
180
|
+
return df.count()
|
181
|
+
|
182
|
+
@check_with_spark(
|
183
|
+
job_context=job_context,
|
184
|
+
spark_function=my_spark_scalar_fn,
|
185
|
+
checkpoint_name="count_checkpoint",
|
186
|
+
)
|
187
|
+
def my_snowpark_scalar_fn(df: SnowparkDataFrame):
|
188
|
+
return df.count()
|
189
|
+
|
190
|
+
df = job_context.snowpark_session.create_dataframe(
|
191
|
+
[[1, 2], [3, 4]], schema=["a", "b"]
|
192
|
+
)
|
193
|
+
count = my_snowpark_scalar_fn(df)
|
194
|
+
```
|
195
|
+
|
196
|
+
### Pandera Snowpark Decorators
|
197
|
+
|
198
|
+
The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark dataframes in the input arguments or in the return value.
|
199
|
+
|
200
|
+
```python
|
201
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
202
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
203
|
+
from pandera import DataFrameSchema
|
204
|
+
from typing import Optional
|
205
|
+
|
206
|
+
# Signature of the decorator
|
207
|
+
def check_input_schema(
|
208
|
+
pandera_schema: DataFrameSchema,
|
209
|
+
checkpoint_name: str,
|
210
|
+
sample_frac: Optional[float] = 1.0,
|
211
|
+
sample_number: Optional[int] = None,
|
212
|
+
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
213
|
+
job_context: Optional[SnowparkJobContext] = None,
|
214
|
+
output_path: Optional[str] = None,
|
215
|
+
):
|
216
|
+
...
|
217
|
+
|
218
|
+
# Signature of the decorator
|
219
|
+
def check_output_schema(
|
220
|
+
pandera_schema: DataFrameSchema,
|
221
|
+
checkpoint_name: str,
|
222
|
+
sample_frac: Optional[float] = 1.0,
|
223
|
+
sample_number: Optional[int] = None,
|
224
|
+
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
225
|
+
job_context: Optional[SnowparkJobContext] = None,
|
226
|
+
output_path: Optional[str] = None,
|
227
|
+
):
|
228
|
+
...
|
229
|
+
```
|
230
|
+
|
231
|
+
- `pandera_schema`: Pandera schema to validate.
|
232
|
+
- `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
|
233
|
+
- `sample_frac`: Fraction of the DataFrame to sample.
|
234
|
+
- `sample_number`: Number of rows to sample.
|
235
|
+
- `sampling_strategy`: Sampling strategy to use.
|
236
|
+
- `job_context`: Snowpark job context.
|
237
|
+
- `output_path`: Output path for the checkpoint report.
|
238
|
+
|
239
|
+
### Usage Example
|
240
|
+
|
241
|
+
#### Check Input Schema Example
|
242
|
+
```python
|
243
|
+
from pandas import DataFrame as PandasDataFrame
|
244
|
+
from pandera import DataFrameSchema, Column, Check
|
245
|
+
from snowflake.snowpark import Session
|
246
|
+
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
247
|
+
from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
|
248
|
+
from numpy import int8
|
249
|
+
|
250
|
+
df = PandasDataFrame(
|
251
|
+
{
|
252
|
+
"COLUMN1": [1, 4, 0, 10, 9],
|
253
|
+
"COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
|
254
|
+
}
|
255
|
+
)
|
256
|
+
|
257
|
+
in_schema = DataFrameSchema(
|
258
|
+
{
|
259
|
+
"COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
|
260
|
+
"COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
|
261
|
+
}
|
262
|
+
)
|
263
|
+
|
264
|
+
@check_input_schema(in_schema, "input_schema_checkpoint")
|
265
|
+
def preprocessor(dataframe: SnowparkDataFrame):
|
266
|
+
dataframe = dataframe.withColumn(
|
267
|
+
"COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
|
268
|
+
)
|
269
|
+
return dataframe
|
270
|
+
|
271
|
+
session = Session.builder.getOrCreate()
|
272
|
+
sp_dataframe = session.create_dataframe(df)
|
273
|
+
|
274
|
+
preprocessed_dataframe = preprocessor(sp_dataframe)
|
275
|
+
```
|
276
|
+
|
277
|
+
#### Check Output Schema Example
|
278
|
+
```python
|
279
|
+
from pandas import DataFrame as PandasDataFrame
|
280
|
+
from pandera import DataFrameSchema, Column, Check
|
281
|
+
from snowflake.snowpark import Session
|
282
|
+
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
283
|
+
from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
|
284
|
+
from numpy import int8
|
285
|
+
|
286
|
+
df = PandasDataFrame(
|
287
|
+
{
|
288
|
+
"COLUMN1": [1, 4, 0, 10, 9],
|
289
|
+
"COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
|
290
|
+
}
|
291
|
+
)
|
292
|
+
|
293
|
+
out_schema = DataFrameSchema(
|
294
|
+
{
|
295
|
+
"COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
|
296
|
+
"COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
|
297
|
+
"COLUMN3": Column(float, Check.less_than(10)),
|
298
|
+
}
|
299
|
+
)
|
300
|
+
|
301
|
+
@check_output_schema(out_schema, "output_schema_checkpoint")
|
302
|
+
def preprocessor(dataframe: SnowparkDataFrame):
|
303
|
+
return dataframe.with_column(
|
304
|
+
"COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
|
305
|
+
)
|
306
|
+
|
307
|
+
session = Session.builder.getOrCreate()
|
308
|
+
sp_dataframe = session.create_dataframe(df)
|
309
|
+
|
310
|
+
preprocessed_dataframe = preprocessor(sp_dataframe)
|
311
|
+
```
|
312
|
+
|
313
|
+
------
|
{snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/README.md
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# snowpark-checkpoints-validators
|
2
2
|
|
3
3
|
---
|
4
4
|
**NOTE**
|
@@ -24,9 +24,16 @@ This package is on Private Preview.
|
|
24
24
|
The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or dataframe.
|
25
25
|
|
26
26
|
```python
|
27
|
-
from snowflake.
|
27
|
+
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
28
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
29
|
+
from snowflake.snowpark_checkpoints.utils.constant import (
|
30
|
+
CheckpointMode,
|
31
|
+
)
|
32
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
33
|
+
from typing import Any, Optional
|
28
34
|
|
29
|
-
|
35
|
+
# Signature of the function
|
36
|
+
def validate_dataframe_checkpoint(
|
30
37
|
df: SnowparkDataFrame,
|
31
38
|
checkpoint_name: str,
|
32
39
|
job_context: Optional[SnowparkJobContext] = None,
|
@@ -37,16 +44,17 @@ validate_dataframe_checkpoint(
|
|
37
44
|
sample_number: Optional[int] = None,
|
38
45
|
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
39
46
|
output_path: Optional[str] = None,
|
40
|
-
)
|
47
|
+
):
|
48
|
+
...
|
41
49
|
```
|
42
50
|
|
43
|
-
- `df`: Snowpark
|
44
|
-
- `checkpoint_name`: Name of the checkpoint schema file or
|
51
|
+
- `df`: Snowpark dataframe to validate.
|
52
|
+
- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
|
45
53
|
- `job_context`: Snowpark job context.
|
46
54
|
- `mode`: Checkpoint mode (schema or data).
|
47
55
|
- `custom_checks`: Custom checks to perform.
|
48
56
|
- `skip_checks`: Checks to skip.
|
49
|
-
- `sample_frac`: Fraction of the
|
57
|
+
- `sample_frac`: Fraction of the dataframe to sample.
|
50
58
|
- `sample_number`: Number of rows to sample.
|
51
59
|
- `sampling_strategy`: Sampling strategy to use.
|
52
60
|
- `output_path`: Output path for the checkpoint report.
|
@@ -55,16 +63,24 @@ validate_dataframe_checkpoint(
|
|
55
63
|
|
56
64
|
```python
|
57
65
|
from snowflake.snowpark import Session
|
58
|
-
from snowflake.
|
66
|
+
from snowflake.snowpark_checkpoints.utils.constant import (
|
67
|
+
CheckpointMode,
|
68
|
+
)
|
59
69
|
from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
|
70
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
71
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
72
|
+
from pyspark.sql import SparkSession
|
60
73
|
|
61
74
|
session = Session.builder.getOrCreate()
|
75
|
+
job_context = SnowparkJobContext(
|
76
|
+
session, SparkSession.builder.getOrCreate(), "job_context", True
|
77
|
+
)
|
62
78
|
df = session.read.format("csv").load("data.csv")
|
63
79
|
|
64
80
|
validate_dataframe_checkpoint(
|
65
81
|
df,
|
66
82
|
"schema_checkpoint",
|
67
|
-
job_context=
|
83
|
+
job_context=job_context,
|
68
84
|
mode=CheckpointMode.SCHEMA,
|
69
85
|
sample_frac=0.1,
|
70
86
|
sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
|
@@ -73,22 +89,24 @@ validate_dataframe_checkpoint(
|
|
73
89
|
|
74
90
|
### Check with Spark Decorator
|
75
91
|
|
76
|
-
The `check_with_spark` decorator converts any Snowpark
|
92
|
+
The `check_with_spark` decorator converts any Snowpark dataframe arguments to a function, samples them, and converts them to PySpark dataframe. It then executes a provided Spark function and compares the outputs between the two implementations.
|
77
93
|
|
78
94
|
```python
|
79
|
-
from snowflake.snowpark_checkpoints.
|
95
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
96
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
97
|
+
from typing import Callable, Optional, TypeVar
|
80
98
|
|
81
|
-
|
99
|
+
fn = TypeVar("F", bound=Callable)
|
100
|
+
|
101
|
+
# Signature of the decorator
|
102
|
+
def check_with_spark(
|
82
103
|
job_context: Optional[SnowparkJobContext],
|
83
|
-
spark_function:
|
104
|
+
spark_function: fn,
|
84
105
|
checkpoint_name: str,
|
85
106
|
sample_number: Optional[int] = 100,
|
86
107
|
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
87
|
-
check_dtypes: Optional[bool] = False,
|
88
|
-
check_with_precision: Optional[bool] = False,
|
89
108
|
output_path: Optional[str] = None,
|
90
|
-
)
|
91
|
-
def snowpark_fn(df: SnowparkDataFrame):
|
109
|
+
) -> Callable[[fn], fn]:
|
92
110
|
...
|
93
111
|
```
|
94
112
|
|
@@ -97,8 +115,6 @@ def snowpark_fn(df: SnowparkDataFrame):
|
|
97
115
|
- `checkpoint_name`: Name of the check.
|
98
116
|
- `sample_number`: Number of rows to sample.
|
99
117
|
- `sampling_strategy`: Sampling strategy to use.
|
100
|
-
- `check_dtypes`: Check data types.
|
101
|
-
- `check_with_precision`: Check with precision.
|
102
118
|
- `output_path`: Output path for the checkpoint report.
|
103
119
|
|
104
120
|
### Usage Example
|
@@ -107,52 +123,63 @@ def snowpark_fn(df: SnowparkDataFrame):
|
|
107
123
|
from snowflake.snowpark import Session
|
108
124
|
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
109
125
|
from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
|
126
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
127
|
+
from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
|
110
128
|
|
111
129
|
session = Session.builder.getOrCreate()
|
112
|
-
|
130
|
+
job_context = SnowparkJobContext(
|
131
|
+
session, SparkSession.builder.getOrCreate(), "job_context", True
|
132
|
+
)
|
133
|
+
|
134
|
+
def my_spark_scalar_fn(df: SparkDataFrame):
|
135
|
+
return df.count()
|
113
136
|
|
114
137
|
@check_with_spark(
|
115
|
-
job_context=
|
116
|
-
spark_function=
|
117
|
-
checkpoint_name="
|
118
|
-
sample_number=100,
|
119
|
-
sampling_strategy=SamplingStrategy.RANDOM_SAMPLE,
|
138
|
+
job_context=job_context,
|
139
|
+
spark_function=my_spark_scalar_fn,
|
140
|
+
checkpoint_name="count_checkpoint",
|
120
141
|
)
|
121
|
-
def
|
122
|
-
return df.
|
142
|
+
def my_snowpark_scalar_fn(df: SnowparkDataFrame):
|
143
|
+
return df.count()
|
123
144
|
|
124
|
-
|
145
|
+
df = job_context.snowpark_session.create_dataframe(
|
146
|
+
[[1, 2], [3, 4]], schema=["a", "b"]
|
147
|
+
)
|
148
|
+
count = my_snowpark_scalar_fn(df)
|
125
149
|
```
|
126
150
|
|
127
151
|
### Pandera Snowpark Decorators
|
128
152
|
|
129
|
-
The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark
|
153
|
+
The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark dataframes in the input arguments or in the return value.
|
130
154
|
|
131
155
|
```python
|
132
|
-
from snowflake.snowpark_checkpoints.
|
156
|
+
from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
|
157
|
+
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
158
|
+
from pandera import DataFrameSchema
|
159
|
+
from typing import Optional
|
133
160
|
|
134
|
-
|
161
|
+
# Signature of the decorator
|
162
|
+
def check_input_schema(
|
135
163
|
pandera_schema: DataFrameSchema,
|
136
164
|
checkpoint_name: str,
|
137
165
|
sample_frac: Optional[float] = 1.0,
|
138
166
|
sample_number: Optional[int] = None,
|
139
167
|
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
140
|
-
job_context: Optional[SnowparkJobContext],
|
168
|
+
job_context: Optional[SnowparkJobContext] = None,
|
141
169
|
output_path: Optional[str] = None,
|
142
|
-
)
|
143
|
-
def snowpark_fn(df: SnowparkDataFrame):
|
170
|
+
):
|
144
171
|
...
|
145
172
|
|
146
|
-
|
173
|
+
# Signature of the decorator
|
174
|
+
def check_output_schema(
|
147
175
|
pandera_schema: DataFrameSchema,
|
148
176
|
checkpoint_name: str,
|
149
177
|
sample_frac: Optional[float] = 1.0,
|
150
178
|
sample_number: Optional[int] = None,
|
151
179
|
sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
|
152
|
-
job_context: Optional[SnowparkJobContext],
|
180
|
+
job_context: Optional[SnowparkJobContext] = None,
|
153
181
|
output_path: Optional[str] = None,
|
154
|
-
)
|
155
|
-
def snowpark_fn(df: SnowparkDataFrame):
|
182
|
+
):
|
156
183
|
...
|
157
184
|
```
|
158
185
|
|
@@ -166,28 +193,71 @@ def snowpark_fn(df: SnowparkDataFrame):
|
|
166
193
|
|
167
194
|
### Usage Example
|
168
195
|
|
169
|
-
|
196
|
+
#### Check Input Schema Example
|
197
|
+
```python
|
198
|
+
from pandas import DataFrame as PandasDataFrame
|
199
|
+
from pandera import DataFrameSchema, Column, Check
|
200
|
+
from snowflake.snowpark import Session
|
201
|
+
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
202
|
+
from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
|
203
|
+
from numpy import int8
|
204
|
+
|
205
|
+
df = PandasDataFrame(
|
206
|
+
{
|
207
|
+
"COLUMN1": [1, 4, 0, 10, 9],
|
208
|
+
"COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
|
209
|
+
}
|
210
|
+
)
|
170
211
|
|
212
|
+
in_schema = DataFrameSchema(
|
213
|
+
{
|
214
|
+
"COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
|
215
|
+
"COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
|
216
|
+
}
|
217
|
+
)
|
218
|
+
|
219
|
+
@check_input_schema(in_schema, "input_schema_checkpoint")
|
220
|
+
def preprocessor(dataframe: SnowparkDataFrame):
|
221
|
+
dataframe = dataframe.withColumn(
|
222
|
+
"COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
|
223
|
+
)
|
224
|
+
return dataframe
|
225
|
+
|
226
|
+
session = Session.builder.getOrCreate()
|
227
|
+
sp_dataframe = session.create_dataframe(df)
|
228
|
+
|
229
|
+
preprocessed_dataframe = preprocessor(sp_dataframe)
|
230
|
+
```
|
231
|
+
|
232
|
+
#### Check Output Schema Example
|
171
233
|
```python
|
172
234
|
from pandas import DataFrame as PandasDataFrame
|
173
235
|
from pandera import DataFrameSchema, Column, Check
|
174
236
|
from snowflake.snowpark import Session
|
175
237
|
from snowflake.snowpark import DataFrame as SnowparkDataFrame
|
176
238
|
from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
|
239
|
+
from numpy import int8
|
177
240
|
|
178
|
-
df = PandasDataFrame(
|
179
|
-
|
180
|
-
|
181
|
-
|
241
|
+
df = PandasDataFrame(
|
242
|
+
{
|
243
|
+
"COLUMN1": [1, 4, 0, 10, 9],
|
244
|
+
"COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
|
245
|
+
}
|
246
|
+
)
|
182
247
|
|
183
|
-
out_schema = DataFrameSchema(
|
184
|
-
|
185
|
-
|
186
|
-
|
248
|
+
out_schema = DataFrameSchema(
|
249
|
+
{
|
250
|
+
"COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
|
251
|
+
"COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
|
252
|
+
"COLUMN3": Column(float, Check.less_than(10)),
|
253
|
+
}
|
254
|
+
)
|
187
255
|
|
188
256
|
@check_output_schema(out_schema, "output_schema_checkpoint")
|
189
257
|
def preprocessor(dataframe: SnowparkDataFrame):
|
190
|
-
return dataframe.with_column(
|
258
|
+
return dataframe.with_column(
|
259
|
+
"COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
|
260
|
+
)
|
191
261
|
|
192
262
|
session = Session.builder.getOrCreate()
|
193
263
|
sp_dataframe = session.create_dataframe(df)
|
@@ -195,6 +265,4 @@ sp_dataframe = session.create_dataframe(df)
|
|
195
265
|
preprocessed_dataframe = preprocessor(sp_dataframe)
|
196
266
|
```
|
197
267
|
|
198
|
-
|
199
|
-
|
200
|
-
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for more details.
|
268
|
+
------
|
{snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/pyproject.toml
RENAMED
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
|
|
3
3
|
requires = ["hatchling"]
|
4
4
|
|
5
5
|
[project]
|
6
|
-
authors = [
|
6
|
+
authors = [
|
7
|
+
{name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
|
8
|
+
]
|
7
9
|
classifiers = [
|
8
10
|
"Development Status :: 4 - Beta",
|
9
11
|
"Environment :: Console",
|
@@ -31,6 +33,7 @@ dependencies = [
|
|
31
33
|
"pandera-report==0.1.2",
|
32
34
|
]
|
33
35
|
description = "Migration tools for Snowpark"
|
36
|
+
dynamic = ['version']
|
34
37
|
keywords = [
|
35
38
|
'Snowflake',
|
36
39
|
'analytics',
|
@@ -39,11 +42,10 @@ keywords = [
|
|
39
42
|
'db',
|
40
43
|
'Snowpark',
|
41
44
|
]
|
42
|
-
license = {
|
45
|
+
license = {text = "Apache License, Version 2.0"}
|
43
46
|
name = "snowpark-checkpoints-validators"
|
44
47
|
readme = "README.md"
|
45
48
|
requires-python = '>=3.9,<3.12'
|
46
|
-
dynamic = ['version']
|
47
49
|
|
48
50
|
[project.optional-dependencies]
|
49
51
|
development = [
|
@@ -118,7 +120,7 @@ check = [
|
|
118
120
|
|
119
121
|
[tool.hatch.envs.test.scripts]
|
120
122
|
check = [
|
121
|
-
"pip install -e ../snowpark-checkpoints-configuration"
|
123
|
+
"pip install -e ../snowpark-checkpoints-configuration",
|
122
124
|
'pytest -v --junitxml=test/outcome/test-results.xml --cov=. --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
|
123
125
|
]
|
124
126
|
|
@@ -10,6 +10,7 @@ from snowflake.snowpark_checkpoints.checkpoint import (
|
|
10
10
|
)
|
11
11
|
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
|
12
12
|
from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
|
13
|
+
from snowflake.snowpark_checkpoints.utils.constants import CheckpointMode
|
13
14
|
|
14
15
|
__all__ = [
|
15
16
|
"check_with_spark",
|
@@ -18,4 +19,5 @@ __all__ = [
|
|
18
19
|
"check_output_schema",
|
19
20
|
"check_input_schema",
|
20
21
|
"validate_dataframe_checkpoint",
|
22
|
+
"CheckpointMode",
|
21
23
|
]
|