snowpark-checkpoints-validators 0.1.0rc1__tar.gz → 0.1.0rc3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/.gitignore +3 -0
  2. snowpark_checkpoints_validators-0.1.0rc3/PKG-INFO +313 -0
  3. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/README.md +119 -51
  4. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/pyproject.toml +6 -4
  5. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/__init__.py +2 -0
  6. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/checkpoint.py +90 -89
  7. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/errors.py +1 -1
  8. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/job_context.py +14 -3
  9. snowpark_checkpoints_validators-0.1.0rc3/src/snowflake/snowpark_checkpoints/singleton.py +12 -0
  10. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/spark_migration.py +5 -11
  11. snowpark_checkpoints_validators-0.1.0rc1/src/snowflake/snowpark_checkpoints/utils/constant.py → snowpark_checkpoints_validators-0.1.0rc3/src/snowflake/snowpark_checkpoints/utils/constants.py +9 -0
  12. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/extra_config.py +1 -1
  13. snowpark_checkpoints_validators-0.1.0rc3/src/snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +358 -0
  14. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/supported_types.py +1 -1
  15. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/telemetry.py +355 -112
  16. snowpark_checkpoints_validators-0.1.0rc3/src/snowflake/snowpark_checkpoints/utils/utils_checks.py +361 -0
  17. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/validation_result_metadata.py +16 -12
  18. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_compare_utils.py +60 -0
  19. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/df_mode_dataframe_mismatch_telemetry.json +18 -0
  20. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/df_mode_dataframe_telemetry.json +18 -0
  21. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +18 -0
  22. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +18 -0
  23. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +18 -0
  24. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +18 -0
  25. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_scalar_fail_telemetry.json +18 -0
  26. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_scalar_passing_telemetry.json +18 -0
  27. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_df_check_custom_check_telemetry.json +18 -0
  28. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_df_check_fail_telemetry.json +18 -0
  29. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_df_check_from_file_telemetry.json +18 -0
  30. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_df_check_skip_check_telemetry.json +18 -0
  31. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_df_check_telemetry.json +18 -0
  32. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_input_fail_telemetry.json +18 -0
  33. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_input_telemetry.json +18 -0
  34. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_output_fail_telemetry.json +18 -0
  35. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/test_output_telemetry.json +18 -0
  36. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_pandera.py +185 -22
  37. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_parquet.py +34 -5
  38. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_spark_checkpoint.py +45 -6
  39. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_extra_config.py +1 -1
  40. snowpark_checkpoints_validators-0.1.0rc3/test/unit/test_pandera_check_manager.py +785 -0
  41. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_telemetry.py +97 -18
  42. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_utils_checks.py +33 -376
  43. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_validation_result_metadata.py +44 -1
  44. snowpark_checkpoints_validators-0.1.0rc1/PKG-INFO +0 -446
  45. snowpark_checkpoints_validators-0.1.0rc1/src/snowflake/snowpark_checkpoints/utils/utils_checks.py +0 -560
  46. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/CHANGELOG.md +0 -0
  47. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/LICENSE +0 -0
  48. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/snowpark_sampler.py +0 -0
  49. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/__init__.py +0 -0
  50. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/checkpoint_logger.py +0 -0
  51. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/validation_results.py +0 -0
  52. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/.coveragerc +0 -0
  53. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/e2eexample.py +0 -0
  54. {snowpark_checkpoints_validators-0.1.0rc1 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_spark_migration.py +0 -0
.gitignore
@@ -4,10 +4,13 @@
 
 # demos
 snowpark-checkpoints-output/
+Demos/Demos/
+Demos/snowpark-checkpoints-output/
 
 # env
 wheelvenv/
 
+
 # version
 !__version__.py
 
snowpark_checkpoints_validators-0.1.0rc3/PKG-INFO (new file)
@@ -0,0 +1,313 @@
+Metadata-Version: 2.4
+Name: snowpark-checkpoints-validators
+Version: 0.1.0rc3
+Summary: Migration tools for Snowpark
+Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
+Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
+Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
+License: Apache License, Version 2.0
+License-File: LICENSE
+Keywords: Snowflake,Snowpark,analytics,cloud,database,db
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Environment :: Other Environment
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Information Technology
+Classifier: Intended Audience :: System Administrators
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: SQL
+Classifier: Topic :: Database
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: <3.12,>=3.9
+Requires-Dist: pandera-report==0.1.2
+Requires-Dist: pandera[io]==0.20.4
+Requires-Dist: pyspark
+Requires-Dist: snowflake-connector-python
+Requires-Dist: snowflake-snowpark-python
+Provides-Extra: development
+Requires-Dist: coverage>=7.6.7; extra == 'development'
+Requires-Dist: deepdiff>=8.0.0; extra == 'development'
+Requires-Dist: hatchling==1.25.0; extra == 'development'
+Requires-Dist: pre-commit>=4.0.1; extra == 'development'
+Requires-Dist: pyarrow>=18.0.0; extra == 'development'
+Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
+Requires-Dist: pytest>=8.3.3; extra == 'development'
+Requires-Dist: setuptools>=70.0.0; extra == 'development'
+Requires-Dist: twine==5.1.1; extra == 'development'
+Description-Content-Type: text/markdown
+
+# snowpark-checkpoints-validators
+
+---
+**NOTE**
+
+This package is in Private Preview.
+
+---
+
+**snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
+
+## Features
+
+- Validate Snowpark DataFrames against predefined Pandera schemas.
+- Perform custom checks and skip specific checks as needed.
+- Generate validation results and log them for further analysis.
+- Support for sampling strategies to validate large datasets efficiently.
+- Integration with PySpark for cross-validation between Snowpark and PySpark DataFrames.
+
+## Functionalities
+
+### Validate DataFrame Schema from File
+
+The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or dataframe.
+
+```python
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from snowflake.snowpark_checkpoints.utils.constant import (
+    CheckpointMode,
+)
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from typing import Any, Optional
+
+# Signature of the function
+def validate_dataframe_checkpoint(
+    df: SnowparkDataFrame,
+    checkpoint_name: str,
+    job_context: Optional[SnowparkJobContext] = None,
+    mode: Optional[CheckpointMode] = CheckpointMode.SCHEMA,
+    custom_checks: Optional[dict[Any, Any]] = None,
+    skip_checks: Optional[dict[Any, Any]] = None,
+    sample_frac: Optional[float] = 1.0,
+    sample_number: Optional[int] = None,
+    sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+    output_path: Optional[str] = None,
+):
+    ...
+```
+
+- `df`: Snowpark dataframe to validate.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
+- `job_context`: Snowpark job context.
+- `mode`: Checkpoint mode (schema or data).
+- `custom_checks`: Custom checks to perform.
+- `skip_checks`: Checks to skip.
+- `sample_frac`: Fraction of the dataframe to sample.
+- `sample_number`: Number of rows to sample.
+- `sampling_strategy`: Sampling strategy to use.
+- `output_path`: Output path for the checkpoint report.
+
+### Usage Example
+
+```python
+from snowflake.snowpark import Session
+from snowflake.snowpark_checkpoints.utils.constant import (
+    CheckpointMode,
+)
+from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pyspark.sql import SparkSession
+
+session = Session.builder.getOrCreate()
+job_context = SnowparkJobContext(
+    session, SparkSession.builder.getOrCreate(), "job_context", True
+)
+df = session.read.format("csv").load("data.csv")
+
+validate_dataframe_checkpoint(
+    df,
+    "schema_checkpoint",
+    job_context=job_context,
+    mode=CheckpointMode.SCHEMA,
+    sample_frac=0.1,
+    sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
+)
+```
+
+### Check with Spark Decorator
+
+The `check_with_spark` decorator converts any Snowpark dataframe arguments to a function, samples them, and converts them to PySpark dataframes. It then executes a provided Spark function and compares the outputs between the two implementations.
+
+```python
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from typing import Callable, Optional, TypeVar
+
+fn = TypeVar("F", bound=Callable)
+
+# Signature of the decorator
+def check_with_spark(
+    job_context: Optional[SnowparkJobContext],
+    spark_function: fn,
+    checkpoint_name: str,
+    sample_number: Optional[int] = 100,
+    sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+    output_path: Optional[str] = None,
+) -> Callable[[fn], fn]:
+    ...
+```
+
+- `job_context`: Snowpark job context.
+- `spark_function`: PySpark function to execute.
+- `checkpoint_name`: Name of the check.
+- `sample_number`: Number of rows to sample.
+- `sampling_strategy`: Sampling strategy to use.
+- `output_path`: Output path for the checkpoint report.
+
+### Usage Example
+
+```python
+from snowflake.snowpark import Session
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
+
+session = Session.builder.getOrCreate()
+job_context = SnowparkJobContext(
+    session, SparkSession.builder.getOrCreate(), "job_context", True
+)
+
+def my_spark_scalar_fn(df: SparkDataFrame):
+    return df.count()
+
+@check_with_spark(
+    job_context=job_context,
+    spark_function=my_spark_scalar_fn,
+    checkpoint_name="count_checkpoint",
+)
+def my_snowpark_scalar_fn(df: SnowparkDataFrame):
+    return df.count()
+
+df = job_context.snowpark_session.create_dataframe(
+    [[1, 2], [3, 4]], schema=["a", "b"]
+)
+count = my_snowpark_scalar_fn(df)
+```
+
+### Pandera Snowpark Decorators
+
+The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark dataframes in the input arguments or in the return value.
+
+```python
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pandera import DataFrameSchema
+from typing import Optional
+
+# Signature of the decorator
+def check_input_schema(
+    pandera_schema: DataFrameSchema,
+    checkpoint_name: str,
+    sample_frac: Optional[float] = 1.0,
+    sample_number: Optional[int] = None,
+    sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+    job_context: Optional[SnowparkJobContext] = None,
+    output_path: Optional[str] = None,
+):
+    ...
+
+# Signature of the decorator
+def check_output_schema(
+    pandera_schema: DataFrameSchema,
+    checkpoint_name: str,
+    sample_frac: Optional[float] = 1.0,
+    sample_number: Optional[int] = None,
+    sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+    job_context: Optional[SnowparkJobContext] = None,
+    output_path: Optional[str] = None,
+):
+    ...
+```
+
+- `pandera_schema`: Pandera schema to validate.
+- `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+- `sample_frac`: Fraction of the DataFrame to sample.
+- `sample_number`: Number of rows to sample.
+- `sampling_strategy`: Sampling strategy to use.
+- `job_context`: Snowpark job context.
+- `output_path`: Output path for the checkpoint report.
+
+### Usage Example
+
+#### Check Input Schema Example
+```python
+from pandas import DataFrame as PandasDataFrame
+from pandera import DataFrameSchema, Column, Check
+from snowflake.snowpark import Session
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
+from numpy import int8
+
+df = PandasDataFrame(
+    {
+        "COLUMN1": [1, 4, 0, 10, 9],
+        "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+    }
+)
+
+in_schema = DataFrameSchema(
+    {
+        "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
+        "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
+    }
+)
+
+@check_input_schema(in_schema, "input_schema_checkpoint")
+def preprocessor(dataframe: SnowparkDataFrame):
+    dataframe = dataframe.withColumn(
+        "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+    )
+    return dataframe
+
+session = Session.builder.getOrCreate()
+sp_dataframe = session.create_dataframe(df)
+
+preprocessed_dataframe = preprocessor(sp_dataframe)
+```
+
+#### Check Output Schema Example
+```python
+from pandas import DataFrame as PandasDataFrame
+from pandera import DataFrameSchema, Column, Check
+from snowflake.snowpark import Session
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
+from numpy import int8
+
+df = PandasDataFrame(
+    {
+        "COLUMN1": [1, 4, 0, 10, 9],
+        "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+    }
+)
+
+out_schema = DataFrameSchema(
+    {
+        "COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
+        "COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
+        "COLUMN3": Column(float, Check.less_than(10)),
+    }
+)
+
+@check_output_schema(out_schema, "output_schema_checkpoint")
+def preprocessor(dataframe: SnowparkDataFrame):
+    return dataframe.with_column(
+        "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+    )
+
+session = Session.builder.getOrCreate()
+sp_dataframe = session.create_dataframe(df)
+
+preprocessed_dataframe = preprocessor(sp_dataframe)
+```
+
+------
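The README shipped in this PKG-INFO only demonstrates `mode=CheckpointMode.SCHEMA`, although the parameter docs say the mode can be "schema or data". As a complement, here is a minimal sketch of data-level validation; it assumes `CheckpointMode` also exposes a `DATAFRAME` member and that a matching checkpoint was collected beforehand, neither of which is confirmed by this diff. The import from the package root follows from the `__init__.py` hunk further below.

```python
# Hedged sketch, not taken from the package README. Assumptions:
#  - CheckpointMode has a DATAFRAME member (only SCHEMA appears above),
#  - a checkpoint named "data_checkpoint" was collected in an earlier run.
from pyspark.sql import SparkSession
from snowflake.snowpark import Session
from snowflake.snowpark_checkpoints import CheckpointMode, validate_dataframe_checkpoint
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext

session = Session.builder.getOrCreate()
# Data-level comparison needs a Spark session in the job context.
job_context = SnowparkJobContext(
    session, SparkSession.builder.getOrCreate(), "data_comparison_job", True
)

df = session.create_dataframe(
    [[1, -1.3], [4, -1.4]], schema=["COLUMN1", "COLUMN2"]
)

validate_dataframe_checkpoint(
    df,
    "data_checkpoint",
    job_context=job_context,
    mode=CheckpointMode.DATAFRAME,  # assumed member; SCHEMA is the documented default
)
```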
README.md
@@ -1,4 +1,4 @@
-# Snowpark Checkpoints Validators
+# snowpark-checkpoints-validators
 
 ---
 **NOTE**
@@ -24,9 +24,16 @@ This package is in Private Preview.
 The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or dataframe.
 
 ```python
-from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from snowflake.snowpark_checkpoints.utils.constant import (
+    CheckpointMode,
+)
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from typing import Any, Optional
 
-validate_dataframe_checkpoint(
+# Signature of the function
+def validate_dataframe_checkpoint(
     df: SnowparkDataFrame,
     checkpoint_name: str,
     job_context: Optional[SnowparkJobContext] = None,
@@ -37,16 +44,17 @@ validate_dataframe_checkpoint(
     sample_number: Optional[int] = None,
     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
     output_path: Optional[str] = None,
-)
+):
+    ...
 ```
 
-- `df`: Snowpark DataFrame to validate.
-- `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+- `df`: Snowpark dataframe to validate.
+- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
 - `job_context`: Snowpark job context.
 - `mode`: Checkpoint mode (schema or data).
 - `custom_checks`: Custom checks to perform.
 - `skip_checks`: Checks to skip.
-- `sample_frac`: Fraction of the DataFrame to sample.
+- `sample_frac`: Fraction of the dataframe to sample.
 - `sample_number`: Number of rows to sample.
 - `sampling_strategy`: Sampling strategy to use.
 - `output_path`: Output path for the checkpoint report.
@@ -55,16 +63,24 @@ validate_dataframe_checkpoint(
 
 ```python
 from snowflake.snowpark import Session
-from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.utils.constant import (
+    CheckpointMode,
+)
 from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pyspark.sql import SparkSession
 
 session = Session.builder.getOrCreate()
+job_context = SnowparkJobContext(
+    session, SparkSession.builder.getOrCreate(), "job_context", True
+)
 df = session.read.format("csv").load("data.csv")
 
 validate_dataframe_checkpoint(
     df,
     "schema_checkpoint",
-    job_context=session,
+    job_context=job_context,
     mode=CheckpointMode.SCHEMA,
     sample_frac=0.1,
     sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
@@ -73,22 +89,24 @@ validate_dataframe_checkpoint(
 
 ### Check with Spark Decorator
 
-The `check_with_spark` decorator converts any Snowpark DataFrame arguments to a function, samples them, and converts them to PySpark DataFrames. It then executes a provided Spark function and compares the outputs between the two implementations.
+The `check_with_spark` decorator converts any Snowpark dataframe arguments to a function, samples them, and converts them to PySpark dataframes. It then executes a provided Spark function and compares the outputs between the two implementations.
 
 ```python
-from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from typing import Callable, Optional, TypeVar
 
-@check_with_spark(
+fn = TypeVar("F", bound=Callable)
+
+# Signature of the decorator
+def check_with_spark(
     job_context: Optional[SnowparkJobContext],
-    spark_function: Callable,
+    spark_function: fn,
     checkpoint_name: str,
     sample_number: Optional[int] = 100,
     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
-    check_dtypes: Optional[bool] = False,
-    check_with_precision: Optional[bool] = False,
     output_path: Optional[str] = None,
-)
-def snowpark_fn(df: SnowparkDataFrame):
+) -> Callable[[fn], fn]:
     ...
 ```
 
@@ -97,8 +115,6 @@ def snowpark_fn(df: SnowparkDataFrame):
 - `checkpoint_name`: Name of the check.
 - `sample_number`: Number of rows to sample.
 - `sampling_strategy`: Sampling strategy to use.
-- `check_dtypes`: Check data types.
-- `check_with_precision`: Check with precision.
 - `output_path`: Output path for the checkpoint report.
 
 ### Usage Example
@@ -107,52 +123,63 @@ def snowpark_fn(df: SnowparkDataFrame):
 from snowflake.snowpark import Session
 from snowflake.snowpark import DataFrame as SnowparkDataFrame
 from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
 
 session = Session.builder.getOrCreate()
-df = session.read.format("csv").load("data.csv")
+job_context = SnowparkJobContext(
+    session, SparkSession.builder.getOrCreate(), "job_context", True
+)
+
+def my_spark_scalar_fn(df: SparkDataFrame):
+    return df.count()
 
 @check_with_spark(
-    job_context=session,
-    spark_function=lambda df: df.withColumn("COLUMN1", df["COLUMN1"] + 1),
-    checkpoint_name="Check_Column1_Increment",
-    sample_number=100,
-    sampling_strategy=SamplingStrategy.RANDOM_SAMPLE,
+    job_context=job_context,
+    spark_function=my_spark_scalar_fn,
+    checkpoint_name="count_checkpoint",
 )
-def increment_column1(df: SnowparkDataFrame):
-    return df.with_column("COLUMN1", df["COLUMN1"] + 1)
+def my_snowpark_scalar_fn(df: SnowparkDataFrame):
+    return df.count()
 
-increment_column1(df)
+df = job_context.snowpark_session.create_dataframe(
+    [[1, 2], [3, 4]], schema=["a", "b"]
+)
+count = my_snowpark_scalar_fn(df)
 ```
 
 ### Pandera Snowpark Decorators
 
-The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark DataFrames in the input arguments or in the return value.
+The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark dataframes in the input arguments or in the return value.
 
 ```python
-from snowflake.snowpark_checkpoints.checkpoint import check_input_schema, check_output_schema
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pandera import DataFrameSchema
+from typing import Optional
 
-@check_input_schema(
+# Signature of the decorator
+def check_input_schema(
     pandera_schema: DataFrameSchema,
     checkpoint_name: str,
     sample_frac: Optional[float] = 1.0,
     sample_number: Optional[int] = None,
     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
-    job_context: Optional[SnowparkJobContext],
+    job_context: Optional[SnowparkJobContext] = None,
     output_path: Optional[str] = None,
-)
-def snowpark_fn(df: SnowparkDataFrame):
+):
     ...
 
-@check_output_schema(
+# Signature of the decorator
+def check_output_schema(
     pandera_schema: DataFrameSchema,
     checkpoint_name: str,
     sample_frac: Optional[float] = 1.0,
     sample_number: Optional[int] = None,
     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
-    job_context: Optional[SnowparkJobContext],
+    job_context: Optional[SnowparkJobContext] = None,
     output_path: Optional[str] = None,
-)
-def snowpark_fn(df: SnowparkDataFrame):
+):
     ...
 ```
 
@@ -166,28 +193,71 @@ def snowpark_fn(df: SnowparkDataFrame):
 
 ### Usage Example
 
-The following will result in a Pandera `SchemaError`:
+#### Check Input Schema Example
+```python
+from pandas import DataFrame as PandasDataFrame
+from pandera import DataFrameSchema, Column, Check
+from snowflake.snowpark import Session
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
+from numpy import int8
+
+df = PandasDataFrame(
+    {
+        "COLUMN1": [1, 4, 0, 10, 9],
+        "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+    }
+)
 
+in_schema = DataFrameSchema(
+    {
+        "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
+        "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
+    }
+)
+
+@check_input_schema(in_schema, "input_schema_checkpoint")
+def preprocessor(dataframe: SnowparkDataFrame):
+    dataframe = dataframe.withColumn(
+        "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+    )
+    return dataframe
+
+session = Session.builder.getOrCreate()
+sp_dataframe = session.create_dataframe(df)
+
+preprocessed_dataframe = preprocessor(sp_dataframe)
+```
+
+#### Check Output Schema Example
 ```python
 from pandas import DataFrame as PandasDataFrame
 from pandera import DataFrameSchema, Column, Check
 from snowflake.snowpark import Session
 from snowflake.snowpark import DataFrame as SnowparkDataFrame
 from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
+from numpy import int8
 
-df = PandasDataFrame({
-    "COLUMN1": [1, 4, 0, 10, 9],
-    "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
-})
+df = PandasDataFrame(
+    {
+        "COLUMN1": [1, 4, 0, 10, 9],
+        "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+    }
+)
 
-out_schema = DataFrameSchema({
-    "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
-    "COLUMN2": Column(float, Check(lambda x: x < -1.2)),
-})
+out_schema = DataFrameSchema(
+    {
+        "COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
+        "COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
+        "COLUMN3": Column(float, Check.less_than(10)),
+    }
+)
 
 @check_output_schema(out_schema, "output_schema_checkpoint")
 def preprocessor(dataframe: SnowparkDataFrame):
-    return dataframe.with_column("COLUMN1", lit('Some bad data yo'))
+    return dataframe.with_column(
+        "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+    )
 
 session = Session.builder.getOrCreate()
 sp_dataframe = session.create_dataframe(df)
@@ -195,6 +265,4 @@ sp_dataframe = session.create_dataframe(df)
 preprocessed_dataframe = preprocessor(sp_dataframe)
 ```
 
-## License
-
-This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for more details.
+------
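The reworked usage example returns a scalar from both the Spark and Snowpark functions, which is what the decorator compares. For orientation, here is a generic sketch of that sample-and-compare pattern; the helper names are illustrative and this is not the package's actual implementation (which additionally samples Snowpark dataframe arguments and converts them to PySpark before calling the Spark function):

```python
# Illustrative sketch of the compare-against-a-reference pattern described above.
# None of these helpers are snowpark-checkpoints internals.
from functools import wraps
from typing import Callable

def compare_with_reference(reference_fn: Callable) -> Callable:
    """Decorate a function and compare its result against a reference impl."""
    def decorator(fn: Callable) -> Callable:
        @wraps(fn)
        def wrapper(*args, **kwargs):
            result = fn(*args, **kwargs)
            expected = reference_fn(*args, **kwargs)
            # check_with_spark analogously compares the Snowpark output
            # against the PySpark output and records the checkpoint result.
            if result != expected:
                raise AssertionError(f"mismatch: {result!r} != {expected!r}")
            return result
        return wrapper
    return decorator

@compare_with_reference(sum)
def total(xs):
    return sum(x for x in xs)

print(total([1, 2, 3]))  # prints 6; a divergence would raise AssertionError
```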
pyproject.toml
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
 requires = ["hatchling"]
 
 [project]
-authors = [{name = "Snowflake Inc."}]
+authors = [
+  {name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
+]
 classifiers = [
   "Development Status :: 4 - Beta",
   "Environment :: Console",
@@ -31,6 +33,7 @@ dependencies = [
   "pandera-report==0.1.2",
 ]
 description = "Migration tools for Snowpark"
+dynamic = ['version']
 keywords = [
   'Snowflake',
   'analytics',
@@ -39,11 +42,10 @@ keywords = [
   'db',
   'Snowpark',
 ]
-license = {file = "LICENSE"}
+license = {text = "Apache License, Version 2.0"}
 name = "snowpark-checkpoints-validators"
 readme = "README.md"
 requires-python = '>=3.9,<3.12'
-dynamic = ['version']
 
 [project.optional-dependencies]
 development = [
@@ -118,7 +120,7 @@ check = [
 
 [tool.hatch.envs.test.scripts]
 check = [
-  "pip install -e ../snowpark-checkpoints-configuration" ,
+  "pip install -e ../snowpark-checkpoints-configuration",
   'pytest -v --junitxml=test/outcome/test-results.xml --cov=. --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
 ]
src/snowflake/snowpark_checkpoints/__init__.py
@@ -10,6 +10,7 @@ from snowflake.snowpark_checkpoints.checkpoint import (
 )
 from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
 from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+from snowflake.snowpark_checkpoints.utils.constants import CheckpointMode
 
 __all__ = [
     "check_with_spark",
@@ -18,4 +19,5 @@ __all__ = [
     "check_output_schema",
     "check_input_schema",
     "validate_dataframe_checkpoint",
+    "CheckpointMode",
 ]
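This hunk re-exports `CheckpointMode` from the package root (note the import uses the renamed `utils.constants` module from file 11 in the list above), so callers no longer need the deep module path. A short sketch of the resulting import style:

```python
# After this change, both names resolve from the package root.
from snowflake.snowpark_checkpoints import (
    CheckpointMode,
    validate_dataframe_checkpoint,
)

def validate_schema(df, checkpoint_name: str) -> None:
    # SCHEMA is the default mode in the documented signature; passing it
    # explicitly here only shows the re-exported enum in use.
    validate_dataframe_checkpoint(df, checkpoint_name, mode=CheckpointMode.SCHEMA)
```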