snowpark-checkpoints-validators 0.1.0rc2__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. snowflake/snowpark_checkpoints/__init__.py +34 -0
  2. snowflake/snowpark_checkpoints/checkpoint.py +482 -0
  3. snowflake/snowpark_checkpoints/errors.py +60 -0
  4. snowflake/snowpark_checkpoints/job_context.py +85 -0
  5. snowflake/snowpark_checkpoints/singleton.py +23 -0
  6. snowflake/snowpark_checkpoints/snowpark_sampler.py +99 -0
  7. snowflake/snowpark_checkpoints/spark_migration.py +222 -0
  8. snowflake/snowpark_checkpoints/utils/__init__.py +14 -0
  9. snowflake/snowpark_checkpoints/utils/checkpoint_logger.py +52 -0
  10. snowflake/snowpark_checkpoints/utils/constants.py +134 -0
  11. snowflake/snowpark_checkpoints/utils/extra_config.py +84 -0
  12. snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +358 -0
  13. snowflake/snowpark_checkpoints/utils/supported_types.py +65 -0
  14. snowflake/snowpark_checkpoints/utils/telemetry.py +900 -0
  15. snowflake/snowpark_checkpoints/utils/utils_checks.py +372 -0
  16. snowflake/snowpark_checkpoints/validation_result_metadata.py +116 -0
  17. snowflake/snowpark_checkpoints/validation_results.py +49 -0
  18. snowpark_checkpoints_validators-0.1.1.dist-info/METADATA +311 -0
  19. snowpark_checkpoints_validators-0.1.1.dist-info/RECORD +21 -0
  20. snowpark_checkpoints_validators-0.1.0rc2.dist-info/METADATA +0 -514
  21. snowpark_checkpoints_validators-0.1.0rc2.dist-info/RECORD +0 -4
  22. {snowpark_checkpoints_validators-0.1.0rc2.dist-info → snowpark_checkpoints_validators-0.1.1.dist-info}/WHEEL +0 -0
  23. {snowpark_checkpoints_validators-0.1.0rc2.dist-info → snowpark_checkpoints_validators-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,311 @@
1
+ Metadata-Version: 2.4
2
+ Name: snowpark-checkpoints-validators
3
+ Version: 0.1.1
4
+ Summary: Migration tools for Snowpark
5
+ Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
6
+ Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
7
+ Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
8
+ License: Apache License, Version 2.0
9
+ License-File: LICENSE
10
+ Keywords: Snowflake,Snowpark,analytics,cloud,database,db
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Environment :: Other Environment
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Education
16
+ Classifier: Intended Audience :: Information Technology
17
+ Classifier: Intended Audience :: System Administrators
18
+ Classifier: License :: OSI Approved :: Apache Software License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3 :: Only
21
+ Classifier: Programming Language :: SQL
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
+ Classifier: Topic :: Software Development
25
+ Classifier: Topic :: Software Development :: Libraries
26
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Requires-Python: <3.12,>=3.9
29
+ Requires-Dist: pandera-report==0.1.2
30
+ Requires-Dist: pandera[io]==0.20.4
31
+ Requires-Dist: pyspark
32
+ Requires-Dist: snowflake-connector-python==3.13.0
33
+ Requires-Dist: snowflake-snowpark-python==1.26.0
34
+ Provides-Extra: development
35
+ Requires-Dist: coverage>=7.6.7; extra == 'development'
36
+ Requires-Dist: deepdiff>=8.0.0; extra == 'development'
37
+ Requires-Dist: hatchling==1.25.0; extra == 'development'
38
+ Requires-Dist: pre-commit>=4.0.1; extra == 'development'
39
+ Requires-Dist: pyarrow>=18.0.0; extra == 'development'
40
+ Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
41
+ Requires-Dist: pytest>=8.3.3; extra == 'development'
42
+ Requires-Dist: setuptools>=70.0.0; extra == 'development'
43
+ Requires-Dist: twine==5.1.1; extra == 'development'
44
+ Description-Content-Type: text/markdown
45
+
46
+ # snowpark-checkpoints-validators
47
+
48
+ ---
49
+ **NOTE**
50
+ This package is on Public Preview.
51
+ ---
52
+
53
+ **snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
54
+
55
+ ## Features
56
+
57
+ - Validate Snowpark DataFrames against predefined Pandera schemas.
58
+ - Perform custom checks and skip specific checks as needed.
59
+ - Generate validation results and log them for further analysis.
60
+ - Support for sampling strategies to validate large datasets efficiently.
61
+ - Integration with PySpark for cross-validation between Snowpark and PySpark DataFrames.
62
+
63
+ ## Functionalities
64
+
65
+ ### Validate DataFrame Schema from File
66
+
67
+ The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or dataframe.
68
+
69
+ ```python
70
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
71
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
72
+ from snowflake.snowpark_checkpoints.utils.constants import (
73
+ CheckpointMode,
74
+ )
75
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
76
+ from typing import Any, Optional
77
+
78
+ # Signature of the function
79
+ def validate_dataframe_checkpoint(
80
+ df: SnowparkDataFrame,
81
+ checkpoint_name: str,
82
+ job_context: Optional[SnowparkJobContext] = None,
83
+ mode: Optional[CheckpointMode] = CheckpointMode.SCHEMA,
84
+ custom_checks: Optional[dict[Any, Any]] = None,
85
+ skip_checks: Optional[dict[Any, Any]] = None,
86
+ sample_frac: Optional[float] = 1.0,
87
+ sample_number: Optional[int] = None,
88
+ sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
89
+ output_path: Optional[str] = None,
90
+ ):
91
+ ...
92
+ ```
93
+
94
+ - `df`: Snowpark dataframe to validate.
95
+ - `checkpoint_name`: Name of the checkpoint schema file or dataframe.
96
+ - `job_context`: Snowpark job context.
97
+ - `mode`: Checkpoint mode (schema or data).
98
+ - `custom_checks`: Custom checks to perform.
99
+ - `skip_checks`: Checks to skip.
100
+ - `sample_frac`: Fraction of the dataframe to sample.
101
+ - `sample_number`: Number of rows to sample.
102
+ - `sampling_strategy`: Sampling strategy to use.
103
+ - `output_path`: Output path for the checkpoint report.
104
+
105
+ ### Usage Example
106
+
107
+ ```python
108
+ from snowflake.snowpark import Session
109
+ from snowflake.snowpark_checkpoints.utils.constants import (
110
+ CheckpointMode,
111
+ )
112
+ from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
113
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
114
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
115
+ from pyspark.sql import SparkSession
116
+
117
+ session = Session.builder.getOrCreate()
118
+ job_context = SnowparkJobContext(
119
+ session, SparkSession.builder.getOrCreate(), "job_context", True
120
+ )
121
+ df = session.read.format("csv").load("data.csv")
122
+
123
+ validate_dataframe_checkpoint(
124
+ df,
125
+ "schema_checkpoint",
126
+ job_context=job_context,
127
+ mode=CheckpointMode.SCHEMA,
128
+ sample_frac=0.1,
129
+ sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
130
+ )
131
+ ```
132
+
133
+ ### Check with Spark Decorator
134
+
135
+ The `check_with_spark` decorator converts any Snowpark dataframe arguments to a function, samples them, and converts them to PySpark dataframes. It then executes a provided Spark function and compares the outputs between the two implementations.
136
+
137
+ ```python
138
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
139
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
140
+ from typing import Callable, Optional, TypeVar
141
+
142
+ fn = TypeVar("fn", bound=Callable)
143
+
144
+ # Signature of the decorator
145
+ def check_with_spark(
146
+ job_context: Optional[SnowparkJobContext],
147
+ spark_function: fn,
148
+ checkpoint_name: str,
149
+ sample_number: Optional[int] = 100,
150
+ sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
151
+ output_path: Optional[str] = None,
152
+ ) -> Callable[[fn], fn]:
153
+ ...
154
+ ```
155
+
156
+ - `job_context`: Snowpark job context.
157
+ - `spark_function`: PySpark function to execute.
158
+ - `checkpoint_name`: Name of the check.
159
+ - `sample_number`: Number of rows to sample.
160
+ - `sampling_strategy`: Sampling strategy to use.
161
+ - `output_path`: Output path for the checkpoint report.
162
+
163
+ ### Usage Example
164
+
165
+ ```python
166
+ from snowflake.snowpark import Session
167
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
168
+ from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
169
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
170
+ from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
171
+
172
+ session = Session.builder.getOrCreate()
173
+ job_context = SnowparkJobContext(
174
+ session, SparkSession.builder.getOrCreate(), "job_context", True
175
+ )
176
+
177
+ def my_spark_scalar_fn(df: SparkDataFrame):
178
+ return df.count()
179
+
180
+ @check_with_spark(
181
+ job_context=job_context,
182
+ spark_function=my_spark_scalar_fn,
183
+ checkpoint_name="count_checkpoint",
184
+ )
185
+ def my_snowpark_scalar_fn(df: SnowparkDataFrame):
186
+ return df.count()
187
+
188
+ df = job_context.snowpark_session.create_dataframe(
189
+ [[1, 2], [3, 4]], schema=["a", "b"]
190
+ )
191
+ count = my_snowpark_scalar_fn(df)
192
+ ```
193
+
194
+ ### Pandera Snowpark Decorators
195
+
196
+ The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark dataframes in the input arguments or in the return value.
197
+
198
+ ```python
199
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
200
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
201
+ from pandera import DataFrameSchema
202
+ from typing import Optional
203
+
204
+ # Signature of the decorator
205
+ def check_input_schema(
206
+ pandera_schema: DataFrameSchema,
207
+ checkpoint_name: str,
208
+ sample_frac: Optional[float] = 1.0,
209
+ sample_number: Optional[int] = None,
210
+ sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
211
+ job_context: Optional[SnowparkJobContext] = None,
212
+ output_path: Optional[str] = None,
213
+ ):
214
+ ...
215
+
216
+ # Signature of the decorator
217
+ def check_output_schema(
218
+ pandera_schema: DataFrameSchema,
219
+ checkpoint_name: str,
220
+ sample_frac: Optional[float] = 1.0,
221
+ sample_number: Optional[int] = None,
222
+ sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
223
+ job_context: Optional[SnowparkJobContext] = None,
224
+ output_path: Optional[str] = None,
225
+ ):
226
+ ...
227
+ ```
228
+
229
+ - `pandera_schema`: Pandera schema to validate.
230
+ - `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
231
+ - `sample_frac`: Fraction of the DataFrame to sample.
232
+ - `sample_number`: Number of rows to sample.
233
+ - `sampling_strategy`: Sampling strategy to use.
234
+ - `job_context`: Snowpark job context.
235
+ - `output_path`: Output path for the checkpoint report.
236
+
237
+ ### Usage Example
238
+
239
+ #### Check Input Schema Example
240
+ ```python
241
+ from pandas import DataFrame as PandasDataFrame
242
+ from pandera import DataFrameSchema, Column, Check
243
+ from snowflake.snowpark import Session
244
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
245
+ from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
246
+ from numpy import int8
247
+
248
+ df = PandasDataFrame(
249
+ {
250
+ "COLUMN1": [1, 4, 0, 10, 9],
251
+ "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
252
+ }
253
+ )
254
+
255
+ in_schema = DataFrameSchema(
256
+ {
257
+ "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
258
+ "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
259
+ }
260
+ )
261
+
262
+ @check_input_schema(in_schema, "input_schema_checkpoint")
263
+ def preprocessor(dataframe: SnowparkDataFrame):
264
+ dataframe = dataframe.withColumn(
265
+ "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
266
+ )
267
+ return dataframe
268
+
269
+ session = Session.builder.getOrCreate()
270
+ sp_dataframe = session.create_dataframe(df)
271
+
272
+ preprocessed_dataframe = preprocessor(sp_dataframe)
273
+ ```
274
+
275
+ #### Check Output Schema Example
276
+ ```python
277
+ from pandas import DataFrame as PandasDataFrame
278
+ from pandera import DataFrameSchema, Column, Check
279
+ from snowflake.snowpark import Session
280
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
281
+ from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
282
+ from numpy import int8
283
+
284
+ df = PandasDataFrame(
285
+ {
286
+ "COLUMN1": [1, 4, 0, 10, 9],
287
+ "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
288
+ }
289
+ )
290
+
291
+ out_schema = DataFrameSchema(
292
+ {
293
+ "COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
294
+ "COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
295
+ "COLUMN3": Column(float, Check.less_than(10)),
296
+ }
297
+ )
298
+
299
+ @check_output_schema(out_schema, "output_schema_checkpoint")
300
+ def preprocessor(dataframe: SnowparkDataFrame):
301
+ return dataframe.with_column(
302
+ "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
303
+ )
304
+
305
+ session = Session.builder.getOrCreate()
306
+ sp_dataframe = session.create_dataframe(df)
307
+
308
+ preprocessed_dataframe = preprocessor(sp_dataframe)
309
+ ```
310
+
311
+ ------
@@ -0,0 +1,21 @@
1
+ snowflake/snowpark_checkpoints/__init__.py,sha256=1_xzSopIHWpw1i3gQqWLN0wCfWWEefjr4cl1vl0xSdY,1211
2
+ snowflake/snowpark_checkpoints/checkpoint.py,sha256=-y1iWdGxYGuTWdngOEXdA59MT33PCiM7cP1s3jJs9jE,18997
3
+ snowflake/snowpark_checkpoints/errors.py,sha256=9KjzRf8bjDZTTNL4LeySJAwuucDOyz0Ka7EFBKWFpyg,1821
4
+ snowflake/snowpark_checkpoints/job_context.py,sha256=7LdJ682lC8hCJOYUn-AVXq_Llv18R9oGdK2F-amYR_o,2990
5
+ snowflake/snowpark_checkpoints/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
6
+ snowflake/snowpark_checkpoints/snowpark_sampler.py,sha256=-t7cg-swMK0SaU7r8y90MLSDPXGlKprc6xdVxEs29sU,3632
7
+ snowflake/snowpark_checkpoints/spark_migration.py,sha256=DzzgUZ-XlzIqCz-aWpBICP8mgnjk8UNoL8JsomadF-U,8832
8
+ snowflake/snowpark_checkpoints/validation_result_metadata.py,sha256=zNU7hk9GH4d73iVfNopSAs_8yJeT12s_mcbpB7FShSY,4516
9
+ snowflake/snowpark_checkpoints/validation_results.py,sha256=J8OcpNty6hQD8RbAy8xmA0UMbPWfXSmQnHYspWWSisk,1502
10
+ snowflake/snowpark_checkpoints/utils/__init__.py,sha256=I4srmZ8G1q9DU6Suo1S91aVfNvETyisKH95uvLAvEJ0,609
11
+ snowflake/snowpark_checkpoints/utils/checkpoint_logger.py,sha256=meGl5T3Avp4Qn0GEwkJi5GSLS4MDb7zTGbTOI-8bf1E,1592
12
+ snowflake/snowpark_checkpoints/utils/constants.py,sha256=pgFttLDQ6fTa6obSdvivWBYClS21ap41YVDNGAS4sxY,4146
13
+ snowflake/snowpark_checkpoints/utils/extra_config.py,sha256=pmGLYT7cu9WMKzQwcEPkgk1DMnnT1fREm45p19e79hk,2567
14
+ snowflake/snowpark_checkpoints/utils/pandera_check_manager.py,sha256=ddTwXauuZdowIRwPMT61GWYCG4XGKOFkVyfZO49bc-8,14516
15
+ snowflake/snowpark_checkpoints/utils/supported_types.py,sha256=GrMX2tHdSFnK7LlPbZx20UufD6Br6TNVRkkBwIxdPy0,1433
16
+ snowflake/snowpark_checkpoints/utils/telemetry.py,sha256=JZ5bdPBxAoyWT7clua4T_QprHcwWQChd2A5ojLFHJ0o,31366
17
+ snowflake/snowpark_checkpoints/utils/utils_checks.py,sha256=LF1EJrwJwV4gBqifXdULBBGKXxCZqC9vR7BGLe_LTSM,13490
18
+ snowpark_checkpoints_validators-0.1.1.dist-info/METADATA,sha256=o2uERHArlH9q3HpWYRGZBjzJAgYKNlolLQAd6JkiuXY,11012
19
+ snowpark_checkpoints_validators-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
20
+ snowpark_checkpoints_validators-0.1.1.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
21
+ snowpark_checkpoints_validators-0.1.1.dist-info/RECORD,,