snowpark-checkpoints-validators 0.2.0rc1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. snowflake/snowpark_checkpoints/__init__.py +44 -0
  2. snowflake/snowpark_checkpoints/__version__.py +16 -0
  3. snowflake/snowpark_checkpoints/checkpoint.py +580 -0
  4. snowflake/snowpark_checkpoints/errors.py +60 -0
  5. snowflake/snowpark_checkpoints/io_utils/__init__.py +26 -0
  6. snowflake/snowpark_checkpoints/io_utils/io_default_strategy.py +57 -0
  7. snowflake/snowpark_checkpoints/io_utils/io_env_strategy.py +133 -0
  8. snowflake/snowpark_checkpoints/io_utils/io_file_manager.py +76 -0
  9. snowflake/snowpark_checkpoints/job_context.py +128 -0
  10. snowflake/snowpark_checkpoints/singleton.py +23 -0
  11. snowflake/snowpark_checkpoints/snowpark_sampler.py +124 -0
  12. snowflake/snowpark_checkpoints/spark_migration.py +255 -0
  13. snowflake/snowpark_checkpoints/utils/__init__.py +14 -0
  14. snowflake/snowpark_checkpoints/utils/constants.py +134 -0
  15. snowflake/snowpark_checkpoints/utils/extra_config.py +132 -0
  16. snowflake/snowpark_checkpoints/utils/logging_utils.py +67 -0
  17. snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +399 -0
  18. snowflake/snowpark_checkpoints/utils/supported_types.py +65 -0
  19. snowflake/snowpark_checkpoints/utils/telemetry.py +939 -0
  20. snowflake/snowpark_checkpoints/utils/utils_checks.py +398 -0
  21. snowflake/snowpark_checkpoints/validation_result_metadata.py +159 -0
  22. snowflake/snowpark_checkpoints/validation_results.py +49 -0
  23. snowpark_checkpoints_validators-0.3.0.dist-info/METADATA +325 -0
  24. snowpark_checkpoints_validators-0.3.0.dist-info/RECORD +26 -0
  25. snowpark_checkpoints_validators-0.2.0rc1.dist-info/METADATA +0 -514
  26. snowpark_checkpoints_validators-0.2.0rc1.dist-info/RECORD +0 -4
  27. {snowpark_checkpoints_validators-0.2.0rc1.dist-info → snowpark_checkpoints_validators-0.3.0.dist-info}/WHEEL +0 -0
  28. {snowpark_checkpoints_validators-0.2.0rc1.dist-info → snowpark_checkpoints_validators-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,325 @@
+ Metadata-Version: 2.4
+ Name: snowpark-checkpoints-validators
+ Version: 0.3.0
+ Summary: Migration tools for Snowpark
+ Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
+ Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
+ Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
+ License: Apache License, Version 2.0
+ License-File: LICENSE
+ Keywords: Snowflake,Snowpark,analytics,cloud,database,db
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Environment :: Console
+ Classifier: Environment :: Other Environment
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Education
+ Classifier: Intended Audience :: Information Technology
+ Classifier: Intended Audience :: System Administrators
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: SQL
+ Classifier: Topic :: Database
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Software Development
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: <3.12,>=3.9
+ Requires-Dist: pandera[io]==0.20.4
+ Requires-Dist: pydantic>=2.0
+ Requires-Dist: snowflake-connector-python[pandas]
+ Requires-Dist: snowflake-snowpark-python>=1.23.0
+ Provides-Extra: development
+ Requires-Dist: certifi==2025.1.31; extra == 'development'
+ Requires-Dist: coverage>=7.6.7; extra == 'development'
+ Requires-Dist: deepdiff==8.1.1; extra == 'development'
+ Requires-Dist: deepdiff>=8.0.0; extra == 'development'
+ Requires-Dist: hatchling==1.25.0; extra == 'development'
+ Requires-Dist: pre-commit>=4.0.1; extra == 'development'
+ Requires-Dist: pyarrow>=18.0.0; extra == 'development'
+ Requires-Dist: pyspark>=3.5.0; extra == 'development'
+ Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
+ Requires-Dist: pytest>=8.3.3; extra == 'development'
+ Requires-Dist: setuptools>=70.0.0; extra == 'development'
+ Requires-Dist: twine==5.1.1; extra == 'development'
+ Provides-Extra: pyspark
+ Requires-Dist: pyspark>=3.5.0; extra == 'pyspark'
+ Description-Content-Type: text/markdown
+
+ # snowpark-checkpoints-validators
+
+ ---
+ ##### This package is on Public Preview.
+ ---
+
+ **snowpark-checkpoints-validators** is a package for validating Snowpark DataFrames against predefined schemas and checkpoints. It ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
+
+ ---
+ ## Install the library
+ ```bash
+ pip install snowpark-checkpoints-validators
+ ```
+ This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
+ ```bash
+ pip install "snowpark-checkpoints-validators[pyspark]"
+ ```
+ ---
+
+ ## Features
+
+ - Validate Snowpark DataFrames against predefined Pandera schemas.
+ - Perform custom checks and skip specific checks as needed.
+ - Generate validation results and log them for further analysis.
+ - Sample large datasets with configurable strategies for efficient validation.
+ - Integrate with PySpark to cross-validate results between Snowpark and PySpark DataFrames.
+
+ ## Functionalities
+
+ ### Validate DataFrame Schema from File
+
+ The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or DataFrame.
+
+ ```python
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from snowflake.snowpark_checkpoints.utils.constants import (
+     CheckpointMode,
+ )
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from typing import Any, Optional
+
+ # Signature of the function
+ def validate_dataframe_checkpoint(
+     df: SnowparkDataFrame,
+     checkpoint_name: str,
+     job_context: Optional[SnowparkJobContext] = None,
+     mode: Optional[CheckpointMode] = CheckpointMode.SCHEMA,
+     custom_checks: Optional[dict[Any, Any]] = None,
+     skip_checks: Optional[dict[Any, Any]] = None,
+     sample_frac: Optional[float] = 1.0,
+     sample_number: Optional[int] = None,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     output_path: Optional[str] = None,
+ ):
+     ...
+ ```
+
+ - `df`: Snowpark DataFrame to validate.
+ - `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+ - `job_context`: Snowpark job context.
+ - `mode`: Checkpoint mode (schema or data).
+ - `custom_checks`: Custom checks to perform (see the sketch after the usage example).
+ - `skip_checks`: Checks to skip (also shown in the sketch below).
+ - `sample_frac`: Fraction of the DataFrame to sample.
+ - `sample_number`: Number of rows to sample.
+ - `sampling_strategy`: Sampling strategy to use.
+ - `output_path`: Output path for the checkpoint report.
+
+ ### Usage Example
+
+ ```python
+ from snowflake.snowpark import Session
+ from snowflake.snowpark_checkpoints.utils.constants import (
+     CheckpointMode,
+ )
+ from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pyspark.sql import SparkSession
+
+ session = Session.builder.getOrCreate()
+ job_context = SnowparkJobContext(
+     session, SparkSession.builder.getOrCreate(), "job_context", True
+ )
+ df = session.read.format("csv").load("data.csv")
+
+ validate_dataframe_checkpoint(
+     df,
+     "schema_checkpoint",
+     job_context=job_context,
+     mode=CheckpointMode.SCHEMA,
+     sample_frac=0.1,
+     sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
+ )
+ ```
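+
+ The example above leaves `custom_checks` and `skip_checks` unset. The sketch below shows one plausible way to pass them, assuming `custom_checks` maps column names to extra Pandera `Check` objects and `skip_checks` maps column names to the names of checks to suppress; both dictionary shapes are assumptions, not confirmed by this README.
+
+ ```python
+ from pandera import Check
+
+ validate_dataframe_checkpoint(
+     df,
+     "schema_checkpoint",
+     job_context=job_context,
+     mode=CheckpointMode.SCHEMA,
+     # Hypothetical shape: extra Pandera checks keyed by column name.
+     custom_checks={"COLUMN1": [Check.greater_than_or_equal_to(0)]},
+     # Hypothetical shape: names of checks to skip, keyed by column name.
+     skip_checks={"COLUMN2": ["less_than"]},
+ )
+ ```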
+
+ ### Check with Spark Decorator
+
+ The `check_with_spark` decorator takes any Snowpark DataFrame arguments of the decorated function, samples them, and converts them to PySpark DataFrames. It then executes the provided Spark function and compares the outputs of the two implementations.
+
+ ```python
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from typing import Callable, Optional, TypeVar
+
+ fn = TypeVar("fn", bound=Callable)
+
+ # Signature of the decorator
+ def check_with_spark(
+     job_context: Optional[SnowparkJobContext],
+     spark_function: fn,
+     checkpoint_name: str,
+     sample_number: Optional[int] = 100,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     output_path: Optional[str] = None,
+ ) -> Callable[[fn], fn]:
+     ...
+ ```
+
+ - `job_context`: Snowpark job context.
+ - `spark_function`: PySpark function to execute.
+ - `checkpoint_name`: Name of the check.
+ - `sample_number`: Number of rows to sample.
+ - `sampling_strategy`: Sampling strategy to use.
+ - `output_path`: Output path for the checkpoint report.
+
+ ### Usage Example
+
+ ```python
+ from snowflake.snowpark import Session
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
+
+ session = Session.builder.getOrCreate()
+ job_context = SnowparkJobContext(
+     session, SparkSession.builder.getOrCreate(), "job_context", True
+ )
+
+ def my_spark_scalar_fn(df: SparkDataFrame):
+     return df.count()
+
+ @check_with_spark(
+     job_context=job_context,
+     spark_function=my_spark_scalar_fn,
+     checkpoint_name="count_checkpoint",
+ )
+ def my_snowpark_scalar_fn(df: SnowparkDataFrame):
+     return df.count()
+
+ df = job_context.snowpark_session.create_dataframe(
+     [[1, 2], [3, 4]], schema=["a", "b"]
+ )
+ count = my_snowpark_scalar_fn(df)
+ ```
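+
+ The description above says the outputs of the two implementations are compared, but the example only returns scalars. The following is a minimal sketch of comparing DataFrame-valued outputs, reusing `job_context` and `df` from the example; that DataFrame returns are compared the same way is an assumption based on the description, not confirmed by this README.
+
+ ```python
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+
+ def my_spark_filter_fn(df: SparkDataFrame) -> SparkDataFrame:
+     # The same filter expressed against the PySpark API.
+     return df.filter(df["a"] > 1)
+
+ @check_with_spark(
+     job_context=job_context,
+     spark_function=my_spark_filter_fn,
+     checkpoint_name="filter_checkpoint",
+     sample_number=100,
+ )
+ def my_snowpark_filter_fn(df: SnowparkDataFrame) -> SnowparkDataFrame:
+     return df.filter(df["a"] > 1)
+
+ filtered_df = my_snowpark_filter_fn(df)
+ ```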
+
+ ### Pandera Snowpark Decorators
+
+ The decorators `@check_input_schema` and `@check_output_schema` perform sampled schema validation of Snowpark DataFrames, on a function's input arguments and on its return value, respectively.
+
+ ```python
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pandera import DataFrameSchema
+ from typing import Optional
+
+ # Signature of the decorator
+ def check_input_schema(
+     pandera_schema: DataFrameSchema,
+     checkpoint_name: str,
+     sample_frac: Optional[float] = 1.0,
+     sample_number: Optional[int] = None,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     job_context: Optional[SnowparkJobContext] = None,
+     output_path: Optional[str] = None,
+ ):
+     ...
+
+ # Signature of the decorator
+ def check_output_schema(
+     pandera_schema: DataFrameSchema,
+     checkpoint_name: str,
+     sample_frac: Optional[float] = 1.0,
+     sample_number: Optional[int] = None,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     job_context: Optional[SnowparkJobContext] = None,
+     output_path: Optional[str] = None,
+ ):
+     ...
+ ```
+
+ - `pandera_schema`: Pandera schema to validate.
+ - `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+ - `sample_frac`: Fraction of the DataFrame to sample.
+ - `sample_number`: Number of rows to sample.
+ - `sampling_strategy`: Sampling strategy to use.
+ - `job_context`: Snowpark job context.
+ - `output_path`: Output path for the checkpoint report.
+
+ ### Usage Example
+
+ #### Check Input Schema Example
+ ```python
+ from pandas import DataFrame as PandasDataFrame
+ from pandera import DataFrameSchema, Column, Check
+ from snowflake.snowpark import Session
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
+ from numpy import int8
+
+ df = PandasDataFrame(
+     {
+         "COLUMN1": [1, 4, 0, 10, 9],
+         "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+     }
+ )
+
+ in_schema = DataFrameSchema(
+     {
+         "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
+         "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
+     }
+ )
+
+ @check_input_schema(in_schema, "input_schema_checkpoint")
+ def preprocessor(dataframe: SnowparkDataFrame):
+     dataframe = dataframe.with_column(
+         "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+     )
+     return dataframe
+
+ session = Session.builder.getOrCreate()
+ sp_dataframe = session.create_dataframe(df)
+
+ preprocessed_dataframe = preprocessor(sp_dataframe)
+ ```
+
+ #### Check Output Schema Example
+ ```python
+ from pandas import DataFrame as PandasDataFrame
+ from pandera import DataFrameSchema, Column, Check
+ from snowflake.snowpark import Session
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
+ from numpy import int8
+
+ df = PandasDataFrame(
+     {
+         "COLUMN1": [1, 4, 0, 10, 9],
+         "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+     }
+ )
+
+ out_schema = DataFrameSchema(
+     {
+         "COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
+         "COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
+         "COLUMN3": Column(float, Check.less_than(10)),
+     }
+ )
+
+ @check_output_schema(out_schema, "output_schema_checkpoint")
+ def preprocessor(dataframe: SnowparkDataFrame):
+     return dataframe.with_column(
+         "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+     )
+
+ session = Session.builder.getOrCreate()
+ sp_dataframe = session.create_dataframe(df)
+
+ preprocessed_dataframe = preprocessor(sp_dataframe)
+ ```
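+
+ Since `@check_input_schema` and `@check_output_schema` are ordinary Python decorators, they can presumably be stacked to validate both the input and the output of the same function. A minimal sketch, reusing `in_schema` and `out_schema` from the two examples above; that the decorators compose this way is an assumption, not confirmed by this README.
+
+ ```python
+ # Assumed: the decorators stack like ordinary Python decorators.
+ @check_input_schema(in_schema, "combined_input_checkpoint")
+ @check_output_schema(out_schema, "combined_output_checkpoint")
+ def preprocessor(dataframe: SnowparkDataFrame):
+     return dataframe.with_column(
+         "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+     )
+ ```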
+
+ ------
@@ -0,0 +1,26 @@
+ snowflake/snowpark_checkpoints/__init__.py,sha256=p7fzH3f8foD5nhNJHZ00JT3ODTXJGGkWTd3xRKx-8aQ,1435
+ snowflake/snowpark_checkpoints/__version__.py,sha256=kbbDnlkY7JOLNHvfWYkCO_mOBOV9GniMGdxYoQpLhyg,632
+ snowflake/snowpark_checkpoints/checkpoint.py,sha256=i-iDRYbGvQHy9ipW7UxHVhJhQ9BXNSO-bsCcHyg3oLA,22056
+ snowflake/snowpark_checkpoints/errors.py,sha256=9KjzRf8bjDZTTNL4LeySJAwuucDOyz0Ka7EFBKWFpyg,1821
+ snowflake/snowpark_checkpoints/job_context.py,sha256=RMK0g0HrbDVrOAvai4PgsGvsAn_GIo9aFmh-tWlyieY,4183
+ snowflake/snowpark_checkpoints/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
+ snowflake/snowpark_checkpoints/snowpark_sampler.py,sha256=Qxv-8nRGuf-ab3GoSUt8_MNL0ppjoBIMOFIMkqmwN5I,4668
+ snowflake/snowpark_checkpoints/spark_migration.py,sha256=s2HqomYx76Hqn71g9TleBeHI3t1nirgfPvkggqQQdts,10253
+ snowflake/snowpark_checkpoints/validation_result_metadata.py,sha256=5C8f1g-Grs2ydpXiZBLGt5n9cvEHBaw2-CDeb2vnhpg,5847
+ snowflake/snowpark_checkpoints/validation_results.py,sha256=J8OcpNty6hQD8RbAy8xmA0UMbPWfXSmQnHYspWWSisk,1502
+ snowflake/snowpark_checkpoints/io_utils/__init__.py,sha256=fmSEYcBGNASBanNvMVW-uv6hcoYre6kEH35K-RliuiA,954
+ snowflake/snowpark_checkpoints/io_utils/io_default_strategy.py,sha256=VMfdqj4uDgTEinmpC3D0zXncIB9FxWJod1rI-Yt3YVA,1869
+ snowflake/snowpark_checkpoints/io_utils/io_env_strategy.py,sha256=ltG_rxm0CkJFXpskOf__ByZw-C6B9LtycqlyB9EmaJI,3569
+ snowflake/snowpark_checkpoints/io_utils/io_file_manager.py,sha256=YHrxRBzTlhIUrSFrsoWkRY_Qa-TXgDWglr00T98Tc5g,2485
+ snowflake/snowpark_checkpoints/utils/__init__.py,sha256=I4srmZ8G1q9DU6Suo1S91aVfNvETyisKH95uvLAvEJ0,609
+ snowflake/snowpark_checkpoints/utils/constants.py,sha256=pgFttLDQ6fTa6obSdvivWBYClS21ap41YVDNGAS4sxY,4146
+ snowflake/snowpark_checkpoints/utils/extra_config.py,sha256=xOYaG6MfsUCAHI0C_7qWF_m96xcLIZWwrgxY4UlpaZI,4325
+ snowflake/snowpark_checkpoints/utils/logging_utils.py,sha256=yyi6X5DqKeTg0HRhvsH6ymYp2P0wbnyKIzI2RzrQS7k,2278
+ snowflake/snowpark_checkpoints/utils/pandera_check_manager.py,sha256=tQIozLO-2kM8WZ-gGKfRwmXBx1cDPaIZB0qIcArp8xA,16100
+ snowflake/snowpark_checkpoints/utils/supported_types.py,sha256=GrMX2tHdSFnK7LlPbZx20UufD6Br6TNVRkkBwIxdPy0,1433
+ snowflake/snowpark_checkpoints/utils/telemetry.py,sha256=GfuyIaI3QG4a4_qWwyJHvWRM0GENunNexuEJ6IgscF4,32684
+ snowflake/snowpark_checkpoints/utils/utils_checks.py,sha256=oQ1c4n-uAA2kFIpWIRPWhbCW8e-wwOIL8qDqLvr5Fok,14398
+ snowpark_checkpoints_validators-0.3.0.dist-info/METADATA,sha256=RbOlEHK5kumiBPP2S7-7k7zxzzLYag7Yb6TtQeOYbV0,11557
+ snowpark_checkpoints_validators-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ snowpark_checkpoints_validators-0.3.0.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
+ snowpark_checkpoints_validators-0.3.0.dist-info/RECORD,,