snowpark-checkpoints-validators 0.2.0rc1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. snowflake/snowpark_checkpoints/__init__.py +44 -0
  2. snowflake/snowpark_checkpoints/__version__.py +16 -0
  3. snowflake/snowpark_checkpoints/checkpoint.py +580 -0
  4. snowflake/snowpark_checkpoints/errors.py +60 -0
  5. snowflake/snowpark_checkpoints/job_context.py +128 -0
  6. snowflake/snowpark_checkpoints/singleton.py +23 -0
  7. snowflake/snowpark_checkpoints/snowpark_sampler.py +124 -0
  8. snowflake/snowpark_checkpoints/spark_migration.py +255 -0
  9. snowflake/snowpark_checkpoints/utils/__init__.py +14 -0
  10. snowflake/snowpark_checkpoints/utils/constants.py +134 -0
  11. snowflake/snowpark_checkpoints/utils/extra_config.py +89 -0
  12. snowflake/snowpark_checkpoints/utils/logging_utils.py +67 -0
  13. snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +399 -0
  14. snowflake/snowpark_checkpoints/utils/supported_types.py +65 -0
  15. snowflake/snowpark_checkpoints/utils/telemetry.py +900 -0
  16. snowflake/snowpark_checkpoints/utils/utils_checks.py +395 -0
  17. snowflake/snowpark_checkpoints/validation_result_metadata.py +155 -0
  18. snowflake/snowpark_checkpoints/validation_results.py +49 -0
  19. snowpark_checkpoints_validators-0.2.1.dist-info/METADATA +323 -0
  20. snowpark_checkpoints_validators-0.2.1.dist-info/RECORD +22 -0
  21. snowpark_checkpoints_validators-0.2.0rc1.dist-info/METADATA +0 -514
  22. snowpark_checkpoints_validators-0.2.0rc1.dist-info/RECORD +0 -4
  23. {snowpark_checkpoints_validators-0.2.0rc1.dist-info → snowpark_checkpoints_validators-0.2.1.dist-info}/WHEEL +0 -0
  24. {snowpark_checkpoints_validators-0.2.0rc1.dist-info → snowpark_checkpoints_validators-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,323 @@
+ Metadata-Version: 2.4
+ Name: snowpark-checkpoints-validators
+ Version: 0.2.1
+ Summary: Migration tools for Snowpark
+ Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
+ Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
+ Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
+ License: Apache License, Version 2.0
+ License-File: LICENSE
+ Keywords: Snowflake,Snowpark,analytics,cloud,database,db
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Environment :: Console
+ Classifier: Environment :: Other Environment
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Education
+ Classifier: Intended Audience :: Information Technology
+ Classifier: Intended Audience :: System Administrators
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: SQL
+ Classifier: Topic :: Database
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Software Development
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: <3.12,>=3.9
+ Requires-Dist: pandera[io]==0.20.4
+ Requires-Dist: snowflake-connector-python[pandas]
+ Requires-Dist: snowflake-snowpark-python>=1.23.0
+ Provides-Extra: development
+ Requires-Dist: coverage>=7.6.7; extra == 'development'
+ Requires-Dist: deepdiff==8.1.1; extra == 'development'
+ Requires-Dist: deepdiff>=8.0.0; extra == 'development'
+ Requires-Dist: hatchling==1.25.0; extra == 'development'
+ Requires-Dist: pre-commit>=4.0.1; extra == 'development'
+ Requires-Dist: pyarrow>=18.0.0; extra == 'development'
+ Requires-Dist: pyspark>=3.5.0; extra == 'development'
+ Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
+ Requires-Dist: pytest>=8.3.3; extra == 'development'
+ Requires-Dist: setuptools>=70.0.0; extra == 'development'
+ Requires-Dist: twine==5.1.1; extra == 'development'
+ Provides-Extra: pyspark
+ Requires-Dist: pyspark>=3.5.0; extra == 'pyspark'
+ Description-Content-Type: text/markdown
+
+ # snowpark-checkpoints-validators
+
+ ---
+ ##### This package is on Public Preview.
+ ---
+
+ **snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
+
+ ---
+ ## Install the library
+ ```bash
+ pip install snowpark-checkpoints-validators
+ ```
+ This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
+ ```bash
+ pip install "snowpark-checkpoints-validators[pyspark]"
+ ```
+ ---
+
+ ## Features
+
+ - Validate Snowpark DataFrames against predefined Pandera schemas.
+ - Perform custom checks and skip specific checks as needed.
+ - Generate validation results and log them for further analysis.
+ - Support for sampling strategies to validate large datasets efficiently.
+ - Integration with PySpark for cross-validation between Snowpark and PySpark DataFrames.
+
+ ## Functionalities
+
+ ### Validate DataFrame Schema from File
+
+ The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or DataFrame.
+
+ ```python
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from snowflake.snowpark_checkpoints.utils.constants import (
+     CheckpointMode,
+ )
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from typing import Any, Optional
+
+ # Signature of the function
+ def validate_dataframe_checkpoint(
+     df: SnowparkDataFrame,
+     checkpoint_name: str,
+     job_context: Optional[SnowparkJobContext] = None,
+     mode: Optional[CheckpointMode] = CheckpointMode.SCHEMA,
+     custom_checks: Optional[dict[Any, Any]] = None,
+     skip_checks: Optional[dict[Any, Any]] = None,
+     sample_frac: Optional[float] = 1.0,
+     sample_number: Optional[int] = None,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     output_path: Optional[str] = None,
+ ):
+     ...
+ ```
+
+ - `df`: Snowpark DataFrame to validate.
+ - `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+ - `job_context`: Snowpark job context.
+ - `mode`: Checkpoint mode (schema or data).
+ - `custom_checks`: Custom checks to perform (see the sketch after this list).
+ - `skip_checks`: Checks to skip (see the sketch after this list).
+ - `sample_frac`: Fraction of the DataFrame to sample.
+ - `sample_number`: Number of rows to sample.
+ - `sampling_strategy`: Sampling strategy to use.
+ - `output_path`: Output path for the checkpoint report.
+
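+ A hypothetical sketch of the `custom_checks` and `skip_checks` arguments. The column names and check contents below are illustrative only, and the assumed shape (a column name mapped to a list of Pandera `Check` objects, or to the names of checks to skip) is an assumption not confirmed by this README:
+
+ ```python
+ from pandera import Check
+
+ # Assumed shape: column name -> extra Pandera checks to run on that column.
+ custom_checks = {
+     "COLUMN1": [Check(lambda x: x >= 0, element_wise=True)],
+ }
+
+ # Assumed shape: column name -> names of generated checks to skip.
+ skip_checks = {
+     "COLUMN2": ["mean"],
+ }
+ ```
+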
+ ### Usage Example
+
+ ```python
+ from snowflake.snowpark import Session
+ from snowflake.snowpark_checkpoints.utils.constants import (
+     CheckpointMode,
+ )
+ from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pyspark.sql import SparkSession
+
+ session = Session.builder.getOrCreate()
+ job_context = SnowparkJobContext(
+     session, SparkSession.builder.getOrCreate(), "job_context", True
+ )
+ df = session.read.format("csv").load("data.csv")
+
+ validate_dataframe_checkpoint(
+     df,
+     "schema_checkpoint",
+     job_context=job_context,
+     mode=CheckpointMode.SCHEMA,
+     sample_frac=0.1,
+     sampling_strategy=SamplingStrategy.RANDOM_SAMPLE,
+ )
+ ```
+
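+ For full data validation rather than schema-only validation, the same call can be made in the data-comparison mode. This is a minimal sketch, assuming `CheckpointMode.DATAFRAME` is the data-validation mode and that the "schema_checkpoint" checkpoint data was collected beforehand (for example with the companion snowpark-checkpoints-collectors package):
+
+ ```python
+ # Assumes the checkpoint data was already collected; otherwise the
+ # validation has nothing to compare against.
+ validate_dataframe_checkpoint(
+     df,
+     "schema_checkpoint",
+     job_context=job_context,
+     mode=CheckpointMode.DATAFRAME,  # compare the data itself, not just the schema
+ )
+ ```
+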
+ ### Check with Spark Decorator
+
+ The `check_with_spark` decorator samples the Snowpark DataFrame arguments of a function and converts them to PySpark DataFrames. It then executes the provided Spark function with those inputs and compares the outputs of the two implementations.
+
+ ```python
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from typing import Callable, Optional, TypeVar
+
+ fn = TypeVar("fn", bound=Callable)
+
+ # Signature of the decorator
+ def check_with_spark(
+     job_context: Optional[SnowparkJobContext],
+     spark_function: fn,
+     checkpoint_name: str,
+     sample_number: Optional[int] = 100,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     output_path: Optional[str] = None,
+ ) -> Callable[[fn], fn]:
+     ...
+ ```
+
+ - `job_context`: Snowpark job context.
+ - `spark_function`: PySpark function to execute.
+ - `checkpoint_name`: Name of the check.
+ - `sample_number`: Number of rows to sample.
+ - `sampling_strategy`: Sampling strategy to use.
+ - `output_path`: Output path for the checkpoint report.
+
+ ### Usage Example
+
+ ```python
+ from snowflake.snowpark import Session
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
+
+ session = Session.builder.getOrCreate()
+ job_context = SnowparkJobContext(
+     session, SparkSession.builder.getOrCreate(), "job_context", True
+ )
+
+ def my_spark_scalar_fn(df: SparkDataFrame):
+     return df.count()
+
+ @check_with_spark(
+     job_context=job_context,
+     spark_function=my_spark_scalar_fn,
+     checkpoint_name="count_checkpoint",
+ )
+ def my_snowpark_scalar_fn(df: SnowparkDataFrame):
+     return df.count()
+
+ df = job_context.snowpark_session.create_dataframe(
+     [[1, 2], [3, 4]], schema=["a", "b"]
+ )
+ count = my_snowpark_scalar_fn(df)
+ ```
+
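+ The compared functions do not have to return scalars. Below is a minimal sketch of comparing two DataFrame-returning implementations, assuming the decorator also supports DataFrame results; the function names and checkpoint name are illustrative:
+
+ ```python
+ def my_spark_transform(df: SparkDataFrame) -> SparkDataFrame:
+     # PySpark reference implementation.
+     return df.filter(df["a"] > 1)
+
+ @check_with_spark(
+     job_context=job_context,
+     spark_function=my_spark_transform,
+     checkpoint_name="filter_checkpoint",
+ )
+ def my_snowpark_transform(df: SnowparkDataFrame) -> SnowparkDataFrame:
+     # Snowpark implementation under migration; its sampled output is
+     # compared against the PySpark output.
+     return df.filter(df["a"] > 1)
+ ```
+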
+ ### Pandera Snowpark Decorators
+
+ The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark DataFrames in the input arguments or in the return value.
+
+ ```python
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pandera import DataFrameSchema
+ from typing import Optional
+
+ # Signature of the decorator
+ def check_input_schema(
+     pandera_schema: DataFrameSchema,
+     checkpoint_name: str,
+     sample_frac: Optional[float] = 1.0,
+     sample_number: Optional[int] = None,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     job_context: Optional[SnowparkJobContext] = None,
+     output_path: Optional[str] = None,
+ ):
+     ...
+
+ # Signature of the decorator
+ def check_output_schema(
+     pandera_schema: DataFrameSchema,
+     checkpoint_name: str,
+     sample_frac: Optional[float] = 1.0,
+     sample_number: Optional[int] = None,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     job_context: Optional[SnowparkJobContext] = None,
+     output_path: Optional[str] = None,
+ ):
+     ...
+ ```
+
+ - `pandera_schema`: Pandera schema to validate.
+ - `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+ - `sample_frac`: Fraction of the DataFrame to sample.
+ - `sample_number`: Number of rows to sample.
+ - `sampling_strategy`: Sampling strategy to use.
+ - `job_context`: Snowpark job context.
+ - `output_path`: Output path for the checkpoint report.
+
+ ### Usage Example
+
+ #### Check Input Schema Example
+ ```python
+ from pandas import DataFrame as PandasDataFrame
+ from pandera import DataFrameSchema, Column, Check
+ from snowflake.snowpark import Session
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
+ from numpy import int8
+
+ df = PandasDataFrame(
+     {
+         "COLUMN1": [1, 4, 0, 10, 9],
+         "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+     }
+ )
+
+ in_schema = DataFrameSchema(
+     {
+         "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
+         "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
+     }
+ )
+
+ @check_input_schema(in_schema, "input_schema_checkpoint")
+ def preprocessor(dataframe: SnowparkDataFrame):
+     dataframe = dataframe.with_column(
+         "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+     )
+     return dataframe
+
+ session = Session.builder.getOrCreate()
+ sp_dataframe = session.create_dataframe(df)
+
+ preprocessed_dataframe = preprocessor(sp_dataframe)
+ ```
+
+ #### Check Output Schema Example
+ ```python
+ from pandas import DataFrame as PandasDataFrame
+ from pandera import DataFrameSchema, Column, Check
+ from snowflake.snowpark import Session
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
+ from numpy import int8
+
+ df = PandasDataFrame(
+     {
+         "COLUMN1": [1, 4, 0, 10, 9],
+         "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+     }
+ )
+
+ out_schema = DataFrameSchema(
+     {
+         "COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
+         "COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
+         "COLUMN3": Column(float, Check.less_than(10)),
+     }
+ )
+
+ @check_output_schema(out_schema, "output_schema_checkpoint")
+ def preprocessor(dataframe: SnowparkDataFrame):
+     return dataframe.with_column(
+         "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+     )
+
+ session = Session.builder.getOrCreate()
+ sp_dataframe = session.create_dataframe(df)
+
+ preprocessed_dataframe = preprocessor(sp_dataframe)
+ ```
+
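+ Since both decorators wrap a function like ordinary Python decorators, they can be stacked to validate the input and the output in one pass. A minimal sketch reusing the schemas above, assuming the decorators compose as regular Python decorators; the checkpoint names are illustrative:
+
+ ```python
+ # The input schema is checked on the way in, the output schema on the way out.
+ @check_input_schema(in_schema, "combined_input_checkpoint")
+ @check_output_schema(out_schema, "combined_output_checkpoint")
+ def preprocessor(dataframe: SnowparkDataFrame):
+     return dataframe.with_column(
+         "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+     )
+ ```
+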
+ ------
@@ -0,0 +1,22 @@
+ snowflake/snowpark_checkpoints/__init__.py,sha256=p7fzH3f8foD5nhNJHZ00JT3ODTXJGGkWTd3xRKx-8aQ,1435
+ snowflake/snowpark_checkpoints/__version__.py,sha256=jEnm4p_P4FqdYsTq3hnGQnhLZ4KwL0_Ew8fDF8BRL98,632
+ snowflake/snowpark_checkpoints/checkpoint.py,sha256=i-iDRYbGvQHy9ipW7UxHVhJhQ9BXNSO-bsCcHyg3oLA,22056
+ snowflake/snowpark_checkpoints/errors.py,sha256=9KjzRf8bjDZTTNL4LeySJAwuucDOyz0Ka7EFBKWFpyg,1821
+ snowflake/snowpark_checkpoints/job_context.py,sha256=RMK0g0HrbDVrOAvai4PgsGvsAn_GIo9aFmh-tWlyieY,4183
+ snowflake/snowpark_checkpoints/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
+ snowflake/snowpark_checkpoints/snowpark_sampler.py,sha256=Qxv-8nRGuf-ab3GoSUt8_MNL0ppjoBIMOFIMkqmwN5I,4668
+ snowflake/snowpark_checkpoints/spark_migration.py,sha256=s2HqomYx76Hqn71g9TleBeHI3t1nirgfPvkggqQQdts,10253
+ snowflake/snowpark_checkpoints/validation_result_metadata.py,sha256=fm2lKxjYlzlL6qsiv2icR9k5o7YNd2OwvFhiqGYrTpo,5745
+ snowflake/snowpark_checkpoints/validation_results.py,sha256=J8OcpNty6hQD8RbAy8xmA0UMbPWfXSmQnHYspWWSisk,1502
+ snowflake/snowpark_checkpoints/utils/__init__.py,sha256=I4srmZ8G1q9DU6Suo1S91aVfNvETyisKH95uvLAvEJ0,609
+ snowflake/snowpark_checkpoints/utils/constants.py,sha256=pgFttLDQ6fTa6obSdvivWBYClS21ap41YVDNGAS4sxY,4146
+ snowflake/snowpark_checkpoints/utils/extra_config.py,sha256=LvOdIhvE450AV0wLVK5P_hANvcNzAv8pLNe7Ksr598U,2802
+ snowflake/snowpark_checkpoints/utils/logging_utils.py,sha256=yyi6X5DqKeTg0HRhvsH6ymYp2P0wbnyKIzI2RzrQS7k,2278
+ snowflake/snowpark_checkpoints/utils/pandera_check_manager.py,sha256=tQIozLO-2kM8WZ-gGKfRwmXBx1cDPaIZB0qIcArp8xA,16100
+ snowflake/snowpark_checkpoints/utils/supported_types.py,sha256=GrMX2tHdSFnK7LlPbZx20UufD6Br6TNVRkkBwIxdPy0,1433
+ snowflake/snowpark_checkpoints/utils/telemetry.py,sha256=_WOVo19BxcF6cpQDplID6BEOvgJfHTGK1JZI1-OI4uc,31370
+ snowflake/snowpark_checkpoints/utils/utils_checks.py,sha256=LFdEzVgirkymXD5LlzuE_lv43yAa3OMIXEnloRAXkGc,14204
+ snowpark_checkpoints_validators-0.2.1.dist-info/METADATA,sha256=nhKZaDnpjcwwsH4PTAxqtFCqJEZ_UY-p0J_S5863Tvs,11470
+ snowpark_checkpoints_validators-0.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ snowpark_checkpoints_validators-0.2.1.dist-info/licenses/LICENSE,sha256=pmjhbh6uVhV5MBXOlou_UZgFP7CYVQITkCCdvfcS5lY,11340
+ snowpark_checkpoints_validators-0.2.1.dist-info/RECORD,,