snowpark-checkpoints-validators 0.1.0rc2__tar.gz → 0.1.0rc3__tar.gz
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- snowpark_checkpoints_validators-0.1.0rc3/PKG-INFO +313 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/pyproject.toml +6 -4
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/telemetry.py +68 -12
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_compare_utils.py +6 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/df_mode_dataframe_mismatch_telemetry.json +5 -4
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/df_mode_dataframe_telemetry.json +5 -4
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +18 -0
- snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +18 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/spark_checkpoint_scalar_fail_telemetry.json +5 -4
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/spark_checkpoint_scalar_passing_telemetry.json +5 -4
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_df_check_custom_check_telemetry.json +6 -5
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_df_check_fail_telemetry.json +6 -5
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_df_check_from_file_telemetry.json +6 -5
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_df_check_skip_check_telemetry.json +6 -5
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_df_check_telemetry.json +6 -5
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_input_fail_telemetry.json +5 -4
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_input_telemetry.json +5 -4
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_output_fail_telemetry.json +5 -4
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_output_telemetry.json +5 -4
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_telemetry.py +97 -18
- snowpark_checkpoints_validators-0.1.0rc2/PKG-INFO +0 -514
- snowpark_checkpoints_validators-0.1.0rc2/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.1.0rc2/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.1.0rc2/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +0 -17
- snowpark_checkpoints_validators-0.1.0rc2/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +0 -17
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/.gitignore +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/LICENSE +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/README.md +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/__init__.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/checkpoint.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/errors.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/job_context.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/singleton.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/snowpark_sampler.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/spark_migration.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/__init__.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/checkpoint_logger.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/constants.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/extra_config.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/supported_types.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/utils_checks.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/validation_result_metadata.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/validation_results.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/.coveragerc +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/e2eexample.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_pandera.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_parquet.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_spark_checkpoint.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_extra_config.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_pandera_check_manager.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_spark_migration.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_utils_checks.py +0 -0
- {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_validation_result_metadata.py +0 -0
snowpark_checkpoints_validators-0.1.0rc3/PKG-INFO
ADDED
@@ -0,0 +1,313 @@
+Metadata-Version: 2.4
+Name: snowpark-checkpoints-validators
+Version: 0.1.0rc3
+Summary: Migration tools for Snowpark
+Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
+Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
+Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
+License: Apache License, Version 2.0
+License-File: LICENSE
+Keywords: Snowflake,Snowpark,analytics,cloud,database,db
+Classifier: Development Status :: 4 - Beta
+Classifier: Environment :: Console
+Classifier: Environment :: Other Environment
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Education
+Classifier: Intended Audience :: Information Technology
+Classifier: Intended Audience :: System Administrators
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: SQL
+Classifier: Topic :: Database
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Software Development
+Classifier: Topic :: Software Development :: Libraries
+Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: <3.12,>=3.9
+Requires-Dist: pandera-report==0.1.2
+Requires-Dist: pandera[io]==0.20.4
+Requires-Dist: pyspark
+Requires-Dist: snowflake-connector-python
+Requires-Dist: snowflake-snowpark-python
+Provides-Extra: development
+Requires-Dist: coverage>=7.6.7; extra == 'development'
+Requires-Dist: deepdiff>=8.0.0; extra == 'development'
+Requires-Dist: hatchling==1.25.0; extra == 'development'
+Requires-Dist: pre-commit>=4.0.1; extra == 'development'
+Requires-Dist: pyarrow>=18.0.0; extra == 'development'
+Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
+Requires-Dist: pytest>=8.3.3; extra == 'development'
+Requires-Dist: setuptools>=70.0.0; extra == 'development'
+Requires-Dist: twine==5.1.1; extra == 'development'
+Description-Content-Type: text/markdown
+
+# snowpark-checkpoints-validators
+
+---
+**NOTE**
+
+This package is in Private Preview.
+
+---
+
+**snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
+
+## Features
+
+- Validate Snowpark DataFrames against predefined Pandera schemas.
+- Perform custom checks and skip specific checks as needed.
+- Generate validation results and log them for further analysis.
+- Support for sampling strategies to validate large datasets efficiently.
+- Integration with PySpark for cross-validation between Snowpark and PySpark DataFrames.
+
+## Functionalities
+
+### Validate DataFrame Schema from File
+
+The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or DataFrame.
+
+```python
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from snowflake.snowpark_checkpoints.utils.constants import (
+    CheckpointMode,
+)
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from typing import Any, Optional
+
+# Signature of the function
+def validate_dataframe_checkpoint(
+    df: SnowparkDataFrame,
+    checkpoint_name: str,
+    job_context: Optional[SnowparkJobContext] = None,
+    mode: Optional[CheckpointMode] = CheckpointMode.SCHEMA,
+    custom_checks: Optional[dict[Any, Any]] = None,
+    skip_checks: Optional[dict[Any, Any]] = None,
+    sample_frac: Optional[float] = 1.0,
+    sample_number: Optional[int] = None,
+    sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+    output_path: Optional[str] = None,
+):
+    ...
+```
+
+- `df`: Snowpark DataFrame to validate.
+- `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+- `job_context`: Snowpark job context.
+- `mode`: Checkpoint mode (schema or data).
+- `custom_checks`: Custom checks to perform.
+- `skip_checks`: Checks to skip.
+- `sample_frac`: Fraction of the DataFrame to sample.
+- `sample_number`: Number of rows to sample.
+- `sampling_strategy`: Sampling strategy to use.
+- `output_path`: Output path for the checkpoint report.
+
+### Usage Example
+
+```python
+from snowflake.snowpark import Session
+from snowflake.snowpark_checkpoints.utils.constants import (
+    CheckpointMode,
+)
+from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pyspark.sql import SparkSession
+
+session = Session.builder.getOrCreate()
+job_context = SnowparkJobContext(
+    session, SparkSession.builder.getOrCreate(), "job_context", True
+)
+df = session.read.format("csv").load("data.csv")
+
+validate_dataframe_checkpoint(
+    df,
+    "schema_checkpoint",
+    job_context=job_context,
+    mode=CheckpointMode.SCHEMA,
+    sample_frac=0.1,
+    sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
+)
+```
+
+### Check with Spark Decorator
+
+The `check_with_spark` decorator takes the Snowpark DataFrame arguments of the decorated function, samples them, and converts them to PySpark DataFrames. It then executes a provided Spark function with those inputs and compares its output to the output of the Snowpark function.
+
+```python
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from typing import Callable, Optional, TypeVar
+
+F = TypeVar("F", bound=Callable)
+
+# Signature of the decorator
+def check_with_spark(
+    job_context: Optional[SnowparkJobContext],
+    spark_function: F,
+    checkpoint_name: str,
+    sample_number: Optional[int] = 100,
+    sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+    output_path: Optional[str] = None,
+) -> Callable[[F], F]:
+    ...
+```
+
+- `job_context`: Snowpark job context.
+- `spark_function`: PySpark function to execute.
+- `checkpoint_name`: Name of the check.
+- `sample_number`: Number of rows to sample.
+- `sampling_strategy`: Sampling strategy to use.
+- `output_path`: Output path for the checkpoint report.
+
+### Usage Example
+
+```python
+from snowflake.snowpark import Session
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
+
+session = Session.builder.getOrCreate()
+job_context = SnowparkJobContext(
+    session, SparkSession.builder.getOrCreate(), "job_context", True
+)
+
+def my_spark_scalar_fn(df: SparkDataFrame):
+    return df.count()
+
+@check_with_spark(
+    job_context=job_context,
+    spark_function=my_spark_scalar_fn,
+    checkpoint_name="count_checkpoint",
+)
+def my_snowpark_scalar_fn(df: SnowparkDataFrame):
+    return df.count()
+
+df = job_context.snowpark_session.create_dataframe(
+    [[1, 2], [3, 4]], schema=["a", "b"]
+)
+count = my_snowpark_scalar_fn(df)
+```
+
+### Pandera Snowpark Decorators
+
+The decorators `@check_input_schema` and `@check_output_schema` perform sampled schema validation of a function's Snowpark DataFrame arguments or of its return value.
+
+```python
+from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+from pandera import DataFrameSchema
+from typing import Optional
+
+# Signature of the decorator
+def check_input_schema(
+    pandera_schema: DataFrameSchema,
+    checkpoint_name: str,
+    sample_frac: Optional[float] = 1.0,
+    sample_number: Optional[int] = None,
+    sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+    job_context: Optional[SnowparkJobContext] = None,
+    output_path: Optional[str] = None,
+):
+    ...
+
+# Signature of the decorator
+def check_output_schema(
+    pandera_schema: DataFrameSchema,
+    checkpoint_name: str,
+    sample_frac: Optional[float] = 1.0,
+    sample_number: Optional[int] = None,
+    sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+    job_context: Optional[SnowparkJobContext] = None,
+    output_path: Optional[str] = None,
+):
+    ...
+```
+
+- `pandera_schema`: Pandera schema to validate.
+- `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+- `sample_frac`: Fraction of the DataFrame to sample.
+- `sample_number`: Number of rows to sample.
+- `sampling_strategy`: Sampling strategy to use.
+- `job_context`: Snowpark job context.
+- `output_path`: Output path for the checkpoint report.
+
+### Usage Example
+
+#### Check Input Schema Example
+```python
+from pandas import DataFrame as PandasDataFrame
+from pandera import DataFrameSchema, Column, Check
+from snowflake.snowpark import Session
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
+from numpy import int8
+
+df = PandasDataFrame(
+    {
+        "COLUMN1": [1, 4, 0, 10, 9],
+        "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+    }
+)
+
+in_schema = DataFrameSchema(
+    {
+        "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
+        "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
+    }
+)
+
+@check_input_schema(in_schema, "input_schema_checkpoint")
+def preprocessor(dataframe: SnowparkDataFrame):
+    dataframe = dataframe.with_column(
+        "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+    )
+    return dataframe
+
+session = Session.builder.getOrCreate()
+sp_dataframe = session.create_dataframe(df)
+
+preprocessed_dataframe = preprocessor(sp_dataframe)
+```
+
+#### Check Output Schema Example
+```python
+from pandas import DataFrame as PandasDataFrame
+from pandera import DataFrameSchema, Column, Check
+from snowflake.snowpark import Session
+from snowflake.snowpark import DataFrame as SnowparkDataFrame
+from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
+from numpy import int8
+
+df = PandasDataFrame(
+    {
+        "COLUMN1": [1, 4, 0, 10, 9],
+        "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+    }
+)
+
+out_schema = DataFrameSchema(
+    {
+        "COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
+        "COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
+        "COLUMN3": Column(float, Check.less_than(10)),
+    }
+)
+
+@check_output_schema(out_schema, "output_schema_checkpoint")
+def preprocessor(dataframe: SnowparkDataFrame):
+    return dataframe.with_column(
+        "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+    )
+
+session = Session.builder.getOrCreate()
+sp_dataframe = session.create_dataframe(df)
+
+preprocessed_dataframe = preprocessor(sp_dataframe)
+```
+
+------
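
The README above exercises only `CheckpointMode.SCHEMA`, while the telemetry fixtures later in this diff record `"mode": 2`, i.e. the data-comparison mode. A companion call might look like the sketch below; `CheckpointMode.DATAFRAME` as the name of the mode-2 member is an assumption inferred from those payloads, and the rest mirrors the README.

```python
from snowflake.snowpark import Session
from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
from snowflake.snowpark_checkpoints.utils.constants import CheckpointMode
from pyspark.sql import SparkSession

session = Session.builder.getOrCreate()
job_context = SnowparkJobContext(
    session, SparkSession.builder.getOrCreate(), "job_context", True
)
df = session.read.format("csv").load("data.csv")

# Data-comparison mode: row contents are validated, not just column types.
# CheckpointMode.DATAFRAME (value 2) is inferred from the "mode": 2 telemetry below.
validate_dataframe_checkpoint(
    df,
    "data_checkpoint",
    job_context=job_context,
    mode=CheckpointMode.DATAFRAME,
)
```
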
{snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/pyproject.toml
RENAMED
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
 requires = ["hatchling"]
 
 [project]
-authors = [
+authors = [
+  {name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
+]
 classifiers = [
   "Development Status :: 4 - Beta",
   "Environment :: Console",
@@ -31,6 +33,7 @@ dependencies = [
   "pandera-report==0.1.2",
 ]
 description = "Migration tools for Snowpark"
+dynamic = ['version']
 keywords = [
   'Snowflake',
   'analytics',
@@ -39,11 +42,10 @@ keywords = [
   'db',
   'Snowpark',
 ]
-license = {
+license = {text = "Apache License, Version 2.0"}
 name = "snowpark-checkpoints-validators"
 readme = "README.md"
 requires-python = '>=3.9,<3.12'
-dynamic = ['version']
 
 [project.optional-dependencies]
 development = [
@@ -118,7 +120,7 @@ check = [
 
 [tool.hatch.envs.test.scripts]
 check = [
-  "pip install -e ../snowpark-checkpoints-configuration"
+  "pip install -e ../snowpark-checkpoints-configuration",
  'pytest -v --junitxml=test/outcome/test-results.xml --cov=. --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
 ]
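
The pyproject.toml hunks repair two incomplete TOML values (`authors`, `license`), relocate `dynamic = ['version']`, and add the missing comma in the test script list. A quick way to sanity-check the resulting metadata is the standard library's `importlib.metadata` (a sketch; it assumes the wheel is installed in the current environment):

```python
from importlib.metadata import metadata

# Field names follow the core-metadata spec, matching the PKG-INFO above.
meta = metadata("snowpark-checkpoints-validators")
print(meta["Version"])       # e.g. 0.1.0rc3
print(meta["Author-email"])  # "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
print(meta["License"])       # Apache License, Version 2.0
```
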
{snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/telemetry.py
RENAMED
@@ -2,15 +2,17 @@
 # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
 #
 
+import atexit
 import datetime
 import hashlib
 import inspect
 import json
 import os
+import re
 
+from contextlib import suppress
 from enum import IntEnum
 from functools import wraps
-from os import getcwd, getenv, makedirs
 from pathlib import Path
 from platform import python_version
 from sys import platform
@@ -47,6 +49,10 @@ except Exception:
     pass
 
 
+VERSION_VARIABLE_PATTERN = r"^__version__ = ['\"]([^'\"]*)['\"]"
+VERSION_FILE_NAME = "__version__.py"
+
+
 class TelemetryManager(TelemetryClient):
     def __init__(self, rest: SnowflakeRestful):
         """TelemetryManager class to log telemetry events."""
@@ -62,6 +68,8 @@ class TelemetryManager(TelemetryClient):
         self._sc_upload_local_telemetry()
         self.sc_log_batch = []
         self.sc_hypothesis_input_events = []
+        self.sc_version = _get_version()
+        atexit.register(self._sc_close_at_exit)
 
     def set_sc_output_path(self, path: Path) -> None:
         """Set the output path for testing.
@@ -115,7 +123,9 @@ class TelemetryManager(TelemetryClient):
         """
         if not self.sc_is_enabled:
             return {}
-        event = _generate_event(
+        event = _generate_event(
+            event_name, event_type, parameters_info, self.sc_version
+        )
         self._sc_add_log_to_batch(event)
         return event
 
@@ -133,10 +143,10 @@ class TelemetryManager(TelemetryClient):
             return
 
         if len(self.sc_log_batch) >= self.sc_flush_size:
-            self.
+            self.sc_send_batch(self.sc_log_batch)
             self.sc_log_batch = []
 
-    def
+    def sc_send_batch(self, to_sent: list) -> bool:
         """Send a request to the API to upload the events. If not have connection, write the events to local folder.
 
         Args:
@@ -176,7 +186,7 @@ class TelemetryManager(TelemetryClient):
 
         """
         try:
-            makedirs(self.sc_folder_path, exist_ok=True)
+            os.makedirs(self.sc_folder_path, exist_ok=True)
             for event in batch:
                 message = event.get("message")
                 if message is not None:
@@ -241,15 +251,15 @@ class TelemetryManager(TelemetryClient):
         """
         if self._sc_is_telemetry_testing():
             return True
-        if getenv("SNOWPARK_CHECKPOINTS_TELEMETRY_ENABLED") == "false":
+        if os.getenv("SNOWPARK_CHECKPOINTS_TELEMETRY_ENABLED") == "false":
            return False
         return self._rest is not None
 
     def _sc_is_telemetry_testing(self) -> bool:
-        is_testing = getenv("SNOWPARK_CHECKPOINTS_TELEMETRY_TESTING") == "true"
+        is_testing = os.getenv("SNOWPARK_CHECKPOINTS_TELEMETRY_TESTING") == "true"
         if is_testing:
             local_telemetry_path = (
-                Path(getcwd()) / "snowpark-checkpoints-output" / "telemetry"
+                Path(os.getcwd()) / "snowpark-checkpoints-output" / "telemetry"
             )
             self.set_sc_output_path(local_telemetry_path)
             return is_testing
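
Beyond swapping the bare `getenv`/`getcwd`/`makedirs` imports for their `os.`-qualified forms, this hunk leaves the opt-out switch itself unchanged. A minimal sketch of disabling telemetry through it (the variable name comes straight from the diff; it must be set before the `TelemetryManager` consults it):

```python
import os

# Opt this process out of snowpark-checkpoints telemetry.
os.environ["SNOWPARK_CHECKPOINTS_TELEMETRY_ENABLED"] = "false"
```
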
@@ -276,11 +286,31 @@ class TelemetryManager(TelemetryClient):
         """
         return event_name in self.sc_hypothesis_input_events
 
+    def _sc_close(self) -> None:
+        """Close the telemetry manager and upload collected events.
+
+        This function closes the telemetry manager, uploads any collected events,
+        and performs any necessary cleanup to ensure no data is lost.
+        """
+        atexit.unregister(self._sc_close_at_exit)
+        if self.sc_log_batch and self.sc_is_enabled and not self.sc_is_testing:
+            self.sc_send_batch(self.sc_log_batch)
+
+    def _sc_close_at_exit(self) -> None:
+        """Close the telemetry manager at exit and upload collected events.
+
+        This function ensures that the telemetry manager is closed and all collected events
+        are uploaded when the program exits, preventing data loss.
+        """
+        with suppress(Exception):
+            self._sc_close()
+
 
 def _generate_event(
     event_name: str,
     event_type: str,
     parameters_info: Optional[dict] = None,
+    sc_version: Optional[str] = None,
 ) -> dict:
     """Generate a telemetry event.
 
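
The new `_sc_close`/`_sc_close_at_exit` pair implements a common flush-on-exit pattern: register the hook at construction, unregister it on explicit close so it cannot run twice, and suppress all errors during interpreter shutdown. A standalone sketch of the same pattern (class and attribute names here are illustrative, not the package's):

```python
import atexit
from contextlib import suppress


class BatchLogger:
    def __init__(self) -> None:
        self.batch: list = []
        # Flush whatever is still buffered when the process exits.
        atexit.register(self._close_at_exit)

    def send(self, batch: list) -> None:
        print(f"uploading {len(batch)} events")

    def close(self) -> None:
        # Unregister first so an explicit close() is not repeated at exit.
        atexit.unregister(self._close_at_exit)
        if self.batch:
            self.send(self.batch)
            self.batch = []

    def _close_at_exit(self) -> None:
        # A telemetry failure must never break interpreter shutdown.
        with suppress(Exception):
            self.close()
```
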
@@ -288,12 +318,15 @@ def _generate_event(
         event_name (str): The name of the event.
         event_type (str): The type of the event (e.g., "error", "info").
         parameters_info (dict, optional): Additional parameters for the event. Defaults to None.
+        sc_version (str, optional): The version of the package. Defaults to None.
 
     Returns:
         dict: The generated event.
 
     """
     metadata = _get_metadata()
+    if sc_version is not None:
+        metadata["snowpark_checkpoints_version"] = sc_version
     message = {
         "type": event_type,
         "event_name": event_name,
@@ -324,6 +357,27 @@ def _get_metadata() -> dict:
     }
 
 
+def _get_version() -> str:
+    """Get the version of the package.
+
+    Returns:
+        str: The version of the package.
+
+    """
+    try:
+        directory_levels_up = 4
+        project_root = Path(__file__).resolve().parents[directory_levels_up]
+        version_file_path = project_root / VERSION_FILE_NAME
+        with open(version_file_path) as file:
+            content = file.read()
+        version_match = re.search(VERSION_VARIABLE_PATTERN, content, re.MULTILINE)
+        if version_match:
+            return version_match.group(1)
+        return None
+    except Exception:
+        return None
+
+
 def _get_folder_size(folder_path: Path) -> int:
     """Get the size of a folder. Only considers JSON files.
 
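
`_get_version` stamps events with the package version by scanning a `__version__.py` found a fixed number of directory levels above the module and matching a `__version__ = "..."` assignment. The regex behavior is easy to verify in isolation (the sample file body below is illustrative):

```python
import re

VERSION_VARIABLE_PATTERN = r"^__version__ = ['\"]([^'\"]*)['\"]"

# Illustrative stand-in for the real __version__.py contents.
content = '__version__ = "0.1.0rc3"\n'

match = re.search(VERSION_VARIABLE_PATTERN, content, re.MULTILINE)
print(match.group(1) if match else None)  # -> 0.1.0rc3
```
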
@@ -466,6 +520,7 @@ def check_dataframe_schema_event(
         tuple: A tuple containing the event name and telemetry data.
 
     """
+    telemetry_data[MODE_KEY] = CheckpointMode.SCHEMA.value
     try:
         telemetry_data[STATUS_KEY] = param_data.get(STATUS_KEY)
         pandera_schema = param_data.get(PANDERA_SCHEMA_PARAM_NAME)
@@ -529,8 +584,8 @@ def collect_dataframe_checkpoint_mode_schema_event(
         tuple: A tuple containing the event name and telemetry data.
 
     """
+    telemetry_data[MODE_KEY] = CheckpointMode.SCHEMA.value
     try:
-        telemetry_data[MODE_KEY] = CheckpointMode.SCHEMA.value
         schema_types = param_data.get("column_type_dict")
         telemetry_data[SCHEMA_TYPES_KEY] = [
             schema_types[schema_type].dataType.typeName()
@@ -538,7 +593,6 @@
         ]
         return DATAFRAME_COLLECTION_SCHEMA, telemetry_data
     except Exception:
-        telemetry_data[MODE_KEY] = CheckpointMode.SCHEMA.value
         return DATAFRAME_COLLECTION_ERROR, telemetry_data
 
 
@@ -643,7 +697,7 @@ def dataframe_strategy_event(
             telemetry_m.sc_log_error(HYPOTHESIS_INPUT_SCHEMA_ERROR, telemetry_data)
         else:
             telemetry_m.sc_log_info(HYPOTHESIS_INPUT_SCHEMA, telemetry_data)
-        telemetry_m.
+        telemetry_m.sc_send_batch(telemetry_m.sc_log_batch)
         return None, None
     except Exception:
         test_function_name = inspect.stack()[2].function
@@ -651,7 +705,7 @@
         if not is_logged:
             telemetry_m.sc_hypothesis_input_events.append((test_function_name, 0))
             telemetry_m.sc_log_error(HYPOTHESIS_INPUT_SCHEMA_ERROR, telemetry_data)
-            telemetry_m.
+            telemetry_m.sc_send_batch(telemetry_m.sc_log_batch)
         return None, None
 
 
@@ -786,6 +840,8 @@ def report_telemetry(
                 telemetry_m,
                 return_indexes,
             )
+    except Exception:
+        pass
     finally:
         if func_exception is not None:
             if telemetry_m is not None:
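
A recurring refactor in these hunks hoists `telemetry_data[MODE_KEY] = CheckpointMode.SCHEMA.value` out of the `try`/`except` bodies so the mode tag appears on both the success and the error event without duplicated assignments. Reduced to a sketch (names are illustrative):

```python
def build_schema_event(telemetry_data: dict, param_data: dict) -> tuple:
    # Assigned once, up front: the key must be present in the success
    # payload and in the error payload alike.
    telemetry_data["mode"] = 1  # CheckpointMode.SCHEMA.value in the real code
    try:
        telemetry_data["schema_types"] = list(param_data["column_type_dict"])
        return "DataFrame_Collection_Schema", telemetry_data
    except Exception:
        return "DataFrame_Collection_Error", telemetry_data
```
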
{snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_compare_utils.py
RENAMED
@@ -40,6 +40,12 @@ def validate_telemetry_file_output(
     )
 
     assert diff_telemetry == {}
+    assert isinstance(
+        telemetry_output_obj.get("message")
+        .get("metadata")
+        .get("snowpark_checkpoints_version"),
+        str,
+    )
 
 
 def get_expected(file_name: str) -> str:
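
The helper asserts that a structural diff of expected vs. actual telemetry is empty, then checks only that `snowpark_checkpoints_version` is *some* string, since its value changes every release. A sketch of that comparison style using `deepdiff` (listed among the development dependencies; whether the real helper excludes exactly these paths is an assumption):

```python
from deepdiff import DeepDiff

expected = {"message": {"event_name": "DataFrame_Validator_DF"}, "timestamp": "0"}
actual = {"message": {"event_name": "DataFrame_Validator_DF"}, "timestamp": "1737735324401"}

# Volatile fields such as the timestamp are excluded; everything else must match.
diff = DeepDiff(expected, actual, exclude_paths=["root['timestamp']"])
assert diff == {}
```
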
{snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/df_mode_dataframe_mismatch_telemetry.json
RENAMED
@@ -2,16 +2,17 @@
     "message": {
         "data": "{\"function\": \"_compare_data\", \"mode\": 2, \"status\": false, \"schema_types\": [\"LongType()\", \"LongType()\", \"LongType()\", \"LongType()\", \"DoubleType()\", \"DoubleType()\", \"StringType()\", \"BooleanType()\", \"DateType()\"]}",
         "driver_type": "PythonConnector",
-        "driver_version": "3.12.
+        "driver_version": "3.12.4",
         "event_name": "DataFrame_Validator_DF",
         "metadata": {
             "device_id": "0471186eb1a6ba58c82e97d19006a08b163bc0056c05e6f770090eade2c3a809",
             "os_version": "darwin",
-            "python_version": "3.11.
-            "
+            "python_version": "3.11.11",
+            "snowpark_checkpoints_version": "0.1.0rc2",
+            "snowpark_version": "1.26.0"
         },
         "source": "snowpark-checkpoints",
         "type": "info"
     },
-    "timestamp": "
+    "timestamp": "1737735324401"
 }
{snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/df_mode_dataframe_telemetry.json
RENAMED
@@ -2,16 +2,17 @@
     "message": {
         "data": "{\"function\": \"_compare_data\", \"mode\": 2, \"status\": true, \"schema_types\": [\"LongType()\", \"LongType()\", \"LongType()\", \"LongType()\", \"DoubleType()\", \"DoubleType()\", \"StringType()\", \"BooleanType()\", \"DateType()\"]}",
         "driver_type": "PythonConnector",
-        "driver_version": "3.12.
+        "driver_version": "3.12.4",
         "event_name": "DataFrame_Validator_DF",
         "metadata": {
             "device_id": "0471186eb1a6ba58c82e97d19006a08b163bc0056c05e6f770090eade2c3a809",
             "os_version": "darwin",
-            "python_version": "3.11.
-            "
+            "python_version": "3.11.11",
+            "snowpark_checkpoints_version": "0.1.0rc2",
+            "snowpark_version": "1.26.0"
         },
         "source": "snowpark-checkpoints",
         "type": "info"
     },
-    "timestamp": "
+    "timestamp": "1737735310567"
 }
snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json
ADDED
@@ -0,0 +1,18 @@
+{
+    "message": {
+        "data": "{\"function\": \"_assert_return\", \"status\": false, \"snowflake_schema_types\": [\"LongType()\", \"LongType()\"], \"spark_schema_types\": [\"LongType()\", \"LongType()\"]}",
+        "driver_type": "PythonConnector",
+        "driver_version": "3.12.4",
+        "event_name": "DataFrame_Validator_Mirror",
+        "metadata": {
+            "device_id": "0471186eb1a6ba58c82e97d19006a08b163bc0056c05e6f770090eade2c3a809",
+            "os_version": "darwin",
+            "python_version": "3.11.11",
+            "snowpark_checkpoints_version": "0.1.0rc2",
+            "snowpark_version": "1.26.0"
+        },
+        "source": "snowpark-checkpoints",
+        "type": "info"
+    },
+    "timestamp": "1737735364779"
+}
snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json
ADDED
@@ -0,0 +1,18 @@
+{
+    "message": {
+        "data": "{\"function\": \"_assert_return\", \"status\": true, \"snowflake_schema_types\": [\"LongType()\", \"LongType()\"], \"spark_schema_types\": [\"LongType()\", \"LongType()\"]}",
+        "driver_type": "PythonConnector",
+        "driver_version": "3.12.4",
+        "event_name": "DataFrame_Validator_Mirror",
+        "metadata": {
+            "device_id": "0471186eb1a6ba58c82e97d19006a08b163bc0056c05e6f770090eade2c3a809",
+            "os_version": "darwin",
+            "python_version": "3.11.11",
+            "snowpark_checkpoints_version": "0.1.0rc2",
+            "snowpark_version": "1.26.0"
+        },
+        "source": "snowpark-checkpoints",
+        "type": "info"
+    },
+    "timestamp": "1737735355423"
+}
snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json
ADDED
@@ -0,0 +1,18 @@
+{
+    "message": {
+        "data": "{\"function\": \"_assert_return\", \"status\": true, \"snowflake_schema_types\": [\"LongType()\", \"LongType()\", \"LongType()\", \"LongType()\"], \"spark_schema_types\": [\"LongType()\", \"LongType()\", \"LongType()\", \"LongType()\"]}",
+        "driver_type": "PythonConnector",
+        "driver_version": "3.12.4",
+        "event_name": "DataFrame_Validator_Mirror",
+        "metadata": {
+            "device_id": "0471186eb1a6ba58c82e97d19006a08b163bc0056c05e6f770090eade2c3a809",
+            "os_version": "darwin",
+            "python_version": "3.11.11",
+            "snowpark_checkpoints_version": "0.1.0rc2",
+            "snowpark_version": "1.26.0"
+        },
+        "source": "snowpark-checkpoints",
+        "type": "info"
+    },
+    "timestamp": "1737735377920"
+}