snowpark-checkpoints-validators 0.1.0rc2__tar.gz → 0.1.0rc3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. snowpark_checkpoints_validators-0.1.0rc3/PKG-INFO +313 -0
  2. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/pyproject.toml +6 -4
  3. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/telemetry.py +68 -12
  4. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_compare_utils.py +6 -0
  5. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/df_mode_dataframe_mismatch_telemetry.json +5 -4
  6. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/df_mode_dataframe_telemetry.json +5 -4
  7. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +18 -0
  8. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +18 -0
  9. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +18 -0
  10. snowpark_checkpoints_validators-0.1.0rc3/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +18 -0
  11. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/spark_checkpoint_scalar_fail_telemetry.json +5 -4
  12. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/spark_checkpoint_scalar_passing_telemetry.json +5 -4
  13. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_df_check_custom_check_telemetry.json +6 -5
  14. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_df_check_fail_telemetry.json +6 -5
  15. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_df_check_from_file_telemetry.json +6 -5
  16. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_df_check_skip_check_telemetry.json +6 -5
  17. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_df_check_telemetry.json +6 -5
  18. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_input_fail_telemetry.json +5 -4
  19. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_input_telemetry.json +5 -4
  20. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_output_fail_telemetry.json +5 -4
  21. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/telemetry_expected/test_output_telemetry.json +5 -4
  22. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_telemetry.py +97 -18
  23. snowpark_checkpoints_validators-0.1.0rc2/PKG-INFO +0 -514
  24. snowpark_checkpoints_validators-0.1.0rc2/test/integ/telemetry_expected/spark_checkpoint_df_fail_telemetry.json +0 -17
  25. snowpark_checkpoints_validators-0.1.0rc2/test/integ/telemetry_expected/spark_checkpoint_df_pass_telemetry.json +0 -17
  26. snowpark_checkpoints_validators-0.1.0rc2/test/integ/telemetry_expected/spark_checkpoint_limit_sample_telemetry.json +0 -17
  27. snowpark_checkpoints_validators-0.1.0rc2/test/integ/telemetry_expected/spark_checkpoint_random_sample_telemetry.json +0 -17
  28. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/.gitignore +0 -0
  29. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/CHANGELOG.md +0 -0
  30. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/LICENSE +0 -0
  31. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/README.md +0 -0
  32. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/__init__.py +0 -0
  33. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/checkpoint.py +0 -0
  34. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/errors.py +0 -0
  35. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/job_context.py +0 -0
  36. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/singleton.py +0 -0
  37. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/snowpark_sampler.py +0 -0
  38. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/spark_migration.py +0 -0
  39. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/__init__.py +0 -0
  40. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/checkpoint_logger.py +0 -0
  41. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/constants.py +0 -0
  42. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/extra_config.py +0 -0
  43. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/pandera_check_manager.py +0 -0
  44. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/supported_types.py +0 -0
  45. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/utils/utils_checks.py +0 -0
  46. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/validation_result_metadata.py +0 -0
  47. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/src/snowflake/snowpark_checkpoints/validation_results.py +0 -0
  48. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/.coveragerc +0 -0
  49. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/e2eexample.py +0 -0
  50. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_pandera.py +0 -0
  51. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_parquet.py +0 -0
  52. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/integ/test_spark_checkpoint.py +0 -0
  53. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_extra_config.py +0 -0
  54. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_pandera_check_manager.py +0 -0
  55. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_spark_migration.py +0 -0
  56. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_utils_checks.py +0 -0
  57. {snowpark_checkpoints_validators-0.1.0rc2 → snowpark_checkpoints_validators-0.1.0rc3}/test/unit/test_validation_result_metadata.py +0 -0
@@ -0,0 +1,313 @@
+ Metadata-Version: 2.4
+ Name: snowpark-checkpoints-validators
+ Version: 0.1.0rc3
+ Summary: Migration tools for Snowpark
+ Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
+ Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
+ Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
+ License: Apache License, Version 2.0
+ License-File: LICENSE
+ Keywords: Snowflake,Snowpark,analytics,cloud,database,db
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Environment :: Console
+ Classifier: Environment :: Other Environment
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Education
+ Classifier: Intended Audience :: Information Technology
+ Classifier: Intended Audience :: System Administrators
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Programming Language :: SQL
+ Classifier: Topic :: Database
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
+ Classifier: Topic :: Software Development
+ Classifier: Topic :: Software Development :: Libraries
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: <3.12,>=3.9
+ Requires-Dist: pandera-report==0.1.2
+ Requires-Dist: pandera[io]==0.20.4
+ Requires-Dist: pyspark
+ Requires-Dist: snowflake-connector-python
+ Requires-Dist: snowflake-snowpark-python
+ Provides-Extra: development
+ Requires-Dist: coverage>=7.6.7; extra == 'development'
+ Requires-Dist: deepdiff>=8.0.0; extra == 'development'
+ Requires-Dist: hatchling==1.25.0; extra == 'development'
+ Requires-Dist: pre-commit>=4.0.1; extra == 'development'
+ Requires-Dist: pyarrow>=18.0.0; extra == 'development'
+ Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
+ Requires-Dist: pytest>=8.3.3; extra == 'development'
+ Requires-Dist: setuptools>=70.0.0; extra == 'development'
+ Requires-Dist: twine==5.1.1; extra == 'development'
+ Description-Content-Type: text/markdown
+
+ # snowpark-checkpoints-validators
+
+ ---
+ **NOTE**
+
+ This package is in Private Preview.
+
+ ---
+
+ **snowpark-checkpoints-validators** is a package designed to validate Snowpark DataFrames against predefined schemas and checkpoints. This package ensures data integrity and consistency by performing schema and data validation checks at various stages of a Snowpark pipeline.
+
+ ## Features
+
+ - Validate Snowpark DataFrames against predefined Pandera schemas.
+ - Perform custom checks and skip specific checks as needed.
+ - Generate validation results and log them for further analysis.
+ - Support for sampling strategies to validate large datasets efficiently.
+ - Integration with PySpark for cross-validation between Snowpark and PySpark DataFrames.
+
+ ## Functionalities
+
+ ### Validate DataFrame Schema from File
+
+ The `validate_dataframe_checkpoint` function validates a Snowpark DataFrame against a checkpoint schema file or DataFrame.
+
+ ```python
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from snowflake.snowpark_checkpoints.utils.constants import (
+     CheckpointMode,
+ )
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from typing import Any, Optional
+
+ # Signature of the function
+ def validate_dataframe_checkpoint(
+     df: SnowparkDataFrame,
+     checkpoint_name: str,
+     job_context: Optional[SnowparkJobContext] = None,
+     mode: Optional[CheckpointMode] = CheckpointMode.SCHEMA,
+     custom_checks: Optional[dict[Any, Any]] = None,
+     skip_checks: Optional[dict[Any, Any]] = None,
+     sample_frac: Optional[float] = 1.0,
+     sample_number: Optional[int] = None,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     output_path: Optional[str] = None,
+ ):
+     ...
+ ```
+
+ - `df`: Snowpark DataFrame to validate.
+ - `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+ - `job_context`: Snowpark job context.
+ - `mode`: Checkpoint mode (schema or data).
+ - `custom_checks`: Custom checks to perform.
+ - `skip_checks`: Checks to skip.
+ - `sample_frac`: Fraction of the DataFrame to sample.
+ - `sample_number`: Number of rows to sample.
+ - `sampling_strategy`: Sampling strategy to use.
+ - `output_path`: Output path for the checkpoint report.
+
+ ### Usage Example
+
+ ```python
+ from snowflake.snowpark import Session
+ from snowflake.snowpark_checkpoints.utils.constants import (
+     CheckpointMode,
+ )
+ from snowflake.snowpark_checkpoints.checkpoint import validate_dataframe_checkpoint
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pyspark.sql import SparkSession
+
+ session = Session.builder.getOrCreate()
+ job_context = SnowparkJobContext(
+     session, SparkSession.builder.getOrCreate(), "job_context", True
+ )
+ df = session.read.format("csv").load("data.csv")
+
+ validate_dataframe_checkpoint(
+     df,
+     "schema_checkpoint",
+     job_context=job_context,
+     mode=CheckpointMode.SCHEMA,
+     sample_frac=0.1,
+     sampling_strategy=SamplingStrategy.RANDOM_SAMPLE
+ )
+ ```
+
+ ### Check with Spark Decorator
+
+ The `check_with_spark` decorator takes the Snowpark DataFrame arguments of the decorated function, samples them, and converts them to PySpark DataFrames. It then executes a provided Spark function and compares the outputs of the two implementations.
+
+ ```python
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from typing import Callable, Optional, TypeVar
+
+ fn = TypeVar("fn", bound=Callable)
+
+ # Signature of the decorator
+ def check_with_spark(
+     job_context: Optional[SnowparkJobContext],
+     spark_function: fn,
+     checkpoint_name: str,
+     sample_number: Optional[int] = 100,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     output_path: Optional[str] = None,
+ ) -> Callable[[fn], fn]:
+     ...
+ ```
+
+ - `job_context`: Snowpark job context.
+ - `spark_function`: PySpark function to execute.
+ - `checkpoint_name`: Name of the check.
+ - `sample_number`: Number of rows to sample.
+ - `sampling_strategy`: Sampling strategy to use.
+ - `output_path`: Output path for the checkpoint report.
+
+ ### Usage Example
+
+ ```python
+ from snowflake.snowpark import Session
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.spark_migration import check_with_spark
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pyspark.sql import DataFrame as SparkDataFrame, SparkSession
+
+ session = Session.builder.getOrCreate()
+ job_context = SnowparkJobContext(
+     session, SparkSession.builder.getOrCreate(), "job_context", True
+ )
+
+ def my_spark_scalar_fn(df: SparkDataFrame):
+     return df.count()
+
+ @check_with_spark(
+     job_context=job_context,
+     spark_function=my_spark_scalar_fn,
+     checkpoint_name="count_checkpoint",
+ )
+ def my_snowpark_scalar_fn(df: SnowparkDataFrame):
+     return df.count()
+
+ df = job_context.snowpark_session.create_dataframe(
+     [[1, 2], [3, 4]], schema=["a", "b"]
+ )
+ count = my_snowpark_scalar_fn(df)
+ ```
+
+ ### Pandera Snowpark Decorators
+
+ The decorators `@check_input_schema` and `@check_output_schema` allow for sampled schema validation of Snowpark DataFrames in the input arguments or in the return value.
+
+ ```python
+ from snowflake.snowpark_checkpoints.spark_migration import SamplingStrategy
+ from snowflake.snowpark_checkpoints.job_context import SnowparkJobContext
+ from pandera import DataFrameSchema
+ from typing import Optional
+
+ # Signature of the decorator
+ def check_input_schema(
+     pandera_schema: DataFrameSchema,
+     checkpoint_name: str,
+     sample_frac: Optional[float] = 1.0,
+     sample_number: Optional[int] = None,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     job_context: Optional[SnowparkJobContext] = None,
+     output_path: Optional[str] = None,
+ ):
+     ...
+
+ # Signature of the decorator
+ def check_output_schema(
+     pandera_schema: DataFrameSchema,
+     checkpoint_name: str,
+     sample_frac: Optional[float] = 1.0,
+     sample_number: Optional[int] = None,
+     sampling_strategy: Optional[SamplingStrategy] = SamplingStrategy.RANDOM_SAMPLE,
+     job_context: Optional[SnowparkJobContext] = None,
+     output_path: Optional[str] = None,
+ ):
+     ...
+ ```
+
+ - `pandera_schema`: Pandera schema to validate.
+ - `checkpoint_name`: Name of the checkpoint schema file or DataFrame.
+ - `sample_frac`: Fraction of the DataFrame to sample.
+ - `sample_number`: Number of rows to sample.
+ - `sampling_strategy`: Sampling strategy to use.
+ - `job_context`: Snowpark job context.
+ - `output_path`: Output path for the checkpoint report.
+
+ ### Usage Example
+
+ #### Check Input Schema Example
+ ```python
+ from pandas import DataFrame as PandasDataFrame
+ from pandera import DataFrameSchema, Column, Check
+ from snowflake.snowpark import Session
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.checkpoint import check_input_schema
+ from numpy import int8
+
+ df = PandasDataFrame(
+     {
+         "COLUMN1": [1, 4, 0, 10, 9],
+         "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+     }
+ )
+
+ in_schema = DataFrameSchema(
+     {
+         "COLUMN1": Column(int8, Check(lambda x: 0 <= x <= 10, element_wise=True)),
+         "COLUMN2": Column(float, Check(lambda x: x < -1.2, element_wise=True)),
+     }
+ )
+
+ @check_input_schema(in_schema, "input_schema_checkpoint")
+ def preprocessor(dataframe: SnowparkDataFrame):
+     dataframe = dataframe.withColumn(
+         "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+     )
+     return dataframe
+
+ session = Session.builder.getOrCreate()
+ sp_dataframe = session.create_dataframe(df)
+
+ preprocessed_dataframe = preprocessor(sp_dataframe)
+ ```
+
+ #### Check Output Schema Example
+ ```python
+ from pandas import DataFrame as PandasDataFrame
+ from pandera import DataFrameSchema, Column, Check
+ from snowflake.snowpark import Session
+ from snowflake.snowpark import DataFrame as SnowparkDataFrame
+ from snowflake.snowpark_checkpoints.checkpoint import check_output_schema
+ from numpy import int8
+
+ df = PandasDataFrame(
+     {
+         "COLUMN1": [1, 4, 0, 10, 9],
+         "COLUMN2": [-1.3, -1.4, -2.9, -10.1, -20.4],
+     }
+ )
+
+ out_schema = DataFrameSchema(
+     {
+         "COLUMN1": Column(int8, Check.between(0, 10, include_max=True, include_min=True)),
+         "COLUMN2": Column(float, Check.less_than_or_equal_to(-1.2)),
+         "COLUMN3": Column(float, Check.less_than(10)),
+     }
+ )
+
+ @check_output_schema(out_schema, "output_schema_checkpoint")
+ def preprocessor(dataframe: SnowparkDataFrame):
+     return dataframe.with_column(
+         "COLUMN3", dataframe["COLUMN1"] + dataframe["COLUMN2"]
+     )
+
+ session = Session.builder.getOrCreate()
+ sp_dataframe = session.create_dataframe(df)
+
+ preprocessed_dataframe = preprocessor(sp_dataframe)
+ ```
+
+ ------
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
  requires = ["hatchling"]
 
  [project]
- authors = [{name = "Snowflake Inc."}]
+ authors = [
+   {name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
+ ]
  classifiers = [
    "Development Status :: 4 - Beta",
    "Environment :: Console",
@@ -31,6 +33,7 @@ dependencies = [
    "pandera-report==0.1.2",
  ]
  description = "Migration tools for Snowpark"
+ dynamic = ['version']
  keywords = [
    'Snowflake',
    'analytics',
@@ -39,11 +42,10 @@ keywords = [
    'db',
    'Snowpark',
  ]
- license = {file = "LICENSE"}
+ license = {text = "Apache License, Version 2.0"}
  name = "snowpark-checkpoints-validators"
  readme = "README.md"
  requires-python = '>=3.9,<3.12'
- dynamic = ['version']
 
  [project.optional-dependencies]
  development = [
@@ -118,7 +120,7 @@ check = [
 
  [tool.hatch.envs.test.scripts]
  check = [
-   "pip install -e ../snowpark-checkpoints-configuration" ,
+   "pip install -e ../snowpark-checkpoints-configuration",
    'pytest -v --junitxml=test/outcome/test-results.xml --cov=. --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
  ]
 
@@ -2,15 +2,17 @@
  # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
  #
 
+ import atexit
  import datetime
  import hashlib
  import inspect
  import json
  import os
+ import re
 
+ from contextlib import suppress
  from enum import IntEnum
  from functools import wraps
- from os import getcwd, getenv, makedirs
  from pathlib import Path
  from platform import python_version
  from sys import platform
@@ -47,6 +49,10 @@ except Exception:
      pass
 
 
+ VERSION_VARIABLE_PATTERN = r"^__version__ = ['\"]([^'\"]*)['\"]"
+ VERSION_FILE_NAME = "__version__.py"
+
+
  class TelemetryManager(TelemetryClient):
      def __init__(self, rest: SnowflakeRestful):
          """TelemetryManager class to log telemetry events."""
@@ -62,6 +68,8 @@ class TelemetryManager(TelemetryClient):
          self._sc_upload_local_telemetry()
          self.sc_log_batch = []
          self.sc_hypothesis_input_events = []
+         self.sc_version = _get_version()
+         atexit.register(self._sc_close_at_exit)
 
      def set_sc_output_path(self, path: Path) -> None:
          """Set the output path for testing.
@@ -115,7 +123,9 @@
          """
          if not self.sc_is_enabled:
              return {}
-         event = _generate_event(event_name, event_type, parameters_info)
+         event = _generate_event(
+             event_name, event_type, parameters_info, self.sc_version
+         )
          self._sc_add_log_to_batch(event)
          return event
 
@@ -133,10 +143,10 @@
              return
 
          if len(self.sc_log_batch) >= self.sc_flush_size:
-             self._sc_send_batch(self.sc_log_batch)
+             self.sc_send_batch(self.sc_log_batch)
              self.sc_log_batch = []
 
-     def _sc_send_batch(self, to_sent: list) -> bool:
+     def sc_send_batch(self, to_sent: list) -> bool:
          """Send a request to the API to upload the events. If there is no connection, write the events to a local folder.
 
          Args:
@@ -176,7 +186,7 @@
 
          """
          try:
-             makedirs(self.sc_folder_path, exist_ok=True)
+             os.makedirs(self.sc_folder_path, exist_ok=True)
              for event in batch:
                  message = event.get("message")
                  if message is not None:
@@ -241,15 +251,15 @@
          """
          if self._sc_is_telemetry_testing():
              return True
-         if getenv("SNOWPARK_CHECKPOINTS_TELEMETRY_ENABLED") == "false":
+         if os.getenv("SNOWPARK_CHECKPOINTS_TELEMETRY_ENABLED") == "false":
              return False
          return self._rest is not None
 
      def _sc_is_telemetry_testing(self) -> bool:
-         is_testing = getenv("SNOWPARK_CHECKPOINTS_TELEMETRY_TESTING") == "true"
+         is_testing = os.getenv("SNOWPARK_CHECKPOINTS_TELEMETRY_TESTING") == "true"
          if is_testing:
              local_telemetry_path = (
-                 Path(getcwd()) / "snowpark-checkpoints-output" / "telemetry"
+                 Path(os.getcwd()) / "snowpark-checkpoints-output" / "telemetry"
              )
              self.set_sc_output_path(local_telemetry_path)
          return is_testing
@@ -276,11 +286,31 @@
          """
          return event_name in self.sc_hypothesis_input_events
 
+     def _sc_close(self) -> None:
+         """Close the telemetry manager and upload collected events.
+
+         This function closes the telemetry manager, uploads any collected events,
+         and performs any necessary cleanup to ensure no data is lost.
+         """
+         atexit.unregister(self._sc_close_at_exit)
+         if self.sc_log_batch and self.sc_is_enabled and not self.sc_is_testing:
+             self.sc_send_batch(self.sc_log_batch)
+
+     def _sc_close_at_exit(self) -> None:
+         """Close the telemetry manager at exit and upload collected events.
+
+         This function ensures that the telemetry manager is closed and all collected events
+         are uploaded when the program exits, preventing data loss.
+         """
+         with suppress(Exception):
+             self._sc_close()
+
 
  def _generate_event(
      event_name: str,
      event_type: str,
      parameters_info: Optional[dict] = None,
+     sc_version: Optional[str] = None,
  ) -> dict:
      """Generate a telemetry event.
@@ -288,12 +318,15 @@ def _generate_event(
          event_name (str): The name of the event.
          event_type (str): The type of the event (e.g., "error", "info").
          parameters_info (dict, optional): Additional parameters for the event. Defaults to None.
+         sc_version (str, optional): The version of the package. Defaults to None.
 
      Returns:
          dict: The generated event.
 
      """
      metadata = _get_metadata()
+     if sc_version is not None:
+         metadata["snowpark_checkpoints_version"] = sc_version
      message = {
          "type": event_type,
          "event_name": event_name,
@@ -324,6 +357,27 @@ def _get_metadata() -> dict:
      }
 
 
+ def _get_version() -> str:
+     """Get the version of the package.
+
+     Returns:
+         str: The version of the package.
+
+     """
+     try:
+         directory_levels_up = 4
+         project_root = Path(__file__).resolve().parents[directory_levels_up]
+         version_file_path = project_root / VERSION_FILE_NAME
+         with open(version_file_path) as file:
+             content = file.read()
+         version_match = re.search(VERSION_VARIABLE_PATTERN, content, re.MULTILINE)
+         if version_match:
+             return version_match.group(1)
+         return None
+     except Exception:
+         return None
+
+
  def _get_folder_size(folder_path: Path) -> int:
      """Get the size of a folder. Only considers JSON files.
 
@@ -466,6 +520,7 @@ def check_dataframe_schema_event(
          tuple: A tuple containing the event name and telemetry data.
 
      """
+     telemetry_data[MODE_KEY] = CheckpointMode.SCHEMA.value
      try:
          telemetry_data[STATUS_KEY] = param_data.get(STATUS_KEY)
          pandera_schema = param_data.get(PANDERA_SCHEMA_PARAM_NAME)
@@ -529,8 +584,8 @@ def collect_dataframe_checkpoint_mode_schema_event(
          tuple: A tuple containing the event name and telemetry data.
 
      """
+     telemetry_data[MODE_KEY] = CheckpointMode.SCHEMA.value
      try:
-         telemetry_data[MODE_KEY] = CheckpointMode.SCHEMA.value
          schema_types = param_data.get("column_type_dict")
          telemetry_data[SCHEMA_TYPES_KEY] = [
              schema_types[schema_type].dataType.typeName()
@@ -538,7 +593,6 @@
          ]
          return DATAFRAME_COLLECTION_SCHEMA, telemetry_data
      except Exception:
-         telemetry_data[MODE_KEY] = CheckpointMode.SCHEMA.value
          return DATAFRAME_COLLECTION_ERROR, telemetry_data
 
 
@@ -643,7 +697,7 @@
              telemetry_m.sc_log_error(HYPOTHESIS_INPUT_SCHEMA_ERROR, telemetry_data)
          else:
              telemetry_m.sc_log_info(HYPOTHESIS_INPUT_SCHEMA, telemetry_data)
-         telemetry_m._sc_send_batch(telemetry_m.sc_log_batch)
+         telemetry_m.sc_send_batch(telemetry_m.sc_log_batch)
          return None, None
      except Exception:
          test_function_name = inspect.stack()[2].function
@@ -651,7 +705,7 @@
          if not is_logged:
              telemetry_m.sc_hypothesis_input_events.append((test_function_name, 0))
          telemetry_m.sc_log_error(HYPOTHESIS_INPUT_SCHEMA_ERROR, telemetry_data)
-         telemetry_m._sc_send_batch(telemetry_m.sc_log_batch)
+         telemetry_m.sc_send_batch(telemetry_m.sc_log_batch)
          return None, None
 
 
@@ -786,6 +840,8 @@
                  telemetry_m,
                  return_indexes,
              )
+         except Exception:
+             pass
          finally:
              if func_exception is not None:
                  if telemetry_m is not None:
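Two patterns in the telemetry.py hunks above are worth calling out for reviewers: the package version is discovered at startup by scanning a `__version__.py` file with a regex, and pending events are flushed through an `atexit` hook so nothing is lost at interpreter shutdown. Below is a minimal standalone sketch of those two patterns; the `BatchLogger` class and its `send()` method are illustrative stand-ins, not part of the package (the real code uploads through the connector's REST client).

```python
import atexit
import re
from contextlib import suppress
from typing import Optional

# Same regex shape the diff introduces as VERSION_VARIABLE_PATTERN.
VERSION_PATTERN = r"^__version__ = ['\"]([^'\"]*)['\"]"


def read_version(content: str) -> Optional[str]:
    """Extract a version string from a __version__.py-style file body."""
    match = re.search(VERSION_PATTERN, content, re.MULTILINE)
    return match.group(1) if match else None


class BatchLogger:
    """Illustrative stand-in for the batching behavior in TelemetryManager."""

    def __init__(self) -> None:
        self.batch = []
        # Flush pending events when the interpreter exits.
        atexit.register(self._close_at_exit)

    def log(self, event: dict) -> None:
        self.batch.append(event)

    def close(self) -> None:
        # Unregister first so a manual close does not trigger a second flush.
        atexit.unregister(self._close_at_exit)
        if self.batch:
            self.send(self.batch)
            self.batch = []

    def _close_at_exit(self) -> None:
        # Telemetry must never turn interpreter shutdown into a crash.
        with suppress(Exception):
            self.close()

    def send(self, batch: list) -> None:
        # Hypothetical upload; stands in for sc_send_batch in the diff.
        print(f"uploading {len(batch)} events")


assert read_version('__version__ = "0.1.0rc3"') == "0.1.0rc3"
```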
@@ -40,6 +40,12 @@
      )
 
      assert diff_telemetry == {}
+     assert isinstance(
+         telemetry_output_obj.get("message")
+         .get("metadata")
+         .get("snowpark_checkpoints_version"),
+         str,
+     )
 
 
  def get_expected(file_name: str) -> str:
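The new assertion above only checks that the version field exists and is a string; the expected-output JSON files that follow are matched against live telemetry with a dictionary diff (the development extras pin `deepdiff>=8.0.0`, and the helper asserts `diff_telemetry == {}`). A sketch of such a comparison, with hypothetical exclude paths for the fields that legitimately vary per run or machine:

```python
import json

from deepdiff import DeepDiff  # pinned in the package's development extras


def compare_telemetry(actual_path: str, expected_path: str) -> dict:
    """Return an empty dict when the telemetry event matches the expected file."""
    with open(actual_path) as actual_file:
        actual = json.load(actual_file)
    with open(expected_path) as expected_file:
        expected = json.load(expected_file)
    # Hypothetical exclusions: timestamps and environment details differ per run.
    volatile = [
        "root['timestamp']",
        "root['message']['driver_version']",
        "root['message']['metadata']",
    ]
    return DeepDiff(expected, actual, exclude_paths=volatile).to_dict()
```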
@@ -2,16 +2,17 @@
    "message": {
      "data": "{\"function\": \"_compare_data\", \"mode\": 2, \"status\": false, \"schema_types\": [\"LongType()\", \"LongType()\", \"LongType()\", \"LongType()\", \"DoubleType()\", \"DoubleType()\", \"StringType()\", \"BooleanType()\", \"DateType()\"]}",
      "driver_type": "PythonConnector",
-     "driver_version": "3.12.3",
+     "driver_version": "3.12.4",
      "event_name": "DataFrame_Validator_DF",
      "metadata": {
        "device_id": "0471186eb1a6ba58c82e97d19006a08b163bc0056c05e6f770090eade2c3a809",
        "os_version": "darwin",
-       "python_version": "3.11.2",
-       "snowpark_version": "1.25.0"
+       "python_version": "3.11.11",
+       "snowpark_checkpoints_version": "0.1.0rc2",
+       "snowpark_version": "1.26.0"
      },
      "source": "snowpark-checkpoints",
      "type": "info"
    },
-   "timestamp": "1737474924360"
+   "timestamp": "1737735324401"
  }
@@ -2,16 +2,17 @@
    "message": {
      "data": "{\"function\": \"_compare_data\", \"mode\": 2, \"status\": true, \"schema_types\": [\"LongType()\", \"LongType()\", \"LongType()\", \"LongType()\", \"DoubleType()\", \"DoubleType()\", \"StringType()\", \"BooleanType()\", \"DateType()\"]}",
      "driver_type": "PythonConnector",
-     "driver_version": "3.12.3",
+     "driver_version": "3.12.4",
      "event_name": "DataFrame_Validator_DF",
      "metadata": {
        "device_id": "0471186eb1a6ba58c82e97d19006a08b163bc0056c05e6f770090eade2c3a809",
        "os_version": "darwin",
-       "python_version": "3.11.2",
-       "snowpark_version": "1.25.0"
+       "python_version": "3.11.11",
+       "snowpark_checkpoints_version": "0.1.0rc2",
+       "snowpark_version": "1.26.0"
      },
      "source": "snowpark-checkpoints",
      "type": "info"
    },
-   "timestamp": "1737474910439"
+   "timestamp": "1737735310567"
  }
@@ -0,0 +1,18 @@
+ {
+   "message": {
+     "data": "{\"function\": \"_assert_return\", \"status\": false, \"snowflake_schema_types\": [\"LongType()\", \"LongType()\"], \"spark_schema_types\": [\"LongType()\", \"LongType()\"]}",
+     "driver_type": "PythonConnector",
+     "driver_version": "3.12.4",
+     "event_name": "DataFrame_Validator_Mirror",
+     "metadata": {
+       "device_id": "0471186eb1a6ba58c82e97d19006a08b163bc0056c05e6f770090eade2c3a809",
+       "os_version": "darwin",
+       "python_version": "3.11.11",
+       "snowpark_checkpoints_version": "0.1.0rc2",
+       "snowpark_version": "1.26.0"
+     },
+     "source": "snowpark-checkpoints",
+     "type": "info"
+   },
+   "timestamp": "1737735364779"
+ }
@@ -0,0 +1,18 @@
+ {
+   "message": {
+     "data": "{\"function\": \"_assert_return\", \"status\": true, \"snowflake_schema_types\": [\"LongType()\", \"LongType()\"], \"spark_schema_types\": [\"LongType()\", \"LongType()\"]}",
+     "driver_type": "PythonConnector",
+     "driver_version": "3.12.4",
+     "event_name": "DataFrame_Validator_Mirror",
+     "metadata": {
+       "device_id": "0471186eb1a6ba58c82e97d19006a08b163bc0056c05e6f770090eade2c3a809",
+       "os_version": "darwin",
+       "python_version": "3.11.11",
+       "snowpark_checkpoints_version": "0.1.0rc2",
+       "snowpark_version": "1.26.0"
+     },
+     "source": "snowpark-checkpoints",
+     "type": "info"
+   },
+   "timestamp": "1737735355423"
+ }
@@ -0,0 +1,18 @@
+ {
+   "message": {
+     "data": "{\"function\": \"_assert_return\", \"status\": true, \"snowflake_schema_types\": [\"LongType()\", \"LongType()\", \"LongType()\", \"LongType()\"], \"spark_schema_types\": [\"LongType()\", \"LongType()\", \"LongType()\", \"LongType()\"]}",
+     "driver_type": "PythonConnector",
+     "driver_version": "3.12.4",
+     "event_name": "DataFrame_Validator_Mirror",
+     "metadata": {
+       "device_id": "0471186eb1a6ba58c82e97d19006a08b163bc0056c05e6f770090eade2c3a809",
+       "os_version": "darwin",
+       "python_version": "3.11.11",
+       "snowpark_checkpoints_version": "0.1.0rc2",
+       "snowpark_version": "1.26.0"
+     },
+     "source": "snowpark-checkpoints",
+     "type": "info"
+   },
+   "timestamp": "1737735377920"
+ }