snowpark-checkpoints-collectors 0.2.0rc1__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/.gitignore +4 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/LICENSE +0 -25
- snowpark_checkpoints_collectors-0.2.1/PKG-INFO +158 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/README.md +13 -3
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/pyproject.toml +25 -11
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/__init__.py +30 -0
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/__version__.py +16 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +19 -3
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +24 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +14 -3
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +19 -3
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +22 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +55 -21
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +14 -3
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +36 -19
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +31 -14
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +71 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +21 -10
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +74 -0
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +67 -0
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +92 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +23 -10
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +39 -20
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +49 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +39 -15
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +70 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +24 -11
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +75 -0
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +75 -0
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +20 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +88 -36
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/singleton.py +23 -0
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +20 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +52 -12
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +154 -83
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +53 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +23 -5
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +14 -3
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/utils/logging_utils.py +67 -0
- snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +889 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/.coveragerc +1 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/telemetry_compare_utils.py +69 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_checkpoint_name.py +74 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1.py +123 -108
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +1 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +18 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +1 -1
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +1 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +18 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +1 -1
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +1 -1
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +9 -8
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +18 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_2.py +96 -21
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_collect_invalid_mode_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_generate_parquet_for_spark_df_telemetry.json +18 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_spark_df_mode_dataframe_telemetry.json +18 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collection_result_file.py +39 -13
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_snow_connection_int.py +14 -3
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_checkpoint_name_utils.py +4 -2
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_collection_point_result_manager.py +31 -6
- snowpark_checkpoints_collectors-0.2.1/test/unit/test_column_collection.py +477 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_extra_config.py +14 -3
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_file_utils.py +14 -3
- snowpark_checkpoints_collectors-0.2.1/test/unit/test_logger.py +132 -0
- snowpark_checkpoints_collectors-0.2.1/test/unit/test_logging_utils.py +132 -0
- snowpark_checkpoints_collectors-0.2.1/test/unit/test_pandera_column_check_manager.py +194 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_snow_connection.py +15 -4
- snowpark_checkpoints_collectors-0.2.1/test/unit/test_summary_stats_collector.py +70 -0
- snowpark_checkpoints_collectors-0.2.0rc1/PKG-INFO +0 -347
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/__init__.py +0 -11
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -13
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -11
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -53
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -56
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -49
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -69
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -38
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -35
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +0 -54
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +0 -54
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -9
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -12
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -9
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -49
- snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -1
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_checkpoint_name.py +0 -51
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -1
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -1
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -17
- snowpark_checkpoints_collectors-0.2.0rc1/test/unit/test_column_collection.py +0 -669
- snowpark_checkpoints_collectors-0.2.0rc1/test/unit/test_summary_stats_collector.py +0 -29
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/snowpark-testdf-schema.json +0 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
- {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_collection_point_result.py +0 -0
@@ -175,28 +175,3 @@
|
|
175
175
|
of your accepting any such warranty or additional liability.
|
176
176
|
|
177
177
|
END OF TERMS AND CONDITIONS
|
178
|
-
|
179
|
-
APPENDIX: How to apply the Apache License to your work.
|
180
|
-
|
181
|
-
To apply the Apache License to your work, attach the following
|
182
|
-
boilerplate notice, with the fields enclosed by brackets "[]"
|
183
|
-
replaced with your own identifying information. (Don't include
|
184
|
-
the brackets!) The text should be enclosed in the appropriate
|
185
|
-
comment syntax for the file format. We also recommend that a
|
186
|
-
file or class name and description of purpose be included on the
|
187
|
-
same "printed page" as the copyright notice for easier
|
188
|
-
identification within third-party archives.
|
189
|
-
|
190
|
-
Copyright 2025 Snowflake
|
191
|
-
|
192
|
-
Licensed under the Apache License, Version 2.0 (the "License");
|
193
|
-
you may not use this file except in compliance with the License.
|
194
|
-
You may obtain a copy of the License at
|
195
|
-
|
196
|
-
http://www.apache.org/licenses/LICENSE-2.0
|
197
|
-
|
198
|
-
Unless required by applicable law or agreed to in writing, software
|
199
|
-
distributed under the License is distributed on an "AS IS" BASIS,
|
200
|
-
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
201
|
-
See the License for the specific language governing permissions and
|
202
|
-
limitations under the License.
|
@@ -0,0 +1,158 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: snowpark-checkpoints-collectors
|
3
|
+
Version: 0.2.1
|
4
|
+
Summary: Snowpark column and table statistics collection
|
5
|
+
Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
|
6
|
+
Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
|
7
|
+
Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
|
8
|
+
License: Apache License, Version 2.0
|
9
|
+
License-File: LICENSE
|
10
|
+
Keywords: Snowflake,Snowpark,analytics,cloud,database,db
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
12
|
+
Classifier: Environment :: Console
|
13
|
+
Classifier: Environment :: Other Environment
|
14
|
+
Classifier: Intended Audience :: Developers
|
15
|
+
Classifier: Intended Audience :: Education
|
16
|
+
Classifier: Intended Audience :: Information Technology
|
17
|
+
Classifier: Intended Audience :: System Administrators
|
18
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
19
|
+
Classifier: Operating System :: OS Independent
|
20
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
21
|
+
Classifier: Programming Language :: SQL
|
22
|
+
Classifier: Topic :: Database
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
24
|
+
Classifier: Topic :: Software Development
|
25
|
+
Classifier: Topic :: Software Development :: Libraries
|
26
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
28
|
+
Requires-Python: <3.12,>=3.9
|
29
|
+
Requires-Dist: pandera[io]==0.20.4
|
30
|
+
Requires-Dist: snowflake-connector-python
|
31
|
+
Requires-Dist: snowflake-snowpark-python>=1.23.0
|
32
|
+
Provides-Extra: development
|
33
|
+
Requires-Dist: coverage>=7.6.7; extra == 'development'
|
34
|
+
Requires-Dist: deepdiff>=8.0.0; extra == 'development'
|
35
|
+
Requires-Dist: hatchling==1.25.0; extra == 'development'
|
36
|
+
Requires-Dist: pre-commit>=4.0.1; extra == 'development'
|
37
|
+
Requires-Dist: pyarrow>=18.0.0; extra == 'development'
|
38
|
+
Requires-Dist: pyspark>=3.5.0; extra == 'development'
|
39
|
+
Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
|
40
|
+
Requires-Dist: pytest>=8.3.3; extra == 'development'
|
41
|
+
Requires-Dist: setuptools>=70.0.0; extra == 'development'
|
42
|
+
Requires-Dist: twine==5.1.1; extra == 'development'
|
43
|
+
Provides-Extra: pyspark
|
44
|
+
Requires-Dist: pyspark>=3.5.0; extra == 'pyspark'
|
45
|
+
Description-Content-Type: text/markdown
|
46
|
+
|
47
|
+
# snowpark-checkpoints-collectors
|
48
|
+
|
49
|
+
|
50
|
+
---
|
51
|
+
##### This package is in Public Preview.
|
52
|
+
---
|
53
|
+
|
54
|
+
**snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
|
55
|
+
|
56
|
+
---
|
57
|
+
## Install the library
|
58
|
+
```bash
|
59
|
+
pip install snowpark-checkpoints-collectors
|
60
|
+
```
|
61
|
+
This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
|
62
|
+
```bash
|
63
|
+
pip install "snowpark-checkpoints-collectors[pyspark]"
|
64
|
+
```
|
65
|
+
---
|
66
|
+
|
67
|
+
## Features
|
68
|
+
|
69
|
+
- Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
|
70
|
+
- DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe. In this case, the mechanism saves all data of the given dataframe in parquet format. Using the default user Snowflake connection, it tries to upload the parquet files into a Snowflake temporary stage and create a table based on the information in the stage. The name of the file and the table is the same as the checkpoint.
|
71
|
+
|
72
|
+
|
73
|
+
|
74
|
+
## Functionalities
|
75
|
+
|
76
|
+
### Collect DataFrame Checkpoint
|
77
|
+
|
78
|
+
|
79
|
+
|
80
|
+
```python
|
81
|
+
from pyspark.sql import DataFrame as SparkDataFrame
|
82
|
+
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
83
|
+
from typing import Optional
|
84
|
+
|
85
|
+
# Signature of the function
|
86
|
+
def collect_dataframe_checkpoint(
|
87
|
+
df: SparkDataFrame,
|
88
|
+
checkpoint_name: str,
|
89
|
+
sample: Optional[float] = None,
|
90
|
+
mode: Optional[CheckpointMode] = None,
|
91
|
+
output_path: Optional[str] = None,
|
92
|
+
) -> None:
|
93
|
+
...
|
94
|
+
```
|
95
|
+
|
96
|
+
- `df`: The input Spark dataframe to collect.
|
97
|
+
- `checkpoint_name`: Name of the checkpoint schema file or dataframe.
|
98
|
+
- `sample`: Fraction of DataFrame to sample for schema inference, defaults to 1.0.
|
99
|
+
- `mode`: The mode in which to execute the collection (Schema or DataFrame), defaults to `CheckpointMode.SCHEMA`.
|
100
|
+
- `output_path`: The output path to save the checkpoint, defaults to current working directory.
|
101
|
+
|
102
|
+
|
103
|
+
## Usage Example
|
104
|
+
|
105
|
+
### Schema mode
|
106
|
+
|
107
|
+
```python
|
108
|
+
from pyspark.sql import SparkSession
|
109
|
+
from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
|
110
|
+
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
111
|
+
|
112
|
+
spark_session = SparkSession.builder.getOrCreate()
|
113
|
+
sample_size = 1.0
|
114
|
+
|
115
|
+
pyspark_df = spark_session.createDataFrame(
|
116
|
+
[("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
|
117
|
+
)
|
118
|
+
|
119
|
+
collect_dataframe_checkpoint(
|
120
|
+
pyspark_df,
|
121
|
+
checkpoint_name="collect_checkpoint_mode_1",
|
122
|
+
sample=sample_size,
|
123
|
+
mode=CheckpointMode.SCHEMA,
|
124
|
+
)
|
125
|
+
```
|
126
|
+
|
127
|
+
|
128
|
+
### Dataframe mode
|
129
|
+
|
130
|
+
```python
|
131
|
+
from pyspark.sql import SparkSession
|
132
|
+
from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
|
133
|
+
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
134
|
+
from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
|
135
|
+
|
136
|
+
spark_schema = StructType(
|
137
|
+
[
|
138
|
+
StructField("BYTE", ByteType(), True),
|
139
|
+
StructField("STRING", StringType(), True),
|
140
|
+
StructField("INTEGER", IntegerType(), True)
|
141
|
+
]
|
142
|
+
)
|
143
|
+
|
144
|
+
data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
|
145
|
+
|
146
|
+
spark_session = SparkSession.builder.getOrCreate()
|
147
|
+
pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
|
148
|
+
"INTEGER"
|
149
|
+
)
|
150
|
+
|
151
|
+
collect_dataframe_checkpoint(
|
152
|
+
pyspark_df,
|
153
|
+
checkpoint_name="collect_checkpoint_mode_2",
|
154
|
+
mode=CheckpointMode.DATAFRAME,
|
155
|
+
)
|
156
|
+
```
|
157
|
+
|
158
|
+
------
|
{snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/README.md
RENAMED
@@ -1,13 +1,23 @@
|
|
1
1
|
# snowpark-checkpoints-collectors
|
2
2
|
|
3
|
+
|
4
|
+
---
|
5
|
+
##### This package is in Public Preview.
|
3
6
|
---
|
4
|
-
**NOTE**
|
5
7
|
|
6
|
-
|
8
|
+
**snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
|
7
9
|
|
10
|
+
---
|
11
|
+
## Install the library
|
12
|
+
```bash
|
13
|
+
pip install snowpark-checkpoints-collectors
|
14
|
+
```
|
15
|
+
This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
|
16
|
+
```bash
|
17
|
+
pip install "snowpark-checkpoints-collectors[pyspark]"
|
18
|
+
```
|
8
19
|
---
|
9
20
|
|
10
|
-
**snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
|
11
21
|
## Features
|
12
22
|
|
13
23
|
- Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
|
{snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/pyproject.toml
RENAMED
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
|
|
3
3
|
requires = ["hatchling"]
|
4
4
|
|
5
5
|
[project]
|
6
|
-
authors = [
|
6
|
+
authors = [
|
7
|
+
{name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
|
8
|
+
]
|
7
9
|
classifiers = [
|
8
10
|
"Development Status :: 4 - Beta",
|
9
11
|
"Environment :: Console",
|
@@ -24,12 +26,12 @@ classifiers = [
|
|
24
26
|
"Topic :: Scientific/Engineering :: Information Analysis",
|
25
27
|
]
|
26
28
|
dependencies = [
|
27
|
-
"snowflake-snowpark-python",
|
29
|
+
"snowflake-snowpark-python>=1.23.0",
|
28
30
|
"snowflake-connector-python",
|
29
|
-
"pyspark",
|
30
31
|
"pandera[io]==0.20.4",
|
31
32
|
]
|
32
33
|
description = "Snowpark column and table statistics collection"
|
34
|
+
dynamic = ['version']
|
33
35
|
keywords = [
|
34
36
|
'Snowflake',
|
35
37
|
'analytics',
|
@@ -38,13 +40,15 @@ keywords = [
|
|
38
40
|
'db',
|
39
41
|
'Snowpark',
|
40
42
|
]
|
41
|
-
license = {
|
43
|
+
license = {text = "Apache License, Version 2.0"}
|
42
44
|
name = "snowpark-checkpoints-collectors"
|
43
45
|
readme = "README.md"
|
44
46
|
requires-python = '>=3.9,<3.12'
|
45
|
-
dynamic = ['version']
|
46
47
|
|
47
48
|
[project.optional-dependencies]
|
49
|
+
pyspark = [
|
50
|
+
"pyspark>=3.5.0",
|
51
|
+
]
|
48
52
|
development = [
|
49
53
|
"pytest>=8.3.3",
|
50
54
|
"pytest-cov>=6.0.0",
|
@@ -55,14 +59,16 @@ development = [
|
|
55
59
|
"setuptools>=70.0.0",
|
56
60
|
"pyarrow>=18.0.0",
|
57
61
|
"deepdiff>=8.0.0",
|
62
|
+
"pyspark>=3.5.0",
|
58
63
|
]
|
59
64
|
|
60
65
|
[project.urls]
|
61
66
|
"Bug Tracker" = "https://github.com/snowflakedb/snowpark-checkpoints/issues"
|
62
67
|
"Source code" = "https://github.com/snowflakedb/snowpark-checkpoints/"
|
63
68
|
|
69
|
+
|
64
70
|
[tool.hatch.version]
|
65
|
-
path = "__version__.py"
|
71
|
+
path = "src/snowflake/snowpark_checkpoints_collector/__version__.py"
|
66
72
|
pattern = '^__version__ = "(?P<version>.*)"'
|
67
73
|
source = "regex"
|
68
74
|
|
@@ -74,15 +80,22 @@ where = ["src/"]
|
|
74
80
|
dev-mode-dirs = ['src']
|
75
81
|
directory = 'snowpark-checkpoints-collectors'
|
76
82
|
|
83
|
+
[[tool.hatch.sources]]
|
84
|
+
dir = "src/snowflake/snowpark_checkpoints_collector"
|
85
|
+
name = "snowpark-checkpoints-collectors"
|
86
|
+
type = "package"
|
87
|
+
|
77
88
|
[tool.hatch.build.targets.wheel]
|
78
89
|
directory = "dist"
|
79
|
-
packages = [
|
90
|
+
packages = [
|
91
|
+
"src/snowflake",
|
92
|
+
]
|
80
93
|
|
81
94
|
[tool.hatch.build.targets.sdist]
|
82
95
|
directory = "dist"
|
83
96
|
exclude = ["/.github", "/.idea"]
|
84
97
|
include = [
|
85
|
-
'src
|
98
|
+
'src/**',
|
86
99
|
'README.md',
|
87
100
|
'LICENSE',
|
88
101
|
'test/',
|
@@ -113,7 +126,6 @@ exclude_lines = [
|
|
113
126
|
"if __name__ == .__main__.:",
|
114
127
|
]
|
115
128
|
|
116
|
-
|
117
129
|
[tool.hatch.envs.linter.scripts]
|
118
130
|
check = [
|
119
131
|
'ruff check --fix .',
|
@@ -121,8 +133,10 @@ check = [
|
|
121
133
|
|
122
134
|
[tool.hatch.envs.test.scripts]
|
123
135
|
check = [
|
124
|
-
|
125
|
-
'
|
136
|
+
'python -m pip install --upgrade pip -q',
|
137
|
+
'pip install -q -e ../snowpark-checkpoints-configuration',
|
138
|
+
'pip list',
|
139
|
+
'pytest -vvv --junitxml=test/outcome/test-results.xml --cov=. --cov-branch --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=html:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.html --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
|
126
140
|
]
|
127
141
|
|
128
142
|
coverage = [
|
snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/__init__.py
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
import logging
|
17
|
+
|
18
|
+
|
19
|
+
# Add a NullHandler to prevent logging messages from being output to
|
20
|
+
# sys.stderr if no logging configuration is provided.
|
21
|
+
logging.getLogger(__name__).addHandler(logging.NullHandler())
|
22
|
+
|
23
|
+
# ruff: noqa: E402
|
24
|
+
|
25
|
+
__all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
|
26
|
+
|
27
|
+
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
28
|
+
from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
|
29
|
+
collect_dataframe_checkpoint,
|
30
|
+
)
|
snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/__version__.py
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
__version__ = "0.2.1"
|
@@ -1,6 +1,17 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
4
15
|
|
5
16
|
import locale
|
6
17
|
|
@@ -81,11 +92,13 @@ COLUMN_IS_UNIQUE_SIZE_KEY = "is_unique_size"
|
|
81
92
|
COLUMN_KEY_TYPE_KEY = "key_type"
|
82
93
|
COLUMN_MARGIN_ERROR_KEY = "margin_error"
|
83
94
|
COLUMN_MAX_KEY = "max"
|
95
|
+
COLUMN_MAX_LENGTH_KEY = "max_length"
|
84
96
|
COLUMN_MAX_SIZE_KEY = "max_size"
|
85
97
|
COLUMN_MEAN_KEY = "mean"
|
86
98
|
COLUMN_MEAN_SIZE_KEY = "mean_size"
|
87
99
|
COLUMN_METADATA_KEY = "metadata"
|
88
100
|
COLUMN_MIN_KEY = "min"
|
101
|
+
COLUMN_MIN_LENGTH_KEY = "min_length"
|
89
102
|
COLUMN_MIN_SIZE_KEY = "min_size"
|
90
103
|
COLUMN_NAME_KEY = "name"
|
91
104
|
COLUMN_NULL_COUNT_KEY = "null_count"
|
@@ -95,6 +108,7 @@ COLUMN_ROWS_NULL_COUNT_KEY = "rows_null_count"
|
|
95
108
|
COLUMN_SIZE_KEY = "size"
|
96
109
|
COLUMN_TRUE_COUNT_KEY = "true_count"
|
97
110
|
COLUMN_TYPE_KEY = "type"
|
111
|
+
COLUMN_VALUE_KEY = "value"
|
98
112
|
COLUMN_VALUE_TYPE_KEY = "value_type"
|
99
113
|
COLUMNS_KEY = "columns"
|
100
114
|
|
@@ -126,6 +140,8 @@ UNKNOWN_SOURCE_FILE = "unknown"
|
|
126
140
|
UNKNOWN_LINE_OF_CODE = -1
|
127
141
|
BACKSLASH_TOKEN = "\\"
|
128
142
|
SLASH_TOKEN = "/"
|
143
|
+
PYSPARK_NONE_SIZE_VALUE = -1
|
144
|
+
PANDAS_LONG_TYPE = "Int64"
|
129
145
|
|
130
146
|
# ENVIRONMENT VARIABLES
|
131
147
|
SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR = (
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
__all__ = ["CollectionPointResult", "CollectionResult", "CollectionPointResultManager"]
|
17
|
+
|
18
|
+
from snowflake.snowpark_checkpoints_collector.collection_result.model.collection_point_result import (
|
19
|
+
CollectionPointResult,
|
20
|
+
CollectionResult,
|
21
|
+
)
|
22
|
+
from snowflake.snowpark_checkpoints_collector.collection_result.model.collection_point_result_manager import (
|
23
|
+
CollectionPointResultManager,
|
24
|
+
)
|
@@ -1,6 +1,17 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
4
15
|
from datetime import datetime
|
5
16
|
from enum import Enum
|
6
17
|
|
@@ -1,7 +1,20 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
4
16
|
import json
|
17
|
+
import logging
|
5
18
|
|
6
19
|
from typing import Optional
|
7
20
|
|
@@ -13,6 +26,7 @@ from snowflake.snowpark_checkpoints_collector.utils import file_utils
|
|
13
26
|
|
14
27
|
|
15
28
|
RESULTS_KEY = "results"
|
29
|
+
LOGGER = logging.getLogger(__name__)
|
16
30
|
|
17
31
|
|
18
32
|
class CollectionPointResultManager(metaclass=Singleton):
|
@@ -38,6 +52,7 @@ class CollectionPointResultManager(metaclass=Singleton):
|
|
38
52
|
|
39
53
|
"""
|
40
54
|
result_json = result.get_collection_result_data()
|
55
|
+
LOGGER.debug("Adding a new collection result: %s", result_json)
|
41
56
|
self.result_collection.append(result_json)
|
42
57
|
self._save_result()
|
43
58
|
|
@@ -54,5 +69,6 @@ class CollectionPointResultManager(metaclass=Singleton):
|
|
54
69
|
|
55
70
|
def _save_result(self) -> None:
|
56
71
|
result_collection_json = self.to_json()
|
72
|
+
LOGGER.info("Saving collection results to '%s'", self.output_file_path)
|
57
73
|
with open(self.output_file_path, "w") as f:
|
58
74
|
f.write(result_collection_json)
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
__all__ = [
|
17
|
+
"ColumnCollectorManager",
|
18
|
+
]
|
19
|
+
|
20
|
+
from snowflake.snowpark_checkpoints_collector.column_collection.column_collector_manager import (
|
21
|
+
ColumnCollectorManager,
|
22
|
+
)
|