snowpark-checkpoints-collectors 0.1.0rc2__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/LICENSE +0 -25
  2. snowpark_checkpoints_collectors-0.1.1/PKG-INFO +143 -0
  3. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/README.md +1 -4
  4. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/pyproject.toml +17 -9
  5. snowpark_checkpoints_collectors-0.1.1/src/snowflake/snowpark_checkpoints_collector/__init__.py +22 -0
  6. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +14 -3
  7. snowpark_checkpoints_collectors-0.1.1/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +24 -0
  8. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +14 -3
  9. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +14 -3
  10. snowpark_checkpoints_collectors-0.1.1/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +22 -0
  11. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +14 -3
  12. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +14 -3
  13. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +14 -3
  14. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +14 -3
  15. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +14 -3
  16. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +14 -3
  17. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +14 -3
  18. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +14 -3
  19. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +14 -3
  20. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +14 -3
  21. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +14 -3
  22. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +14 -3
  23. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +14 -3
  24. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +14 -3
  25. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +14 -3
  26. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +14 -3
  27. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +14 -3
  28. snowpark_checkpoints_collectors-0.1.1/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +20 -0
  29. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +14 -3
  30. snowpark_checkpoints_collectors-0.1.1/src/snowflake/snowpark_checkpoints_collector/singleton.py +23 -0
  31. snowpark_checkpoints_collectors-0.1.1/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +20 -0
  32. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +14 -3
  33. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +14 -3
  34. snowpark_checkpoints_collectors-0.1.1/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +53 -0
  35. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +14 -3
  36. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +14 -3
  37. snowpark_checkpoints_collectors-0.1.1/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +889 -0
  38. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/.coveragerc +1 -0
  39. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/telemetry_compare_utils.py +20 -3
  40. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1.py +14 -3
  41. snowpark_checkpoints_collectors-0.1.1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +18 -0
  42. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +6 -5
  43. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +6 -5
  44. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +5 -4
  45. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +5 -4
  46. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +6 -5
  47. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +5 -4
  48. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +6 -5
  49. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_2.py +14 -3
  50. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +5 -4
  51. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +5 -4
  52. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +5 -4
  53. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_2_expected/test_collect_invalid_mode_telemetry.json +5 -4
  54. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_2_expected/test_generate_parquet_for_spark_df_telemetry.json +5 -4
  55. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_2_expected/test_spark_df_mode_dataframe_telemetry.json +5 -4
  56. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collection_result_file.py +14 -3
  57. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_snow_connection_int.py +14 -3
  58. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/unit/test_column_collection.py +14 -3
  59. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/unit/test_extra_config.py +14 -3
  60. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/unit/test_file_utils.py +14 -3
  61. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/unit/test_snow_connection.py +14 -3
  62. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/unit/test_summary_stats_collector.py +14 -3
  63. snowpark_checkpoints_collectors-0.1.0rc2/PKG-INFO +0 -347
  64. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/__init__.py +0 -11
  65. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -13
  66. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -11
  67. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -9
  68. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -12
  69. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -9
  70. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -49
  71. snowpark_checkpoints_collectors-0.1.0rc2/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -1
  72. snowpark_checkpoints_collectors-0.1.0rc2/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -17
  73. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/.gitignore +0 -0
  74. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/CHANGELOG.md +0 -0
  75. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/snowpark-testdf-schema.json +0 -0
  76. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_checkpoint_name.py +0 -0
  77. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -0
  78. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -0
  79. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -0
  80. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
  81. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
  82. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
  83. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -0
  84. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -0
  85. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/unit/test_checkpoint_name_utils.py +0 -0
  86. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/unit/test_collection_point_result.py +0 -0
  87. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/unit/test_collection_point_result_manager.py +0 -0
  88. {snowpark_checkpoints_collectors-0.1.0rc2 → snowpark_checkpoints_collectors-0.1.1}/test/unit/test_pandera_column_check_manager.py +0 -0
@@ -175,28 +175,3 @@
175
175
  of your accepting any such warranty or additional liability.
176
176
 
177
177
  END OF TERMS AND CONDITIONS
178
-
179
- APPENDIX: How to apply the Apache License to your work.
180
-
181
- To apply the Apache License to your work, attach the following
182
- boilerplate notice, with the fields enclosed by brackets "[]"
183
- replaced with your own identifying information. (Don't include
184
- the brackets!) The text should be enclosed in the appropriate
185
- comment syntax for the file format. We also recommend that a
186
- file or class name and description of purpose be included on the
187
- same "printed page" as the copyright notice for easier
188
- identification within third-party archives.
189
-
190
- Copyright 2025 Snowflake
191
-
192
- Licensed under the Apache License, Version 2.0 (the "License");
193
- you may not use this file except in compliance with the License.
194
- You may obtain a copy of the License at
195
-
196
- http://www.apache.org/licenses/LICENSE-2.0
197
-
198
- Unless required by applicable law or agreed to in writing, software
199
- distributed under the License is distributed on an "AS IS" BASIS,
200
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
- See the License for the specific language governing permissions and
202
- limitations under the License.
@@ -0,0 +1,143 @@
1
+ Metadata-Version: 2.4
2
+ Name: snowpark-checkpoints-collectors
3
+ Version: 0.1.1
4
+ Summary: Snowpark column and table statistics collection
5
+ Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
6
+ Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
7
+ Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
8
+ License: Apache License, Version 2.0
9
+ License-File: LICENSE
10
+ Keywords: Snowflake,Snowpark,analytics,cloud,database,db
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Environment :: Other Environment
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Education
16
+ Classifier: Intended Audience :: Information Technology
17
+ Classifier: Intended Audience :: System Administrators
18
+ Classifier: License :: OSI Approved :: Apache Software License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3 :: Only
21
+ Classifier: Programming Language :: SQL
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
+ Classifier: Topic :: Software Development
25
+ Classifier: Topic :: Software Development :: Libraries
26
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Requires-Python: <3.12,>=3.9
29
+ Requires-Dist: pandera[io]==0.20.4
30
+ Requires-Dist: pyspark
31
+ Requires-Dist: snowflake-connector-python==3.13.0
32
+ Requires-Dist: snowflake-snowpark-python==1.26.0
33
+ Provides-Extra: development
34
+ Requires-Dist: coverage>=7.6.7; extra == 'development'
35
+ Requires-Dist: deepdiff>=8.0.0; extra == 'development'
36
+ Requires-Dist: hatchling==1.25.0; extra == 'development'
37
+ Requires-Dist: pre-commit>=4.0.1; extra == 'development'
38
+ Requires-Dist: pyarrow>=18.0.0; extra == 'development'
39
+ Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
40
+ Requires-Dist: pytest>=8.3.3; extra == 'development'
41
+ Requires-Dist: setuptools>=70.0.0; extra == 'development'
42
+ Requires-Dist: twine==5.1.1; extra == 'development'
43
+ Description-Content-Type: text/markdown
44
+
45
+ # snowpark-checkpoints-collectors
46
+
47
+ ---
48
+ **NOTE**
49
+ This package is on Public Preview.
50
+ ---
51
+ **snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
52
+ ## Features
53
+
54
+ - Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
55
+ - DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe. In this case, the mechanism saves all data of the given dataframe in parquet format. Using the default user Snowflake connection, it tries to upload the parquet files into the Snowflake temporary stage and create a table based on the information in the stage. The name of the file and the table is the same as the checkpoint.
56
+
57
+
58
+
59
+ ## Functionalities
60
+
61
+ ### Collect DataFrame Checkpoint
62
+
63
+
64
+
65
+ ```python
66
+ from pyspark.sql import DataFrame as SparkDataFrame
67
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
68
+ from typing import Optional
69
+
70
+ # Signature of the function
71
+ def collect_dataframe_checkpoint(
72
+ df: SparkDataFrame,
73
+ checkpoint_name: str,
74
+ sample: Optional[float] = None,
75
+ mode: Optional[CheckpointMode] = None,
76
+ output_path: Optional[str] = None,
77
+ ) -> None:
78
+ ...
79
+ ```
80
+
81
+ - `df`: The input Spark dataframe to collect.
82
+ - `checkpoint_name`: Name of the checkpoint schema file or dataframe.
83
+ - `sample`: Fraction of DataFrame to sample for schema inference, defaults to 1.0.
84
+ - `mode`: The mode to execute the collection (Schema or Dataframe), defaults to CheckpointMode.Schema.
85
+ - `output_path`: The output path to save the checkpoint, defaults to current working directory.
86
+
87
+
88
+ ## Usage Example
89
+
90
+ ### Schema mode
91
+
92
+ ```python
93
+ from pyspark.sql import SparkSession
94
+ from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
95
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
96
+
97
+ spark_session = SparkSession.builder.getOrCreate()
98
+ sample_size = 1.0
99
+
100
+ pyspark_df = spark_session.createDataFrame(
101
+ [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
102
+ )
103
+
104
+ collect_dataframe_checkpoint(
105
+ pyspark_df,
106
+ checkpoint_name="collect_checkpoint_mode_1",
107
+ sample=sample_size,
108
+ mode=CheckpointMode.SCHEMA,
109
+ )
110
+ ```
111
+
112
+
113
+ ### Dataframe mode
114
+
115
+ ```python
116
+ from pyspark.sql import SparkSession
117
+ from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
118
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
119
+ from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
120
+
121
+ spark_schema = StructType(
122
+ [
123
+ StructField("BYTE", ByteType(), True),
124
+ StructField("STRING", StringType(), True),
125
+ StructField("INTEGER", IntegerType(), True)
126
+ ]
127
+ )
128
+
129
+ data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
130
+
131
+ spark_session = SparkSession.builder.getOrCreate()
132
+ pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
133
+ "INTEGER"
134
+ )
135
+
136
+ collect_dataframe_checkpoint(
137
+ pyspark_df,
138
+ checkpoint_name="collect_checkpoint_mode_2",
139
+ mode=CheckpointMode.DATAFRAME,
140
+ )
141
+ ```
142
+
143
+ ------
@@ -2,11 +2,8 @@
2
2
 
3
3
  ---
4
4
  **NOTE**
5
-
6
- This package is on Private Preview.
7
-
5
+ This package is on Public Preview.
8
6
  ---
9
-
10
7
  **snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
11
8
  ## Features
12
9
 
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
3
3
  requires = ["hatchling"]
4
4
 
5
5
  [project]
6
- authors = [{name = "Snowflake Inc."}]
6
+ authors = [
7
+ {name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
8
+ ]
7
9
  classifiers = [
8
10
  "Development Status :: 4 - Beta",
9
11
  "Environment :: Console",
@@ -24,12 +26,13 @@ classifiers = [
24
26
  "Topic :: Scientific/Engineering :: Information Analysis",
25
27
  ]
26
28
  dependencies = [
27
- "snowflake-snowpark-python",
28
- "snowflake-connector-python",
29
+ "snowflake-snowpark-python==1.26.0",
30
+ "snowflake-connector-python==3.13.0",
29
31
  "pyspark",
30
32
  "pandera[io]==0.20.4",
31
33
  ]
32
34
  description = "Snowpark column and table statistics collection"
35
+ dynamic = ['version']
33
36
  keywords = [
34
37
  'Snowflake',
35
38
  'analytics',
@@ -38,11 +41,10 @@ keywords = [
38
41
  'db',
39
42
  'Snowpark',
40
43
  ]
41
- license = {file = "LICENSE"}
44
+ license = {text = "Apache License, Version 2.0"}
42
45
  name = "snowpark-checkpoints-collectors"
43
46
  readme = "README.md"
44
47
  requires-python = '>=3.9,<3.12'
45
- dynamic = ['version']
46
48
 
47
49
  [project.optional-dependencies]
48
50
  development = [
@@ -74,15 +76,22 @@ where = ["src/"]
74
76
  dev-mode-dirs = ['src']
75
77
  directory = 'snowpark-checkpoints-collectors'
76
78
 
79
+ [[tool.hatch.sources]]
80
+ dir = "src/snowflake/snowpark_checkpoints_collector"
81
+ name = "snowpark-checkpoints-collectors"
82
+ type = "package"
83
+
77
84
  [tool.hatch.build.targets.wheel]
78
85
  directory = "dist"
79
- packages = ["snowpark-checkpoints-collectors/src/snowflake/snowpark_checkpoints_collector"]
86
+ packages = [
87
+ "src/snowflake",
88
+ ]
80
89
 
81
90
  [tool.hatch.build.targets.sdist]
82
91
  directory = "dist"
83
92
  exclude = ["/.github", "/.idea"]
84
93
  include = [
85
- 'src/',
94
+ 'src/**',
86
95
  'README.md',
87
96
  'LICENSE',
88
97
  'test/',
@@ -113,7 +122,6 @@ exclude_lines = [
113
122
  "if __name__ == .__main__.:",
114
123
  ]
115
124
 
116
-
117
125
  [tool.hatch.envs.linter.scripts]
118
126
  check = [
119
127
  'ruff check --fix .',
@@ -121,7 +129,7 @@ check = [
121
129
 
122
130
  [tool.hatch.envs.test.scripts]
123
131
  check = [
124
- "pip install -e ../snowpark-checkpoints-configuration" ,
132
+ "pip install -e ../snowpark-checkpoints-configuration",
125
133
  'pytest -v --junitxml=test/outcome/test-results.xml --cov=. --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
126
134
  ]
127
135
 
@@ -0,0 +1,22 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
17
+
18
+ from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
19
+ collect_dataframe_checkpoint,
20
+ )
21
+
22
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
 
5
16
  import locale
6
17
 
@@ -0,0 +1,24 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __all__ = ["CollectionPointResult", "CollectionResult", "CollectionPointResultManager"]
17
+
18
+ from snowflake.snowpark_checkpoints_collector.collection_result.model.collection_point_result import (
19
+ CollectionPointResult,
20
+ CollectionResult,
21
+ )
22
+ from snowflake.snowpark_checkpoints_collector.collection_result.model.collection_point_result_manager import (
23
+ CollectionPointResultManager,
24
+ )
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
  from datetime import datetime
5
16
  from enum import Enum
6
17
 
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
  import json
5
16
 
6
17
  from typing import Optional
@@ -0,0 +1,22 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __all__ = [
17
+ "ColumnCollectorManager",
18
+ ]
19
+
20
+ from snowflake.snowpark_checkpoints_collector.column_collection.column_collector_manager import (
21
+ ColumnCollectorManager,
22
+ )
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
  from pyspark.sql import DataFrame as SparkDataFrame
5
16
  from pyspark.sql.types import StructField
6
17
 
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
 
5
16
  __all__ = [
6
17
  "ArrayColumnCollector",
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
  from statistics import mean
5
16
 
6
17
  from pyspark.sql import DataFrame as SparkDataFrame
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
  from statistics import mean
5
16
 
6
17
  from pyspark.sql import DataFrame as SparkDataFrame
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
 
5
16
  from pyspark.sql import DataFrame as SparkDataFrame
6
17
  from pyspark.sql.types import StructField
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
 
5
16
  from abc import ABC, abstractmethod
6
17
 
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
  from pyspark.sql import DataFrame as SparkDataFrame
5
16
  from pyspark.sql.functions import col as spark_col
6
17
  from pyspark.sql.functions import max as spark_max
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
 
5
16
  from pyspark.sql import DataFrame as SparkDataFrame
6
17
  from pyspark.sql.functions import col as spark_col
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
 
5
16
  from pyspark.sql import DataFrame as SparkDataFrame
6
17
  from pyspark.sql.functions import col as spark_col
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
 
5
16
  from pyspark.sql import DataFrame as SparkDataFrame
6
17
  from pyspark.sql.types import StructField
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
  from statistics import mean
5
16
 
6
17
  from pyspark.sql import DataFrame as SparkDataFrame