snowpark-checkpoints-collectors 0.2.0rc1__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/.gitignore +4 -0
  2. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/LICENSE +0 -25
  3. snowpark_checkpoints_collectors-0.2.1/PKG-INFO +158 -0
  4. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/README.md +13 -3
  5. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/pyproject.toml +25 -11
  6. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/__init__.py +30 -0
  7. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/__version__.py +16 -0
  8. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +19 -3
  9. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +24 -0
  10. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +14 -3
  11. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +19 -3
  12. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +22 -0
  13. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +55 -21
  14. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +14 -3
  15. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +36 -19
  16. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +31 -14
  17. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +71 -0
  18. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +21 -10
  19. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +74 -0
  20. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +67 -0
  21. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +92 -0
  22. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +23 -10
  23. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +39 -20
  24. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +49 -0
  25. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +39 -15
  26. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +70 -0
  27. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +24 -11
  28. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +75 -0
  29. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +75 -0
  30. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +20 -0
  31. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +88 -36
  32. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/singleton.py +23 -0
  33. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +20 -0
  34. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +52 -12
  35. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +154 -83
  36. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +53 -0
  37. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +23 -5
  38. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +14 -3
  39. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/utils/logging_utils.py +67 -0
  40. snowpark_checkpoints_collectors-0.2.1/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +889 -0
  41. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/.coveragerc +1 -0
  42. snowpark_checkpoints_collectors-0.2.1/test/integ/telemetry_compare_utils.py +69 -0
  43. snowpark_checkpoints_collectors-0.2.1/test/integ/test_checkpoint_name.py +74 -0
  44. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1.py +123 -108
  45. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +1 -0
  46. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +18 -0
  47. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +1 -1
  48. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +18 -0
  49. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +1 -0
  50. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +18 -0
  51. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +18 -0
  52. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +18 -0
  53. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +18 -0
  54. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +1 -1
  55. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +1 -1
  56. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +9 -8
  57. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +18 -0
  58. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_2.py +96 -21
  59. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +18 -0
  60. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +18 -0
  61. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +18 -0
  62. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_collect_invalid_mode_telemetry.json +18 -0
  63. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_generate_parquet_for_spark_df_telemetry.json +18 -0
  64. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_spark_df_mode_dataframe_telemetry.json +18 -0
  65. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collection_result_file.py +39 -13
  66. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_snow_connection_int.py +14 -3
  67. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_checkpoint_name_utils.py +4 -2
  68. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_collection_point_result_manager.py +31 -6
  69. snowpark_checkpoints_collectors-0.2.1/test/unit/test_column_collection.py +477 -0
  70. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_extra_config.py +14 -3
  71. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_file_utils.py +14 -3
  72. snowpark_checkpoints_collectors-0.2.1/test/unit/test_logger.py +132 -0
  73. snowpark_checkpoints_collectors-0.2.1/test/unit/test_logging_utils.py +132 -0
  74. snowpark_checkpoints_collectors-0.2.1/test/unit/test_pandera_column_check_manager.py +194 -0
  75. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_snow_connection.py +15 -4
  76. snowpark_checkpoints_collectors-0.2.1/test/unit/test_summary_stats_collector.py +70 -0
  77. snowpark_checkpoints_collectors-0.2.0rc1/PKG-INFO +0 -347
  78. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/__init__.py +0 -11
  79. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -13
  80. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -11
  81. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -53
  82. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -56
  83. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -49
  84. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -69
  85. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -38
  86. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -35
  87. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +0 -54
  88. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +0 -54
  89. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -9
  90. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -12
  91. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -9
  92. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -49
  93. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -1
  94. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_checkpoint_name.py +0 -51
  95. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -1
  96. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -17
  97. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -17
  98. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -1
  99. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -17
  100. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -17
  101. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -17
  102. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -17
  103. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -17
  104. snowpark_checkpoints_collectors-0.2.0rc1/test/unit/test_column_collection.py +0 -669
  105. snowpark_checkpoints_collectors-0.2.0rc1/test/unit/test_summary_stats_collector.py +0 -29
  106. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/CHANGELOG.md +0 -0
  107. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/snowpark-testdf-schema.json +0 -0
  108. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
  109. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
  110. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
  111. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.2.1}/test/unit/test_collection_point_result.py +0 -0
@@ -4,12 +4,16 @@
4
4
 
5
5
  # demos
6
6
  snowpark-checkpoints-output/
7
+ Demos/Demos/
8
+ Demos/snowpark-checkpoints-output/
7
9
 
8
10
  # env
9
11
  wheelvenv/
10
12
 
13
+
11
14
  # version
12
15
  !__version__.py
16
+ !**/__version__.py
13
17
 
14
18
  #ruff
15
19
  .ruff_cache
@@ -175,28 +175,3 @@
175
175
  of your accepting any such warranty or additional liability.
176
176
 
177
177
  END OF TERMS AND CONDITIONS
178
-
179
- APPENDIX: How to apply the Apache License to your work.
180
-
181
- To apply the Apache License to your work, attach the following
182
- boilerplate notice, with the fields enclosed by brackets "[]"
183
- replaced with your own identifying information. (Don't include
184
- the brackets!) The text should be enclosed in the appropriate
185
- comment syntax for the file format. We also recommend that a
186
- file or class name and description of purpose be included on the
187
- same "printed page" as the copyright notice for easier
188
- identification within third-party archives.
189
-
190
- Copyright 2025 Snowflake
191
-
192
- Licensed under the Apache License, Version 2.0 (the "License");
193
- you may not use this file except in compliance with the License.
194
- You may obtain a copy of the License at
195
-
196
- http://www.apache.org/licenses/LICENSE-2.0
197
-
198
- Unless required by applicable law or agreed to in writing, software
199
- distributed under the License is distributed on an "AS IS" BASIS,
200
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
- See the License for the specific language governing permissions and
202
- limitations under the License.
@@ -0,0 +1,158 @@
1
+ Metadata-Version: 2.4
2
+ Name: snowpark-checkpoints-collectors
3
+ Version: 0.2.1
4
+ Summary: Snowpark column and table statistics collection
5
+ Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
6
+ Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
7
+ Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
8
+ License: Apache License, Version 2.0
9
+ License-File: LICENSE
10
+ Keywords: Snowflake,Snowpark,analytics,cloud,database,db
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Environment :: Other Environment
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Education
16
+ Classifier: Intended Audience :: Information Technology
17
+ Classifier: Intended Audience :: System Administrators
18
+ Classifier: License :: OSI Approved :: Apache Software License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3 :: Only
21
+ Classifier: Programming Language :: SQL
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
+ Classifier: Topic :: Software Development
25
+ Classifier: Topic :: Software Development :: Libraries
26
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Requires-Python: <3.12,>=3.9
29
+ Requires-Dist: pandera[io]==0.20.4
30
+ Requires-Dist: snowflake-connector-python
31
+ Requires-Dist: snowflake-snowpark-python>=1.23.0
32
+ Provides-Extra: development
33
+ Requires-Dist: coverage>=7.6.7; extra == 'development'
34
+ Requires-Dist: deepdiff>=8.0.0; extra == 'development'
35
+ Requires-Dist: hatchling==1.25.0; extra == 'development'
36
+ Requires-Dist: pre-commit>=4.0.1; extra == 'development'
37
+ Requires-Dist: pyarrow>=18.0.0; extra == 'development'
38
+ Requires-Dist: pyspark>=3.5.0; extra == 'development'
39
+ Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
40
+ Requires-Dist: pytest>=8.3.3; extra == 'development'
41
+ Requires-Dist: setuptools>=70.0.0; extra == 'development'
42
+ Requires-Dist: twine==5.1.1; extra == 'development'
43
+ Provides-Extra: pyspark
44
+ Requires-Dist: pyspark>=3.5.0; extra == 'pyspark'
45
+ Description-Content-Type: text/markdown
46
+
47
+ # snowpark-checkpoints-collectors
48
+
49
+
50
+ ---
51
+ ##### This package is in Public Preview.
52
+ ---
53
+
54
+ The **snowpark-checkpoints-collectors** package offers a function for extracting information from PySpark dataframes. That data can then be used to validate the converted Snowpark dataframes and ensure that behavioral equivalence has been achieved.
55
+
56
+ ---
57
+ ## Install the library
58
+ ```bash
59
+ pip install snowpark-checkpoints-collectors
60
+ ```
61
+ This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
62
+ ```bash
63
+ pip install "snowpark-checkpoints-collectors[pyspark]"
64
+ ```
65
+ ---
66
+
67
+ ## Features
68
+
69
+ - Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
70
+ - DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe. In this case, the mechanism saves all data of the given dataframe in parquet format. Using the default user Snowflake connection, it tries to upload the parquet files into a Snowflake temporary stage and create a table based on the information in the stage. The file and the table are both named after the checkpoint.
71
+
72
+
73
+
74
+ ## Functionalities
75
+
76
+ ### Collect DataFrame Checkpoint
77
+
78
+
79
+
80
+ ```python
81
+ from pyspark.sql import DataFrame as SparkDataFrame
82
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
83
+ from typing import Optional
84
+
85
+ # Signature of the function
86
+ def collect_dataframe_checkpoint(
87
+ df: SparkDataFrame,
88
+ checkpoint_name: str,
89
+ sample: Optional[float] = None,
90
+ mode: Optional[CheckpointMode] = None,
91
+ output_path: Optional[str] = None,
92
+ ) -> None:
93
+ ...
94
+ ```
95
+
96
+ - `df`: The input Spark dataframe to collect.
97
+ - `checkpoint_name`: Name of the checkpoint schema file or dataframe.
98
+ - `sample`: Fraction of DataFrame to sample for schema inference, defaults to 1.0.
99
+ - `mode`: The mode in which to execute the collection (Schema or DataFrame), defaults to CheckpointMode.SCHEMA.
100
+ - `output_path`: The output path to save the checkpoint, defaults to current working directory.
101
+
102
+
103
+ ## Usage Example
104
+
105
+ ### Schema mode
106
+
107
+ ```python
108
+ from pyspark.sql import SparkSession
109
+ from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
110
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
111
+
112
+ spark_session = SparkSession.builder.getOrCreate()
113
+ sample_size = 1.0
114
+
115
+ pyspark_df = spark_session.createDataFrame(
116
+ [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
117
+ )
118
+
119
+ collect_dataframe_checkpoint(
120
+ pyspark_df,
121
+ checkpoint_name="collect_checkpoint_mode_1",
122
+ sample=sample_size,
123
+ mode=CheckpointMode.SCHEMA,
124
+ )
125
+ ```
126
+
127
+
128
+ ### Dataframe mode
129
+
130
+ ```python
131
+ from pyspark.sql import SparkSession
132
+ from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
133
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
134
+ from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
135
+
136
+ spark_schema = StructType(
137
+ [
138
+ StructField("BYTE", ByteType(), True),
139
+ StructField("STRING", StringType(), True),
140
+ StructField("INTEGER", IntegerType(), True)
141
+ ]
142
+ )
143
+
144
+ data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
145
+
146
+ spark_session = SparkSession.builder.getOrCreate()
147
+ pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
148
+ "INTEGER"
149
+ )
150
+
151
+ collect_dataframe_checkpoint(
152
+ pyspark_df,
153
+ checkpoint_name="collect_checkpoint_mode_2",
154
+ mode=CheckpointMode.DATAFRAME,
155
+ )
156
+ ```
157
+
158
+ ------
@@ -1,13 +1,23 @@
1
1
  # snowpark-checkpoints-collectors
2
2
 
3
+
4
+ ---
5
+ ##### This package is in Public Preview.
3
6
  ---
4
- **NOTE**
5
7
 
6
- This package is on Private Preview.
8
+ The **snowpark-checkpoints-collectors** package offers a function for extracting information from PySpark dataframes. That data can then be used to validate the converted Snowpark dataframes and ensure that behavioral equivalence has been achieved.
7
9
 
10
+ ---
11
+ ## Install the library
12
+ ```bash
13
+ pip install snowpark-checkpoints-collectors
14
+ ```
15
+ This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
16
+ ```bash
17
+ pip install "snowpark-checkpoints-collectors[pyspark]"
18
+ ```
8
19
  ---
9
20
 
10
- **snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
11
21
  ## Features
12
22
 
13
23
  - Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
3
3
  requires = ["hatchling"]
4
4
 
5
5
  [project]
6
- authors = [{name = "Snowflake Inc."}]
6
+ authors = [
7
+ {name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
8
+ ]
7
9
  classifiers = [
8
10
  "Development Status :: 4 - Beta",
9
11
  "Environment :: Console",
@@ -24,12 +26,12 @@ classifiers = [
24
26
  "Topic :: Scientific/Engineering :: Information Analysis",
25
27
  ]
26
28
  dependencies = [
27
- "snowflake-snowpark-python",
29
+ "snowflake-snowpark-python>=1.23.0",
28
30
  "snowflake-connector-python",
29
- "pyspark",
30
31
  "pandera[io]==0.20.4",
31
32
  ]
32
33
  description = "Snowpark column and table statistics collection"
34
+ dynamic = ['version']
33
35
  keywords = [
34
36
  'Snowflake',
35
37
  'analytics',
@@ -38,13 +40,15 @@ keywords = [
38
40
  'db',
39
41
  'Snowpark',
40
42
  ]
41
- license = {file = "LICENSE"}
43
+ license = {text = "Apache License, Version 2.0"}
42
44
  name = "snowpark-checkpoints-collectors"
43
45
  readme = "README.md"
44
46
  requires-python = '>=3.9,<3.12'
45
- dynamic = ['version']
46
47
 
47
48
  [project.optional-dependencies]
49
+ pyspark = [
50
+ "pyspark>=3.5.0",
51
+ ]
48
52
  development = [
49
53
  "pytest>=8.3.3",
50
54
  "pytest-cov>=6.0.0",
@@ -55,14 +59,16 @@ development = [
55
59
  "setuptools>=70.0.0",
56
60
  "pyarrow>=18.0.0",
57
61
  "deepdiff>=8.0.0",
62
+ "pyspark>=3.5.0",
58
63
  ]
59
64
 
60
65
  [project.urls]
61
66
  "Bug Tracker" = "https://github.com/snowflakedb/snowpark-checkpoints/issues"
62
67
  "Source code" = "https://github.com/snowflakedb/snowpark-checkpoints/"
63
68
 
69
+
64
70
  [tool.hatch.version]
65
- path = "__version__.py"
71
+ path = "src/snowflake/snowpark_checkpoints_collector/__version__.py"
66
72
  pattern = '^__version__ = "(?P<version>.*)"'
67
73
  source = "regex"
68
74
 
@@ -74,15 +80,22 @@ where = ["src/"]
74
80
  dev-mode-dirs = ['src']
75
81
  directory = 'snowpark-checkpoints-collectors'
76
82
 
83
+ [[tool.hatch.sources]]
84
+ dir = "src/snowflake/snowpark_checkpoints_collector"
85
+ name = "snowpark-checkpoints-collectors"
86
+ type = "package"
87
+
77
88
  [tool.hatch.build.targets.wheel]
78
89
  directory = "dist"
79
- packages = ["snowpark-checkpoints-collectors/src/snowflake/snowpark_checkpoints_collector"]
90
+ packages = [
91
+ "src/snowflake",
92
+ ]
80
93
 
81
94
  [tool.hatch.build.targets.sdist]
82
95
  directory = "dist"
83
96
  exclude = ["/.github", "/.idea"]
84
97
  include = [
85
- 'src/',
98
+ 'src/**',
86
99
  'README.md',
87
100
  'LICENSE',
88
101
  'test/',
@@ -113,7 +126,6 @@ exclude_lines = [
113
126
  "if __name__ == .__main__.:",
114
127
  ]
115
128
 
116
-
117
129
  [tool.hatch.envs.linter.scripts]
118
130
  check = [
119
131
  'ruff check --fix .',
@@ -121,8 +133,10 @@ check = [
121
133
 
122
134
  [tool.hatch.envs.test.scripts]
123
135
  check = [
124
- "pip install -e ../snowpark-checkpoints-configuration" ,
125
- 'pytest -v --junitxml=test/outcome/test-results.xml --cov=. --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
136
+ 'python -m pip install --upgrade pip -q',
137
+ 'pip install -q -e ../snowpark-checkpoints-configuration',
138
+ 'pip list',
139
+ 'pytest -vvv --junitxml=test/outcome/test-results.xml --cov=. --cov-branch --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=html:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.html --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
126
140
  ]
127
141
 
128
142
  coverage = [
@@ -0,0 +1,30 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import logging
17
+
18
+
19
+ # Add a NullHandler to prevent logging messages from being output to
20
+ # sys.stderr if no logging configuration is provided.
21
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
22
+
23
+ # ruff: noqa: E402
24
+
25
+ __all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
26
+
27
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
28
+ from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
29
+ collect_dataframe_checkpoint,
30
+ )
@@ -0,0 +1,16 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __version__ = "0.2.1"
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
 
5
16
  import locale
6
17
 
@@ -81,11 +92,13 @@ COLUMN_IS_UNIQUE_SIZE_KEY = "is_unique_size"
81
92
  COLUMN_KEY_TYPE_KEY = "key_type"
82
93
  COLUMN_MARGIN_ERROR_KEY = "margin_error"
83
94
  COLUMN_MAX_KEY = "max"
95
+ COLUMN_MAX_LENGTH_KEY = "max_length"
84
96
  COLUMN_MAX_SIZE_KEY = "max_size"
85
97
  COLUMN_MEAN_KEY = "mean"
86
98
  COLUMN_MEAN_SIZE_KEY = "mean_size"
87
99
  COLUMN_METADATA_KEY = "metadata"
88
100
  COLUMN_MIN_KEY = "min"
101
+ COLUMN_MIN_LENGTH_KEY = "min_length"
89
102
  COLUMN_MIN_SIZE_KEY = "min_size"
90
103
  COLUMN_NAME_KEY = "name"
91
104
  COLUMN_NULL_COUNT_KEY = "null_count"
@@ -95,6 +108,7 @@ COLUMN_ROWS_NULL_COUNT_KEY = "rows_null_count"
95
108
  COLUMN_SIZE_KEY = "size"
96
109
  COLUMN_TRUE_COUNT_KEY = "true_count"
97
110
  COLUMN_TYPE_KEY = "type"
111
+ COLUMN_VALUE_KEY = "value"
98
112
  COLUMN_VALUE_TYPE_KEY = "value_type"
99
113
  COLUMNS_KEY = "columns"
100
114
 
@@ -126,6 +140,8 @@ UNKNOWN_SOURCE_FILE = "unknown"
126
140
  UNKNOWN_LINE_OF_CODE = -1
127
141
  BACKSLASH_TOKEN = "\\"
128
142
  SLASH_TOKEN = "/"
143
+ PYSPARK_NONE_SIZE_VALUE = -1
144
+ PANDAS_LONG_TYPE = "Int64"
129
145
 
130
146
  # ENVIRONMENT VARIABLES
131
147
  SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR = (
@@ -0,0 +1,24 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __all__ = ["CollectionPointResult", "CollectionResult", "CollectionPointResultManager"]
17
+
18
+ from snowflake.snowpark_checkpoints_collector.collection_result.model.collection_point_result import (
19
+ CollectionPointResult,
20
+ CollectionResult,
21
+ )
22
+ from snowflake.snowpark_checkpoints_collector.collection_result.model.collection_point_result_manager import (
23
+ CollectionPointResultManager,
24
+ )
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
  from datetime import datetime
5
16
  from enum import Enum
6
17
 
@@ -1,7 +1,20 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
4
16
  import json
17
+ import logging
5
18
 
6
19
  from typing import Optional
7
20
 
@@ -13,6 +26,7 @@ from snowflake.snowpark_checkpoints_collector.utils import file_utils
13
26
 
14
27
 
15
28
  RESULTS_KEY = "results"
29
+ LOGGER = logging.getLogger(__name__)
16
30
 
17
31
 
18
32
  class CollectionPointResultManager(metaclass=Singleton):
@@ -38,6 +52,7 @@ class CollectionPointResultManager(metaclass=Singleton):
38
52
 
39
53
  """
40
54
  result_json = result.get_collection_result_data()
55
+ LOGGER.debug("Adding a new collection result: %s", result_json)
41
56
  self.result_collection.append(result_json)
42
57
  self._save_result()
43
58
 
@@ -54,5 +69,6 @@ class CollectionPointResultManager(metaclass=Singleton):
54
69
 
55
70
  def _save_result(self) -> None:
56
71
  result_collection_json = self.to_json()
72
+ LOGGER.info("Saving collection results to '%s'", self.output_file_path)
57
73
  with open(self.output_file_path, "w") as f:
58
74
  f.write(result_collection_json)
@@ -0,0 +1,22 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __all__ = [
17
+ "ColumnCollectorManager",
18
+ ]
19
+
20
+ from snowflake.snowpark_checkpoints_collector.column_collection.column_collector_manager import (
21
+ ColumnCollectorManager,
22
+ )