snowpark-checkpoints-collectors 0.2.0rc1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/.gitignore +4 -0
  2. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/LICENSE +0 -25
  3. snowpark_checkpoints_collectors-0.3.0/PKG-INFO +159 -0
  4. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/README.md +13 -3
  5. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/pyproject.toml +26 -11
  6. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/__init__.py +30 -0
  7. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/__version__.py +16 -0
  8. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +19 -3
  9. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +24 -0
  10. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +14 -3
  11. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +23 -5
  12. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +22 -0
  13. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +55 -21
  14. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +14 -3
  15. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +36 -19
  16. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +31 -14
  17. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +71 -0
  18. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +21 -10
  19. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +74 -0
  20. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +67 -0
  21. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +92 -0
  22. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +23 -10
  23. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +39 -20
  24. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +49 -0
  25. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +39 -15
  26. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +70 -0
  27. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +24 -11
  28. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +75 -0
  29. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +75 -0
  30. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +20 -0
  31. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +88 -36
  32. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/io_utils/__init__.py +26 -0
  33. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/io_utils/io_default_strategy.py +61 -0
  34. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/io_utils/io_env_strategy.py +142 -0
  35. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/io_utils/io_file_manager.py +79 -0
  36. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/singleton.py +23 -0
  37. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +20 -0
  38. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +63 -21
  39. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +159 -89
  40. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +53 -0
  41. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +69 -6
  42. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +23 -7
  43. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/utils/logging_utils.py +67 -0
  44. snowpark_checkpoints_collectors-0.3.0/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +928 -0
  45. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/.coveragerc +1 -0
  46. snowpark_checkpoints_collectors-0.3.0/test/integ/telemetry_compare_utils.py +81 -0
  47. snowpark_checkpoints_collectors-0.3.0/test/integ/test_checkpoint_name.py +74 -0
  48. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_collect_df_mode_1.py +202 -106
  49. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +1 -0
  50. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +18 -0
  51. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +1 -1
  52. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +18 -0
  53. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +1 -0
  54. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +18 -0
  55. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +18 -0
  56. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +18 -0
  57. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +18 -0
  58. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +1 -1
  59. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +1 -1
  60. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +9 -8
  61. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +18 -0
  62. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_1_expected/test_io_strategy.json +1 -0
  63. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_collect_df_mode_2.py +179 -21
  64. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +18 -0
  65. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +18 -0
  66. snowpark_checkpoints_collectors-0.3.0/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +18 -0
  67. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_collection_result_file.py +39 -13
  68. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_snow_connection_int.py +14 -3
  69. snowpark_checkpoints_collectors-0.3.0/test/unit/io_utils/test_default_strategy.py +308 -0
  70. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/unit/test_checkpoint_name_utils.py +4 -2
  71. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/unit/test_collection_point_result_manager.py +31 -6
  72. snowpark_checkpoints_collectors-0.3.0/test/unit/test_column_collection.py +477 -0
  73. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/unit/test_extra_config.py +50 -3
  74. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/unit/test_file_utils.py +14 -3
  75. snowpark_checkpoints_collectors-0.3.0/test/unit/test_logger.py +132 -0
  76. snowpark_checkpoints_collectors-0.3.0/test/unit/test_logging_utils.py +132 -0
  77. snowpark_checkpoints_collectors-0.3.0/test/unit/test_pandera_column_check_manager.py +194 -0
  78. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/unit/test_snow_connection.py +28 -20
  79. snowpark_checkpoints_collectors-0.3.0/test/unit/test_summary_stats_collector.py +70 -0
  80. snowpark_checkpoints_collectors-0.2.0rc1/PKG-INFO +0 -347
  81. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/__init__.py +0 -11
  82. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -13
  83. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -11
  84. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -53
  85. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -56
  86. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -49
  87. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -69
  88. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -38
  89. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -35
  90. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +0 -54
  91. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +0 -54
  92. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -9
  93. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -12
  94. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -9
  95. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -49
  96. snowpark_checkpoints_collectors-0.2.0rc1/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -1
  97. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_checkpoint_name.py +0 -51
  98. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -1
  99. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -17
  100. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -17
  101. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -1
  102. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -17
  103. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -17
  104. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -17
  105. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -17
  106. snowpark_checkpoints_collectors-0.2.0rc1/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -17
  107. snowpark_checkpoints_collectors-0.2.0rc1/test/unit/test_column_collection.py +0 -669
  108. snowpark_checkpoints_collectors-0.2.0rc1/test/unit/test_summary_stats_collector.py +0 -29
  109. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/CHANGELOG.md +0 -0
  110. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/snowpark-testdf-schema.json +0 -0
  111. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
  112. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
  113. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
  114. {snowpark_checkpoints_collectors-0.2.0rc1 → snowpark_checkpoints_collectors-0.3.0}/test/unit/test_collection_point_result.py +0 -0
@@ -4,12 +4,16 @@
4
4
 
5
5
  # demos
6
6
  snowpark-checkpoints-output/
7
+ Demos/Demos/
8
+ Demos/snowpark-checkpoints-output/
7
9
 
8
10
  # env
9
11
  wheelvenv/
10
12
 
13
+
11
14
  # version
12
15
  !__version__.py
16
+ !**/__version__.py
13
17
 
14
18
  #ruff
15
19
  .ruff_cache
@@ -175,28 +175,3 @@
175
175
  of your accepting any such warranty or additional liability.
176
176
 
177
177
  END OF TERMS AND CONDITIONS
178
-
179
- APPENDIX: How to apply the Apache License to your work.
180
-
181
- To apply the Apache License to your work, attach the following
182
- boilerplate notice, with the fields enclosed by brackets "[]"
183
- replaced with your own identifying information. (Don't include
184
- the brackets!) The text should be enclosed in the appropriate
185
- comment syntax for the file format. We also recommend that a
186
- file or class name and description of purpose be included on the
187
- same "printed page" as the copyright notice for easier
188
- identification within third-party archives.
189
-
190
- Copyright 2025 Snowflake
191
-
192
- Licensed under the Apache License, Version 2.0 (the "License");
193
- you may not use this file except in compliance with the License.
194
- You may obtain a copy of the License at
195
-
196
- http://www.apache.org/licenses/LICENSE-2.0
197
-
198
- Unless required by applicable law or agreed to in writing, software
199
- distributed under the License is distributed on an "AS IS" BASIS,
200
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201
- See the License for the specific language governing permissions and
202
- limitations under the License.
@@ -0,0 +1,159 @@
1
+ Metadata-Version: 2.4
2
+ Name: snowpark-checkpoints-collectors
3
+ Version: 0.3.0
4
+ Summary: Snowpark column and table statistics collection
5
+ Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
6
+ Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
7
+ Author-email: "Snowflake, Inc." <snowflake-python-libraries-dl@snowflake.com>
8
+ License: Apache License, Version 2.0
9
+ License-File: LICENSE
10
+ Keywords: Snowflake,Snowpark,analytics,cloud,database,db
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Environment :: Other Environment
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Education
16
+ Classifier: Intended Audience :: Information Technology
17
+ Classifier: Intended Audience :: System Administrators
18
+ Classifier: License :: OSI Approved :: Apache Software License
19
+ Classifier: Operating System :: OS Independent
20
+ Classifier: Programming Language :: Python :: 3 :: Only
21
+ Classifier: Programming Language :: SQL
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
+ Classifier: Topic :: Software Development
25
+ Classifier: Topic :: Software Development :: Libraries
26
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
27
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
28
+ Requires-Python: <3.12,>=3.9
29
+ Requires-Dist: pandera[io]==0.20.4
30
+ Requires-Dist: snowflake-connector-python
31
+ Requires-Dist: snowflake-snowpark-python>=1.23.0
32
+ Provides-Extra: development
33
+ Requires-Dist: certifi==2025.1.31; extra == 'development'
34
+ Requires-Dist: coverage>=7.6.7; extra == 'development'
35
+ Requires-Dist: deepdiff>=8.0.0; extra == 'development'
36
+ Requires-Dist: hatchling==1.25.0; extra == 'development'
37
+ Requires-Dist: pre-commit>=4.0.1; extra == 'development'
38
+ Requires-Dist: pyarrow>=18.0.0; extra == 'development'
39
+ Requires-Dist: pyspark>=3.5.0; extra == 'development'
40
+ Requires-Dist: pytest-cov>=6.0.0; extra == 'development'
41
+ Requires-Dist: pytest>=8.3.3; extra == 'development'
42
+ Requires-Dist: setuptools>=70.0.0; extra == 'development'
43
+ Requires-Dist: twine==5.1.1; extra == 'development'
44
+ Provides-Extra: pyspark
45
+ Requires-Dist: pyspark>=3.5.0; extra == 'pyspark'
46
+ Description-Content-Type: text/markdown
47
+
48
+ # snowpark-checkpoints-collectors
49
+
50
+
51
+ ---
52
+ ##### This package is in Public Preview.
53
+ ---
54
+
55
+ The **snowpark-checkpoints-collectors** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
56
+
57
+ ---
58
+ ## Install the library
59
+ ```bash
60
+ pip install snowpark-checkpoints-collectors
61
+ ```
62
+ This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
63
+ ```bash
64
+ pip install "snowpark-checkpoints-collectors[pyspark]"
65
+ ```
66
+ ---
67
+
68
+ ## Features
69
+
70
+ - Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
71
+ - DataFrame collected data mode (DataFrame): This mode collects the data of the PySpark dataframe. In this case, the mechanism saves all data of the given dataframe in parquet format. Using the default user Snowflake connection, it tries to upload the parquet files into a Snowflake temporary stage and create a table based on the information in the stage. The name of the file and the table is the same as the checkpoint.
72
+
73
+
74
+
75
+ ## Functionalities
76
+
77
+ ### Collect DataFrame Checkpoint
78
+
79
+
80
+
81
+ ```python
82
+ from pyspark.sql import DataFrame as SparkDataFrame
83
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
84
+ from typing import Optional
85
+
86
+ # Signature of the function
87
+ def collect_dataframe_checkpoint(
88
+ df: SparkDataFrame,
89
+ checkpoint_name: str,
90
+ sample: Optional[float] = None,
91
+ mode: Optional[CheckpointMode] = None,
92
+ output_path: Optional[str] = None,
93
+ ) -> None:
94
+ ...
95
+ ```
96
+
97
+ - `df`: The input Spark dataframe to collect.
98
+ - `checkpoint_name`: Name of the checkpoint schema file or dataframe.
99
+ - `sample`: Fraction of DataFrame to sample for schema inference, defaults to 1.0.
100
+ - `mode`: The mode in which to execute the collection (Schema or DataFrame), defaults to CheckpointMode.SCHEMA.
101
+ - `output_path`: The output path to save the checkpoint, defaults to current working directory.
102
+
103
+
104
+ ## Usage Example
105
+
106
+ ### Schema mode
107
+
108
+ ```python
109
+ from pyspark.sql import SparkSession
110
+ from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
111
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
112
+
113
+ spark_session = SparkSession.builder.getOrCreate()
114
+ sample_size = 1.0
115
+
116
+ pyspark_df = spark_session.createDataFrame(
117
+ [("apple", 21), ("lemon", 34), ("banana", 50)], schema="fruit string, age integer"
118
+ )
119
+
120
+ collect_dataframe_checkpoint(
121
+ pyspark_df,
122
+ checkpoint_name="collect_checkpoint_mode_1",
123
+ sample=sample_size,
124
+ mode=CheckpointMode.SCHEMA,
125
+ )
126
+ ```
127
+
128
+
129
+ ### Dataframe mode
130
+
131
+ ```python
132
+ from pyspark.sql import SparkSession
133
+ from snowflake.snowpark_checkpoints_collector import collect_dataframe_checkpoint
134
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
135
+ from pyspark.sql.types import StructType, StructField, ByteType, StringType, IntegerType
136
+
137
+ spark_schema = StructType(
138
+ [
139
+ StructField("BYTE", ByteType(), True),
140
+ StructField("STRING", StringType(), True),
141
+ StructField("INTEGER", IntegerType(), True)
142
+ ]
143
+ )
144
+
145
+ data = [(1, "apple", 21), (2, "lemon", 34), (3, "banana", 50)]
146
+
147
+ spark_session = SparkSession.builder.getOrCreate()
148
+ pyspark_df = spark_session.createDataFrame(data, schema=spark_schema).orderBy(
149
+ "INTEGER"
150
+ )
151
+
152
+ collect_dataframe_checkpoint(
153
+ pyspark_df,
154
+ checkpoint_name="collect_checkpoint_mode_2",
155
+ mode=CheckpointMode.DATAFRAME,
156
+ )
157
+ ```
158
+
159
+ ------
@@ -1,13 +1,23 @@
1
1
  # snowpark-checkpoints-collectors
2
2
 
3
+
4
+ ---
5
+ ##### This package is in Public Preview.
3
6
  ---
4
- **NOTE**
5
7
 
6
- This package is on Private Preview.
8
+ The **snowpark-checkpoints-collectors** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
7
9
 
10
+ ---
11
+ ## Install the library
12
+ ```bash
13
+ pip install snowpark-checkpoints-collectors
14
+ ```
15
+ This package requires PySpark to be installed in the same environment. If you do not have it, you can install PySpark alongside Snowpark Checkpoints by running the following command:
16
+ ```bash
17
+ pip install "snowpark-checkpoints-collectors[pyspark]"
18
+ ```
8
19
  ---
9
20
 
10
- **snowpark-checkpoints-collector** package offers a function for extracting information from PySpark dataframes. We can then use that data to validate against the converted Snowpark dataframes to ensure that behavioral equivalence has been achieved.
11
21
  ## Features
12
22
 
13
23
  - Schema inference collected data mode (Schema): This is the default mode, which leverages Pandera schema inference to obtain the metadata and checks that will be evaluated for the specified dataframe. This mode also collects custom data from columns of the DataFrame based on the PySpark type.
@@ -3,7 +3,9 @@ build-backend = "hatchling.build"
3
3
  requires = ["hatchling"]
4
4
 
5
5
  [project]
6
- authors = [{name = "Snowflake Inc."}]
6
+ authors = [
7
+ {name = "Snowflake, Inc.", email = "snowflake-python-libraries-dl@snowflake.com"},
8
+ ]
7
9
  classifiers = [
8
10
  "Development Status :: 4 - Beta",
9
11
  "Environment :: Console",
@@ -24,12 +26,12 @@ classifiers = [
24
26
  "Topic :: Scientific/Engineering :: Information Analysis",
25
27
  ]
26
28
  dependencies = [
27
- "snowflake-snowpark-python",
29
+ "snowflake-snowpark-python>=1.23.0",
28
30
  "snowflake-connector-python",
29
- "pyspark",
30
31
  "pandera[io]==0.20.4",
31
32
  ]
32
33
  description = "Snowpark column and table statistics collection"
34
+ dynamic = ['version']
33
35
  keywords = [
34
36
  'Snowflake',
35
37
  'analytics',
@@ -38,13 +40,15 @@ keywords = [
38
40
  'db',
39
41
  'Snowpark',
40
42
  ]
41
- license = {file = "LICENSE"}
43
+ license = {text = "Apache License, Version 2.0"}
42
44
  name = "snowpark-checkpoints-collectors"
43
45
  readme = "README.md"
44
46
  requires-python = '>=3.9,<3.12'
45
- dynamic = ['version']
46
47
 
47
48
  [project.optional-dependencies]
49
+ pyspark = [
50
+ "pyspark>=3.5.0",
51
+ ]
48
52
  development = [
49
53
  "pytest>=8.3.3",
50
54
  "pytest-cov>=6.0.0",
@@ -55,14 +59,17 @@ development = [
55
59
  "setuptools>=70.0.0",
56
60
  "pyarrow>=18.0.0",
57
61
  "deepdiff>=8.0.0",
62
+ "pyspark>=3.5.0",
63
+ "certifi==2025.1.31",
58
64
  ]
59
65
 
60
66
  [project.urls]
61
67
  "Bug Tracker" = "https://github.com/snowflakedb/snowpark-checkpoints/issues"
62
68
  "Source code" = "https://github.com/snowflakedb/snowpark-checkpoints/"
63
69
 
70
+
64
71
  [tool.hatch.version]
65
- path = "__version__.py"
72
+ path = "src/snowflake/snowpark_checkpoints_collector/__version__.py"
66
73
  pattern = '^__version__ = "(?P<version>.*)"'
67
74
  source = "regex"
68
75
 
@@ -74,15 +81,22 @@ where = ["src/"]
74
81
  dev-mode-dirs = ['src']
75
82
  directory = 'snowpark-checkpoints-collectors'
76
83
 
84
+ [[tool.hatch.sources]]
85
+ dir = "src/snowflake/snowpark_checkpoints_collector"
86
+ name = "snowpark-checkpoints-collectors"
87
+ type = "package"
88
+
77
89
  [tool.hatch.build.targets.wheel]
78
90
  directory = "dist"
79
- packages = ["snowpark-checkpoints-collectors/src/snowflake/snowpark_checkpoints_collector"]
91
+ packages = [
92
+ "src/snowflake",
93
+ ]
80
94
 
81
95
  [tool.hatch.build.targets.sdist]
82
96
  directory = "dist"
83
97
  exclude = ["/.github", "/.idea"]
84
98
  include = [
85
- 'src/',
99
+ 'src/**',
86
100
  'README.md',
87
101
  'LICENSE',
88
102
  'test/',
@@ -113,7 +127,6 @@ exclude_lines = [
113
127
  "if __name__ == .__main__.:",
114
128
  ]
115
129
 
116
-
117
130
  [tool.hatch.envs.linter.scripts]
118
131
  check = [
119
132
  'ruff check --fix .',
@@ -121,8 +134,10 @@ check = [
121
134
 
122
135
  [tool.hatch.envs.test.scripts]
123
136
  check = [
124
- "pip install -e ../snowpark-checkpoints-configuration" ,
125
- 'pytest -v --junitxml=test/outcome/test-results.xml --cov=. --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
137
+ 'python -m pip install --upgrade pip -q',
138
+ 'pip install -q -e ../snowpark-checkpoints-configuration',
139
+ 'pip list',
140
+ 'pytest -vvv --junitxml=test/outcome/test-results.xml --cov=. --cov-branch --cov-config=test/.coveragerc --cov-report=xml:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.xml {args:test} --cov-report=term --cov-report=html:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.html --cov-report=json:test/outcome/coverage-{matrix:python:{env:PYTHON_VERSION:unset}}.json',
126
141
  ]
127
142
 
128
143
  coverage = [
@@ -0,0 +1,30 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import logging
17
+
18
+
19
+ # Add a NullHandler to prevent logging messages from being output to
20
+ # sys.stderr if no logging configuration is provided.
21
+ logging.getLogger(__name__).addHandler(logging.NullHandler())
22
+
23
+ # ruff: noqa: E402
24
+
25
+ __all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
26
+
27
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
28
+ from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
29
+ collect_dataframe_checkpoint,
30
+ )
@@ -0,0 +1,16 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __version__ = "0.3.0"
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
 
5
16
  import locale
6
17
 
@@ -81,11 +92,13 @@ COLUMN_IS_UNIQUE_SIZE_KEY = "is_unique_size"
81
92
  COLUMN_KEY_TYPE_KEY = "key_type"
82
93
  COLUMN_MARGIN_ERROR_KEY = "margin_error"
83
94
  COLUMN_MAX_KEY = "max"
95
+ COLUMN_MAX_LENGTH_KEY = "max_length"
84
96
  COLUMN_MAX_SIZE_KEY = "max_size"
85
97
  COLUMN_MEAN_KEY = "mean"
86
98
  COLUMN_MEAN_SIZE_KEY = "mean_size"
87
99
  COLUMN_METADATA_KEY = "metadata"
88
100
  COLUMN_MIN_KEY = "min"
101
+ COLUMN_MIN_LENGTH_KEY = "min_length"
89
102
  COLUMN_MIN_SIZE_KEY = "min_size"
90
103
  COLUMN_NAME_KEY = "name"
91
104
  COLUMN_NULL_COUNT_KEY = "null_count"
@@ -95,6 +108,7 @@ COLUMN_ROWS_NULL_COUNT_KEY = "rows_null_count"
95
108
  COLUMN_SIZE_KEY = "size"
96
109
  COLUMN_TRUE_COUNT_KEY = "true_count"
97
110
  COLUMN_TYPE_KEY = "type"
111
+ COLUMN_VALUE_KEY = "value"
98
112
  COLUMN_VALUE_TYPE_KEY = "value_type"
99
113
  COLUMNS_KEY = "columns"
100
114
 
@@ -126,6 +140,8 @@ UNKNOWN_SOURCE_FILE = "unknown"
126
140
  UNKNOWN_LINE_OF_CODE = -1
127
141
  BACKSLASH_TOKEN = "\\"
128
142
  SLASH_TOKEN = "/"
143
+ PYSPARK_NONE_SIZE_VALUE = -1
144
+ PANDAS_LONG_TYPE = "Int64"
129
145
 
130
146
  # ENVIRONMENT VARIABLES
131
147
  SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR = (
@@ -0,0 +1,24 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __all__ = ["CollectionPointResult", "CollectionResult", "CollectionPointResultManager"]
17
+
18
+ from snowflake.snowpark_checkpoints_collector.collection_result.model.collection_point_result import (
19
+ CollectionPointResult,
20
+ CollectionResult,
21
+ )
22
+ from snowflake.snowpark_checkpoints_collector.collection_result.model.collection_point_result_manager import (
23
+ CollectionPointResultManager,
24
+ )
@@ -1,6 +1,17 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
4
15
  from datetime import datetime
5
16
  from enum import Enum
6
17
 
@@ -1,18 +1,35 @@
1
- #
2
- # Copyright (c) 2012-2024 Snowflake Computing Inc. All rights reserved.
3
- #
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
4
16
  import json
17
+ import logging
5
18
 
6
19
  from typing import Optional
7
20
 
8
21
  from snowflake.snowpark_checkpoints_collector.collection_result.model import (
9
22
  CollectionPointResult,
10
23
  )
24
+ from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
25
+ get_io_file_manager,
26
+ )
11
27
  from snowflake.snowpark_checkpoints_collector.singleton import Singleton
12
28
  from snowflake.snowpark_checkpoints_collector.utils import file_utils
13
29
 
14
30
 
15
31
  RESULTS_KEY = "results"
32
+ LOGGER = logging.getLogger(__name__)
16
33
 
17
34
 
18
35
  class CollectionPointResultManager(metaclass=Singleton):
@@ -38,6 +55,7 @@ class CollectionPointResultManager(metaclass=Singleton):
38
55
 
39
56
  """
40
57
  result_json = result.get_collection_result_data()
58
+ LOGGER.debug("Adding a new collection result: %s", result_json)
41
59
  self.result_collection.append(result_json)
42
60
  self._save_result()
43
61
 
@@ -54,5 +72,5 @@ class CollectionPointResultManager(metaclass=Singleton):
54
72
 
55
73
  def _save_result(self) -> None:
56
74
  result_collection_json = self.to_json()
57
- with open(self.output_file_path, "w") as f:
58
- f.write(result_collection_json)
75
+ LOGGER.info("Saving collection results to '%s'", self.output_file_path)
76
+ get_io_file_manager().write(self.output_file_path, result_collection_json)
@@ -0,0 +1,22 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ __all__ = [
17
+ "ColumnCollectorManager",
18
+ ]
19
+
20
+ from snowflake.snowpark_checkpoints_collector.column_collection.column_collector_manager import (
21
+ ColumnCollectorManager,
22
+ )