snowpark-checkpoints-collectors 0.3.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/PKG-INFO +23 -1
  2. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/README.md +22 -0
  3. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/__init__.py +6 -1
  4. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/__version__.py +1 -1
  5. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +1 -0
  6. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +65 -4
  7. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +26 -4
  8. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_file_utils.py +30 -16
  9. snowpark_checkpoints_collectors-0.3.1/test/unit/test_summary_stats_collector.py +100 -0
  10. snowpark_checkpoints_collectors-0.3.0/test/unit/test_summary_stats_collector.py +0 -70
  11. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/.gitignore +0 -0
  12. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/CHANGELOG.md +0 -0
  13. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/LICENSE +0 -0
  14. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/pyproject.toml +0 -0
  15. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/snowpark-testdf-schema.json +0 -0
  16. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +0 -0
  17. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
  18. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +0 -0
  19. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
  20. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +0 -0
  21. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
  22. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +0 -0
  23. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +0 -0
  24. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -0
  25. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +0 -0
  26. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -0
  27. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -0
  28. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -0
  29. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +0 -0
  30. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +0 -0
  31. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -0
  32. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +0 -0
  33. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -0
  34. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +0 -0
  35. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +0 -0
  36. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +0 -0
  37. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
  38. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +0 -0
  39. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/io_utils/__init__.py +0 -0
  40. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/io_utils/io_default_strategy.py +0 -0
  41. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/io_utils/io_env_strategy.py +0 -0
  42. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/io_utils/io_file_manager.py +0 -0
  43. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
  44. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
  45. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +0 -0
  46. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -0
  47. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +0 -0
  48. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/logging_utils.py +0 -0
  49. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -0
  50. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/.coveragerc +0 -0
  51. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/telemetry_compare_utils.py +0 -0
  52. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_checkpoint_name.py +0 -0
  53. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1.py +0 -0
  54. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -0
  55. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -0
  56. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -0
  57. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -0
  58. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -0
  59. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -0
  60. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
  61. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -0
  62. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
  63. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -0
  64. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
  65. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -0
  66. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -0
  67. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -0
  68. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +0 -0
  69. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -0
  70. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_io_strategy.json +0 -0
  71. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2.py +0 -0
  72. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +0 -0
  73. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +0 -0
  74. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +0 -0
  75. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collection_result_file.py +0 -0
  76. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_snow_connection_int.py +0 -0
  77. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/io_utils/test_default_strategy.py +0 -0
  78. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_checkpoint_name_utils.py +0 -0
  79. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_collection_point_result.py +0 -0
  80. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_collection_point_result_manager.py +0 -0
  81. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_column_collection.py +0 -0
  82. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_extra_config.py +0 -0
  83. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_logger.py +0 -0
  84. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_logging_utils.py +0 -0
  85. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_pandera_column_check_manager.py +0 -0
  86. {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_snow_connection.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: snowpark-checkpoints-collectors
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Snowpark column and table statistics collection
5
5
  Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
6
6
  Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -101,6 +101,28 @@ def collect_dataframe_checkpoint(
101
101
  - `output_path`: The output path to save the checkpoint, defaults to current working directory.
102
102
 
103
103
 
104
+ ### Skip DataFrame Checkpoint Collection
105
+
106
+
107
+
108
+ ```python
109
+ from pyspark.sql import DataFrame as SparkDataFrame
110
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
111
+ from typing import Optional
112
+
113
+ # Signature of the function
114
+ def xcollect_dataframe_checkpoint(
115
+ df: SparkDataFrame,
116
+ checkpoint_name: str,
117
+ sample: Optional[float] = None,
118
+ mode: Optional[CheckpointMode] = None,
119
+ output_path: Optional[str] = None,
120
+ ) -> None:
121
+ ...
122
+ ```
123
+
124
+ The signature of this method is the same as that of `collect_dataframe_checkpoint`.
125
+
104
126
  ## Usage Example
105
127
 
106
128
  ### Schema mode
@@ -54,6 +54,28 @@ def collect_dataframe_checkpoint(
54
54
  - `output_path`: The output path to save the checkpoint, defaults to current working directory.
55
55
 
56
56
 
57
+ ### Skip DataFrame Checkpoint Collection
58
+
59
+
60
+
61
+ ```python
62
+ from pyspark.sql import DataFrame as SparkDataFrame
63
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
64
+ from typing import Optional
65
+
66
+ # Signature of the function
67
+ def xcollect_dataframe_checkpoint(
68
+ df: SparkDataFrame,
69
+ checkpoint_name: str,
70
+ sample: Optional[float] = None,
71
+ mode: Optional[CheckpointMode] = None,
72
+ output_path: Optional[str] = None,
73
+ ) -> None:
74
+ ...
75
+ ```
76
+
77
+ The signature of this method is the same as that of `collect_dataframe_checkpoint`.
78
+
57
79
  ## Usage Example
58
80
 
59
81
  ### Schema mode
@@ -22,9 +22,14 @@ logging.getLogger(__name__).addHandler(logging.NullHandler())
22
22
 
23
23
  # ruff: noqa: E402
24
24
 
25
- __all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
25
+ __all__ = [
26
+ "collect_dataframe_checkpoint",
27
+ "CheckpointMode",
28
+ "xcollect_dataframe_checkpoint",
29
+ ]
26
30
 
27
31
  from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
28
32
  from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
29
33
  collect_dataframe_checkpoint,
34
+ xcollect_dataframe_checkpoint,
30
35
  )
@@ -13,4 +13,4 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- __version__ = "0.3.0"
16
+ __version__ = "0.3.1"
@@ -30,6 +30,7 @@ CHECKPOINT_NAME_KEY = "checkpoint_name"
30
30
  class CollectionResult(Enum):
31
31
  FAIL = "FAIL"
32
32
  PASS = "PASS"
33
+ SKIP = "SKIP"
33
34
 
34
35
 
35
36
  class CollectionPointResult:
@@ -117,11 +117,10 @@ def collect_dataframe_checkpoint(
117
117
  "Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
118
118
  )
119
119
  if not is_checkpoint_enabled(normalized_checkpoint_name):
120
- LOGGER.info(
121
- "Checkpoint '%s' is disabled. Skipping collection.",
122
- normalized_checkpoint_name,
120
+ raise Exception(
121
+ f"Checkpoint '{normalized_checkpoint_name}' is disabled. Please enable it in the checkpoints.json file.",
122
+ "In case you want to skip it, use the xcollect_dataframe_checkpoint method instead.",
123
123
  )
124
- return
125
124
 
126
125
  LOGGER.info("Starting to collect checkpoint '%s'", normalized_checkpoint_name)
127
126
  LOGGER.debug("DataFrame size: %s rows", df.count())
@@ -184,6 +183,68 @@ def collect_dataframe_checkpoint(
184
183
  collection_point_result_manager.add_result(collection_point_result)
185
184
 
186
185
 
186
@log
def xcollect_dataframe_checkpoint(
    df: SparkDataFrame,
    checkpoint_name: str,
    sample: Optional[float] = None,
    mode: Optional[CheckpointMode] = None,
    output_path: Optional[str] = None,
) -> None:
    """Record a checkpoint as skipped instead of collecting it.

    The parameters mirror those of the regular collector. The body normalizes
    and validates the checkpoint name, logs a warning, and registers a result
    whose status is ``CollectionResult.SKIP``.

    Args:
        df (SparkDataFrame): The Spark DataFrame of the checkpoint. Not used by
            the body of this function.
        checkpoint_name (str): The name of the checkpoint. It is normalized
            before being validated and recorded.
        sample (float, optional): Not used by the body of this function.
        mode (CheckpointMode, optional): Not used by the body of this function.
        output_path (str, optional): Forwarded to ``CollectionPointResultManager``.

    Raises:
        Exception: Invalid checkpoint name. Checkpoint names must only contain
            alphanumeric characters, underscores and dollar signs.

    """
    normalized_checkpoint_name = checkpoint_name_utils.normalize_checkpoint_name(
        checkpoint_name
    )
    name_was_changed = normalized_checkpoint_name != checkpoint_name
    if name_was_changed:
        LOGGER.warning(
            "Checkpoint name '%s' was normalized to '%s'",
            checkpoint_name,
            normalized_checkpoint_name,
        )

    if not checkpoint_name_utils.is_valid_checkpoint_name(normalized_checkpoint_name):
        raise Exception(
            f"Invalid checkpoint name: {normalized_checkpoint_name}. "
            "Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
        )

    LOGGER.warning(
        "Checkpoint '%s' is disabled. Skipping collection.",
        normalized_checkpoint_name,
    )

    # Both lookups inspect the call stack, so they are called directly from
    # this function rather than through an extra helper frame.
    source_file = file_utils.get_collection_point_source_file_path()
    source_line = file_utils.get_collection_point_line_of_code()

    skipped_result = CollectionPointResult(
        source_file,
        source_line,
        normalized_checkpoint_name,
    )
    skipped_result.result = CollectionResult.SKIP

    CollectionPointResultManager(output_path).add_result(skipped_result)
246
+
247
+
187
248
  @report_telemetry(params_list=["column_type_dict"])
188
249
  def _collect_dataframe_checkpoint_mode_schema(
189
250
  checkpoint_name: str,
@@ -14,6 +14,7 @@
14
14
  # limitations under the License.
15
15
  import inspect
16
16
  import os
17
+ import re
17
18
  import tempfile
18
19
 
19
20
  from typing import Optional
@@ -84,7 +85,10 @@ def get_collection_point_source_file_path() -> str:
84
85
 
85
86
  """
86
87
  try:
87
- collection_point_file_path = inspect.stack()[2].filename
88
+ stack_frame = _get_stack_frame()
89
+ if not stack_frame:
90
+ return UNKNOWN_SOURCE_FILE
91
+ collection_point_file_path = stack_frame.filename
88
92
  is_temporal_file_path = _is_temporal_path(collection_point_file_path)
89
93
  if is_temporal_file_path:
90
94
  ipynb_file_path_collection = _get_ipynb_file_path_collection()
@@ -100,15 +104,18 @@ def get_collection_point_source_file_path() -> str:
100
104
 
101
105
 
102
106
  def get_collection_point_line_of_code() -> int:
103
- """Find the line of code of the source file where collection point it is.
107
+ """Find the line of code of the source file where collection point is.
104
108
 
105
109
  Returns:
106
110
  int: returns the line of code of the source file where collection point it is.
107
111
 
108
112
  """
109
113
  try:
110
- collection_point_file_path = inspect.stack()[2].filename
111
- collection_point_line_of_code = inspect.stack()[2].lineno
114
+ stack_frame = _get_stack_frame()
115
+ if not stack_frame:
116
+ return UNKNOWN_LINE_OF_CODE
117
+ collection_point_file_path = stack_frame.filename
118
+ collection_point_line_of_code = stack_frame.lineno
112
119
  is_temporal_file_path = _is_temporal_path(collection_point_file_path)
113
120
  if is_temporal_file_path:
114
121
  collection_point_line_of_code = UNKNOWN_LINE_OF_CODE
@@ -124,6 +131,21 @@ def _is_temporal_path(path: str) -> bool:
124
131
  return is_temporal_path
125
132
 
126
133
 
134
def _get_stack_frame() -> Optional[inspect.FrameInfo]:
    """Locate the stack frame whose current source line calls a checkpoint collector.

    Inspects the seven innermost frames of the current call stack, walking them
    from the outermost to the innermost, and returns the first frame whose
    current source line contains the text ``collect_dataframe_checkpoint``
    (which also matches ``xcollect_dataframe_checkpoint``).

    Returns:
        Optional[inspect.FrameInfo]: The matching frame, or ``None`` when none
            of the inspected frames has a source line containing that text.

    """
    collector_call_marker = "collect_dataframe_checkpoint"

    for frame in reversed(inspect.stack()[:7]):
        # code_context is None when the source is unavailable; an empty list
        # has no line to inspect either, so both cases are skipped instead of
        # indexing into them.
        if frame.code_context and collector_call_marker in frame.code_context[0]:
            return frame
    return None
147
+
148
+
127
149
  def _get_ipynb_file_path_collection() -> list[str]:
128
150
  current_working_directory_path = get_io_file_manager().getcwd()
129
151
  cwd_file_name_collection = get_io_file_manager().ls(current_working_directory_path)
@@ -54,25 +54,35 @@ def test_get_output_file_path_create():
54
54
 
55
55
 
56
56
  def test_get_collection_point_source_file_path_scenario_python_source_file():
57
- collection_point_source_file_path = (
58
- file_utils.get_collection_point_source_file_path()
59
- )
60
- assert collection_point_source_file_path != UNKNOWN_SOURCE_FILE
57
+ with mock.patch(
58
+ "snowflake.snowpark_checkpoints_collector.utils.file_utils._get_stack_frame",
59
+ return_value=mock.MagicMock(filename="abc.py", lineno=1),
60
+ ):
61
+ collection_point_source_file_path = (
62
+ file_utils.get_collection_point_source_file_path()
63
+ )
64
+ assert collection_point_source_file_path != UNKNOWN_SOURCE_FILE
61
65
 
62
66
 
63
67
  def test_get_collection_point_source_file_path_scenario_notebook_source_file():
64
- with mock.patch(
65
- "snowflake.snowpark_checkpoints_collector.utils.file_utils._is_temporal_path",
66
- return_value=True,
67
- ):
68
- with mock.patch(
68
+ with (
69
+ mock.patch(
70
+ "snowflake.snowpark_checkpoints_collector.utils.file_utils._is_temporal_path",
71
+ return_value=True,
72
+ ),
73
+ mock.patch(
74
+ "snowflake.snowpark_checkpoints_collector.utils.file_utils._get_stack_frame",
75
+ return_value=mock.MagicMock(filename="abc.ipynb", lineno=1),
76
+ ),
77
+ mock.patch(
69
78
  "snowflake.snowpark_checkpoints_collector.utils.file_utils._get_ipynb_file_path_collection",
70
79
  return_value=["abc.ipynb"],
71
- ):
72
- collection_point_source_file_path = (
73
- file_utils.get_collection_point_source_file_path()
74
- )
75
- assert collection_point_source_file_path != UNKNOWN_SOURCE_FILE
80
+ ),
81
+ ):
82
+ collection_point_source_file_path = (
83
+ file_utils.get_collection_point_source_file_path()
84
+ )
85
+ assert collection_point_source_file_path != UNKNOWN_SOURCE_FILE
76
86
 
77
87
 
78
88
  def test_get_collection_point_source_file_path_scenario_unknown_source_file():
@@ -102,8 +112,12 @@ def test_get_collection_point_source_file_path_scenario_exception():
102
112
 
103
113
 
104
114
  def test_get_collection_point_line_of_code_scenario_python_source_file():
105
- collection_point_line_of_code = file_utils.get_collection_point_line_of_code()
106
- assert collection_point_line_of_code != UNKNOWN_LINE_OF_CODE
115
+ with mock.patch(
116
+ "snowflake.snowpark_checkpoints_collector.utils.file_utils._get_stack_frame",
117
+ return_value=mock.MagicMock(filename=__file__, lineno=1),
118
+ ):
119
+ collection_point_line_of_code = file_utils.get_collection_point_line_of_code()
120
+ assert collection_point_line_of_code != UNKNOWN_LINE_OF_CODE
107
121
 
108
122
 
109
123
  def test_get_collection_point_line_of_code_scenario_notebook_source_file():
@@ -0,0 +1,100 @@
1
+ # Copyright 2025 Snowflake Inc.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import logging
17
+ import os
18
+ import tempfile
19
+
20
+ from datetime import datetime
21
+ from typing import get_type_hints
22
+ from unittest.mock import MagicMock, patch
23
+
24
+ import pytest
25
+
26
+ from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
27
+ collect_dataframe_checkpoint,
28
+ generate_parquet_for_spark_df,
29
+ xcollect_dataframe_checkpoint,
30
+ )
31
+
32
+
33
def test_generate_parquet_for_spark_df_exception():
    """Test that generate_parquet_for_spark_df raises when no parquet files are generated."""
    # The DataFrame handed to the function is the mock returned by
    # createDataFrame(); a separately configured mock would be overwritten
    # before use, so none is created here.
    spark = MagicMock()
    spark_df = spark.createDataFrame()
    parquet_directory = os.path.join(
        tempfile.gettempdir(),
        f"test_spark_df_checkpoint_{datetime.now().strftime('%Y%m%d%H%M%S')}",
    )

    with pytest.raises(Exception, match="No parquet files were generated."):
        generate_parquet_for_spark_df(spark_df, parquet_directory)
46
+
47
+
48
def test_collect_dataframe_checkpoint_disabled_checkpoint(
    caplog: pytest.LogCaptureFixture,
):
    """Test that collect_dataframe_checkpoint raises when the checkpoint is disabled."""
    pyspark_df = MagicMock()
    checkpoint_name = "my_checkpoint"
    module_name = "snowflake.snowpark_checkpoints_collector.summary_stats_collector"
    expected_exception_error_msg = "Checkpoint 'my_checkpoint' is disabled. Please enable it in the checkpoints.json file."
    expected_fix_suggestion_msg = "In case you want to skip it, use the xcollect_dataframe_checkpoint method instead."

    # pytest.raises fails the test when nothing is raised. A bare try/except
    # would let the test pass without running a single assertion in that case.
    with (
        caplog.at_level(
            level=logging.INFO,
            logger=module_name,
        ),
        patch(
            f"{module_name}.is_checkpoint_enabled",
            return_value=False,
        ) as mock_is_checkpoint_enabled,
        pytest.raises(Exception) as exc_info,
    ):
        collect_dataframe_checkpoint(pyspark_df, checkpoint_name)

    mock_is_checkpoint_enabled.assert_called_once_with(checkpoint_name)
    # The exception carries the error and the suggested fix as two separate args.
    error_msg = exc_info.value.args[0]
    fix_suggestion_msg = exc_info.value.args[1]
    assert error_msg == expected_exception_error_msg
    assert fix_suggestion_msg == expected_fix_suggestion_msg
75
+
76
+
77
def test_skip_collector_parameters_commutability():
    """Check that both collectors declare the same parameter type hints."""

    def _parameter_hints(func):
        # Drop the return annotation so only the parameters are compared.
        hints = dict(get_type_hints(func))
        hints.pop("return", None)
        return hints

    collect_params = _parameter_hints(collect_dataframe_checkpoint)
    x_collect_params = _parameter_hints(xcollect_dataframe_checkpoint)

    assert (
        collect_params == x_collect_params
    ), "The parameters of collect_dataframe_checkpoint and xcollect_dataframe_checkpoint must be the same."
90
+
91
+
92
def test_skip_collector_return_type_commutability():
    """Check that both collectors declare the same return type hint."""
    collect_return, x_collect_return = (
        get_type_hints(func).get("return")
        for func in (collect_dataframe_checkpoint, xcollect_dataframe_checkpoint)
    )

    assert (
        collect_return == x_collect_return
    ), "The return type of collect_dataframe_checkpoint and xcollect_dataframe_checkpoint must be the same."
@@ -1,70 +0,0 @@
1
- # Copyright 2025 Snowflake Inc.
2
- # SPDX-License-Identifier: Apache-2.0
3
-
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
-
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
-
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import logging
17
- import os
18
- import tempfile
19
-
20
- from datetime import datetime
21
- from unittest.mock import MagicMock, patch
22
-
23
- import pytest
24
-
25
- from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
26
- collect_dataframe_checkpoint,
27
- generate_parquet_for_spark_df,
28
- )
29
-
30
-
31
- def test_generate_parquet_for_spark_df_exception():
32
- spark = MagicMock()
33
- spark_df = MagicMock()
34
- spark_df.dtypes = []
35
- spark_df.select = MagicMock()
36
- spark_df = spark.createDataFrame()
37
- parquet_directory = os.path.join(
38
- tempfile.gettempdir(),
39
- f"test_spark_df_checkpoint_{datetime.now().strftime('%Y%m%d%H%M%S')}",
40
- )
41
-
42
- with pytest.raises(Exception, match="No parquet files were generated."):
43
- generate_parquet_for_spark_df(spark_df, parquet_directory)
44
-
45
-
46
- def test_collect_dataframe_checkpoint_disabled_checkpoint(
47
- caplog: pytest.LogCaptureFixture,
48
- ):
49
- """Test that collect_dataframe_checkpoint logs a message when the checkpoint is disabled."""
50
- pyspark_df = MagicMock()
51
- checkpoint_name = "my_checkpoint"
52
- module_name = "snowflake.snowpark_checkpoints_collector.summary_stats_collector"
53
- expected_log_msg = (
54
- f"Checkpoint '{checkpoint_name}' is disabled. Skipping collection."
55
- )
56
-
57
- with (
58
- caplog.at_level(
59
- level=logging.INFO,
60
- logger=module_name,
61
- ),
62
- patch(
63
- f"{module_name}.is_checkpoint_enabled",
64
- return_value=False,
65
- ) as mock_is_checkpoint_enabled,
66
- ):
67
- collect_dataframe_checkpoint(pyspark_df, checkpoint_name)
68
-
69
- mock_is_checkpoint_enabled.assert_called_once_with(checkpoint_name)
70
- assert expected_log_msg in caplog.messages