snowpark-checkpoints-collectors 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,9 +22,14 @@ logging.getLogger(__name__).addHandler(logging.NullHandler())
22
22
 
23
23
  # ruff: noqa: E402
24
24
 
25
- __all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
25
+ __all__ = [
26
+ "collect_dataframe_checkpoint",
27
+ "CheckpointMode",
28
+ "xcollect_dataframe_checkpoint",
29
+ ]
26
30
 
27
31
  from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
28
32
  from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
29
33
  collect_dataframe_checkpoint,
34
+ xcollect_dataframe_checkpoint,
30
35
  )
@@ -13,4 +13,4 @@
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
15
 
16
- __version__ = "0.3.0"
16
+ __version__ = "0.3.1"
@@ -30,6 +30,7 @@ CHECKPOINT_NAME_KEY = "checkpoint_name"
30
30
  class CollectionResult(Enum):
      """Outcome values that can be stored on a collection point result."""

      # NOTE(review): FAIL / PASS presumably mark an unsuccessful / successful
      # collection; the code that assigns them is not visible here — confirm.
      FAIL = "FAIL"
      PASS = "PASS"
      # Assigned by xcollect_dataframe_checkpoint, which records the checkpoint
      # without collecting anything from the DataFrame.
      SKIP = "SKIP"
33
34
 
34
35
 
35
36
  class CollectionPointResult:
@@ -117,11 +117,10 @@ def collect_dataframe_checkpoint(
117
117
  "Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
118
118
  )
119
119
  if not is_checkpoint_enabled(normalized_checkpoint_name):
120
- LOGGER.info(
121
- "Checkpoint '%s' is disabled. Skipping collection.",
122
- normalized_checkpoint_name,
120
+ raise Exception(
121
+ f"Checkpoint '{normalized_checkpoint_name}' is disabled. Please enable it in the checkpoints.json file.",
122
+ "In case you want to skip it, use the xcollect_dataframe_checkpoint method instead.",
123
123
  )
124
- return
125
124
 
126
125
  LOGGER.info("Starting to collect checkpoint '%s'", normalized_checkpoint_name)
127
126
  LOGGER.debug("DataFrame size: %s rows", df.count())
@@ -184,6 +183,68 @@ def collect_dataframe_checkpoint(
184
183
  collection_point_result_manager.add_result(collection_point_result)
185
184
 
186
185
 
186
@log
def xcollect_dataframe_checkpoint(
    df: SparkDataFrame,
    checkpoint_name: str,
    sample: Optional[float] = None,
    mode: Optional[CheckpointMode] = None,
    output_path: Optional[str] = None,
) -> None:
    """Record a DataFrame checkpoint as skipped without collecting any data.

    The checkpoint name is normalized and validated, a warning is logged, and a
    result marked ``CollectionResult.SKIP`` is registered for the call site.

    Args:
        df (SparkDataFrame): The Spark DataFrame whose collection is skipped.
            It is not read by this function.
        checkpoint_name (str): The name of the checkpoint.
        sample (float, optional): Not used by this function; accepted so the
            signature matches ``collect_dataframe_checkpoint``.
        mode (CheckpointMode, optional): Not used by this function; accepted so
            the signature matches ``collect_dataframe_checkpoint``.
        output_path (str, optional): Passed to ``CollectionPointResultManager``
            when the skip result is registered.

    Raises:
        Exception: Invalid checkpoint name. Checkpoint names must only contain
            alphanumeric characters, underscores and dollar signs.

    """
    normalized_name = checkpoint_name_utils.normalize_checkpoint_name(
        checkpoint_name
    )
    if checkpoint_name != normalized_name:
        LOGGER.warning(
            "Checkpoint name '%s' was normalized to '%s'",
            checkpoint_name,
            normalized_name,
        )

    # Guard clause: reject names that are still invalid after normalization.
    if not checkpoint_name_utils.is_valid_checkpoint_name(normalized_name):
        raise Exception(
            f"Invalid checkpoint name: {normalized_name}. "
            "Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
        )

    LOGGER.warning(
        "Checkpoint '%s' is disabled. Skipping collection.",
        normalized_name,
    )

    # The file_utils helpers inspect the call stack, so they are called directly
    # from this function body (no extra helper frames in between).
    source_file_path = file_utils.get_collection_point_source_file_path()
    source_line_of_code = file_utils.get_collection_point_line_of_code()
    skipped_result = CollectionPointResult(
        source_file_path,
        source_line_of_code,
        normalized_name,
    )
    skipped_result.result = CollectionResult.SKIP

    result_manager = CollectionPointResultManager(output_path)
    result_manager.add_result(skipped_result)
246
+
247
+
187
248
  @report_telemetry(params_list=["column_type_dict"])
188
249
  def _collect_dataframe_checkpoint_mode_schema(
189
250
  checkpoint_name: str,
@@ -14,6 +14,7 @@
14
14
  # limitations under the License.
15
15
  import inspect
16
16
  import os
17
+ import re
17
18
  import tempfile
18
19
 
19
20
  from typing import Optional
@@ -84,7 +85,10 @@ def get_collection_point_source_file_path() -> str:
84
85
 
85
86
  """
86
87
  try:
87
- collection_point_file_path = inspect.stack()[2].filename
88
+ stack_frame = _get_stack_frame()
89
+ if not stack_frame:
90
+ return UNKNOWN_SOURCE_FILE
91
+ collection_point_file_path = stack_frame.filename
88
92
  is_temporal_file_path = _is_temporal_path(collection_point_file_path)
89
93
  if is_temporal_file_path:
90
94
  ipynb_file_path_collection = _get_ipynb_file_path_collection()
@@ -100,15 +104,18 @@ def get_collection_point_source_file_path() -> str:
100
104
 
101
105
 
102
106
  def get_collection_point_line_of_code() -> int:
103
- """Find the line of code of the source file where collection point it is.
107
+ """Find the line of code of the source file where collection point is.
104
108
 
105
109
  Returns:
106
110
  int: returns the line of code of the source file where collection point it is.
107
111
 
108
112
  """
109
113
  try:
110
- collection_point_file_path = inspect.stack()[2].filename
111
- collection_point_line_of_code = inspect.stack()[2].lineno
114
+ stack_frame = _get_stack_frame()
115
+ if not stack_frame:
116
+ return UNKNOWN_LINE_OF_CODE
117
+ collection_point_file_path = stack_frame.filename
118
+ collection_point_line_of_code = stack_frame.lineno
112
119
  is_temporal_file_path = _is_temporal_path(collection_point_file_path)
113
120
  if is_temporal_file_path:
114
121
  collection_point_line_of_code = UNKNOWN_LINE_OF_CODE
@@ -124,6 +131,21 @@ def _is_temporal_path(path: str) -> bool:
124
131
  return is_temporal_path
125
132
 
126
133
 
134
def _get_stack_frame() -> Optional[inspect.FrameInfo]:
    """Find the calling frame whose source line mentions the collect function.

    Only the seven innermost frames of the current call stack are examined,
    scanning from the outermost of those towards this function. The first
    frame whose first context line contains ``collect_dataframe_checkpoint``
    is returned. The pattern is a plain substring search, so a line calling
    ``xcollect_dataframe_checkpoint`` matches as well.

    Returns:
        Optional[inspect.FrameInfo]: The matching frame, or None when none of
        the inspected frames has a source line containing the pattern.

    """
    batch = inspect.stack()[:7]
    batch.reverse()
    collect_frame_regex = r"(collect_dataframe_checkpoint)"

    for frame in batch:
        # code_context is None when no source is available for the frame; the
        # truthiness check also protects the [0] access against an empty list
        # (the previous ``len(...) >= 0`` test was always true).
        if frame.code_context and re.search(
            collect_frame_regex, frame.code_context[0]
        ):
            return frame
    return None
147
+
148
+
127
149
  def _get_ipynb_file_path_collection() -> list[str]:
128
150
  current_working_directory_path = get_io_file_manager().getcwd()
129
151
  cwd_file_name_collection = get_io_file_manager().ls(current_working_directory_path)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: snowpark-checkpoints-collectors
3
- Version: 0.3.0
3
+ Version: 0.3.1
4
4
  Summary: Snowpark column and table statistics collection
5
5
  Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
6
6
  Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -101,6 +101,28 @@ def collect_dataframe_checkpoint(
101
101
  - `output_path`: The output path to save the checkpoint, defaults to current working directory.
102
102
 
103
103
 
104
+ ### Skip DataFrame Checkpoint Collection
105
+
106
+
107
+
108
+ ```python
109
+ from pyspark.sql import DataFrame as SparkDataFrame
110
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
111
+ from typing import Optional
112
+
113
+ # Signature of the function
114
+ def xcollect_dataframe_checkpoint(
115
+ df: SparkDataFrame,
116
+ checkpoint_name: str,
117
+ sample: Optional[float] = None,
118
+ mode: Optional[CheckpointMode] = None,
119
+ output_path: Optional[str] = None,
120
+ ) -> None:
121
+ ...
122
+ ```
123
+
124
+ The signature of this function is the same as that of `collect_dataframe_checkpoint`.
125
+
104
126
  ## Usage Example
105
127
 
106
128
  ### Schema mode
@@ -1,10 +1,10 @@
1
- snowflake/snowpark_checkpoints_collector/__init__.py,sha256=GIESlH2W6g_qdcnyRqw9yjsvEkt0aniFvGixKlF4K7A,1096
2
- snowflake/snowpark_checkpoints_collector/__version__.py,sha256=kbbDnlkY7JOLNHvfWYkCO_mOBOV9GniMGdxYoQpLhyg,632
1
+ snowflake/snowpark_checkpoints_collector/__init__.py,sha256=g4NemuA6Mj4O2jkK0yLQ8sEV3owHiiJnBEz_OWvlW1I,1179
2
+ snowflake/snowpark_checkpoints_collector/__version__.py,sha256=uSRs7fRupFeQ-z3PtU_6qh6ry8YBaSAnEIAvLhJKUR8,632
3
3
  snowflake/snowpark_checkpoints_collector/collection_common.py,sha256=ff5vYffrTRjoJXZQvVQBaOlegAUj_vXBbl1IZidz8Qo,4510
4
4
  snowflake/snowpark_checkpoints_collector/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
5
- snowflake/snowpark_checkpoints_collector/summary_stats_collector.py,sha256=SD5MRF7zSDKXpxekMWdg5gO7ZcZr6Y548vkkKpG_jZs,14745
5
+ snowflake/snowpark_checkpoints_collector/summary_stats_collector.py,sha256=kRJpVRE9Iy_uqeIPT-__Aan-YLWxQbgSjkJ3w4LpvCc,17214
6
6
  snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py,sha256=jZzx29WzrjH7C_6ZsBGoe4PxbW_oM4uIjySS1axIM34,1000
7
- snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py,sha256=8xD9zGnFJ7Rz9RUXIys7JnV3kQD4mk8QwNOTxAihSjQ,2908
7
+ snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py,sha256=XelL7LughZpKl1B_6bJoKOc_PqQg3UleX6zdgVXqTus,2926
8
8
  snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py,sha256=EY6WIIXRbvkTYC4bQn7jFALHh7D2PirVoiLZ5Kq8dNs,2659
9
9
  snowflake/snowpark_checkpoints_collector/column_collection/__init__.py,sha256=hpTh1V7hqBSHxNUqISwfxdz-NLD-7oZEMLXDUuRsoOU,783
10
10
  snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py,sha256=Vav_vbiipHFIAdHxeQG4ZK1BAmWTi_18hBnVeIeXFRs,9670
@@ -34,10 +34,10 @@ snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py,sha25
34
34
  snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py,sha256=r3IPnmDMb8151PTgE4YojOhWnxWGPLyBWlgFvvhOfRY,7314
35
35
  snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py,sha256=Xc4k3JU6A96-79VFRR8NrNAUPeO3V1DEAhngg-hLlU4,1787
36
36
  snowflake/snowpark_checkpoints_collector/utils/extra_config.py,sha256=3kVf6WVA-EuyMpTO3ycTlXMSCHtytGtT6wkV4U2Hyjw,5195
37
- snowflake/snowpark_checkpoints_collector/utils/file_utils.py,sha256=C1gZmQHvLMgHMVc5kTTpcCaUPw5PtpajY_Uu18mMy6c,4515
37
+ snowflake/snowpark_checkpoints_collector/utils/file_utils.py,sha256=5ztlNCv9GdSktUvtdfydv86cCFcmSXCdD4axZXJrOQQ,5125
38
38
  snowflake/snowpark_checkpoints_collector/utils/logging_utils.py,sha256=yyi6X5DqKeTg0HRhvsH6ymYp2P0wbnyKIzI2RzrQS7k,2278
39
39
  snowflake/snowpark_checkpoints_collector/utils/telemetry.py,sha256=ueN9vM8j5YNax7jMcnEj_UrgGkoeMv_hJHVKjN7hiJE,32161
40
- snowpark_checkpoints_collectors-0.3.0.dist-info/METADATA,sha256=4nXrRjc1glZUTrb9J8brIHPzyrE43GRKNu7lrqfGMZU,6061
41
- snowpark_checkpoints_collectors-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
42
- snowpark_checkpoints_collectors-0.3.0.dist-info/licenses/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
43
- snowpark_checkpoints_collectors-0.3.0.dist-info/RECORD,,
40
+ snowpark_checkpoints_collectors-0.3.1.dist-info/METADATA,sha256=dPrRK0GO-5U3bQUntCpYyRfG3J_z-v6bV9hX2s8CxdM,6613
41
+ snowpark_checkpoints_collectors-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
42
+ snowpark_checkpoints_collectors-0.3.1.dist-info/licenses/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
43
+ snowpark_checkpoints_collectors-0.3.1.dist-info/RECORD,,