snowpark-checkpoints-collectors 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_checkpoints_collector/__init__.py +6 -1
- snowflake/snowpark_checkpoints_collector/__version__.py +1 -1
- snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +1 -0
- snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +65 -4
- snowflake/snowpark_checkpoints_collector/utils/file_utils.py +26 -4
- {snowpark_checkpoints_collectors-0.3.0.dist-info → snowpark_checkpoints_collectors-0.3.1.dist-info}/METADATA +23 -1
- {snowpark_checkpoints_collectors-0.3.0.dist-info → snowpark_checkpoints_collectors-0.3.1.dist-info}/RECORD +9 -9
- {snowpark_checkpoints_collectors-0.3.0.dist-info → snowpark_checkpoints_collectors-0.3.1.dist-info}/WHEEL +0 -0
- {snowpark_checkpoints_collectors-0.3.0.dist-info → snowpark_checkpoints_collectors-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -22,9 +22,14 @@ logging.getLogger(__name__).addHandler(logging.NullHandler())
|
|
22
22
|
|
23
23
|
# ruff: noqa: E402
|
24
24
|
|
25
|
-
__all__ = [
|
25
|
+
__all__ = [
|
26
|
+
"collect_dataframe_checkpoint",
|
27
|
+
"CheckpointMode",
|
28
|
+
"xcollect_dataframe_checkpoint",
|
29
|
+
]
|
26
30
|
|
27
31
|
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
28
32
|
from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
|
29
33
|
collect_dataframe_checkpoint,
|
34
|
+
xcollect_dataframe_checkpoint,
|
30
35
|
)
|
@@ -117,11 +117,10 @@ def collect_dataframe_checkpoint(
|
|
117
117
|
"Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
|
118
118
|
)
|
119
119
|
if not is_checkpoint_enabled(normalized_checkpoint_name):
|
120
|
-
|
121
|
-
"Checkpoint '
|
122
|
-
|
120
|
+
raise Exception(
|
121
|
+
f"Checkpoint '{normalized_checkpoint_name}' is disabled. Please enable it in the checkpoints.json file.",
|
122
|
+
"In case you want to skip it, use the xcollect_dataframe_checkpoint method instead.",
|
123
123
|
)
|
124
|
-
return
|
125
124
|
|
126
125
|
LOGGER.info("Starting to collect checkpoint '%s'", normalized_checkpoint_name)
|
127
126
|
LOGGER.debug("DataFrame size: %s rows", df.count())
|
@@ -184,6 +183,68 @@ def collect_dataframe_checkpoint(
|
|
184
183
|
collection_point_result_manager.add_result(collection_point_result)
|
185
184
|
|
186
185
|
|
186
|
+
@log
def xcollect_dataframe_checkpoint(
    df: SparkDataFrame,
    checkpoint_name: str,
    sample: Optional[float] = None,
    mode: Optional[CheckpointMode] = None,
    output_path: Optional[str] = None,
) -> None:
    """Record a checkpoint as skipped instead of collecting it.

    The parameter list matches ``collect_dataframe_checkpoint``. This function
    never reads ``df``, ``sample`` or ``mode``: it only normalizes and validates
    the checkpoint name, logs that the checkpoint is being skipped, and
    registers a result flagged ``CollectionResult.SKIP``.

    Args:
        df (SparkDataFrame): The Spark DataFrame whose collection is skipped.
            Not used by this function.
        checkpoint_name (str): The name of the checkpoint. It is normalized
            before being validated and recorded.
        sample (float, optional): Not used by this function.
        mode (CheckpointMode, optional): Not used by this function.
        output_path (str, optional): Passed unchanged to
            ``CollectionPointResultManager``.

    Raises:
        Exception: The normalized checkpoint name is invalid. Checkpoint names
            must only contain alphanumeric characters, underscores and dollar
            signs.

    """
    name = checkpoint_name_utils.normalize_checkpoint_name(checkpoint_name)
    if name != checkpoint_name:
        # Tell the user that the recorded name differs from the one they passed.
        LOGGER.warning(
            "Checkpoint name '%s' was normalized to '%s'",
            checkpoint_name,
            name,
        )

    if not checkpoint_name_utils.is_valid_checkpoint_name(name):
        raise Exception(
            f"Invalid checkpoint name: {name}. "
            "Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
        )

    LOGGER.warning(
        "Checkpoint '%s' is disabled. Skipping collection.",
        name,
    )

    # Resolve the call-site location first, then build the result from it.
    source_file = file_utils.get_collection_point_source_file_path()
    source_line = file_utils.get_collection_point_line_of_code()
    skipped = CollectionPointResult(source_file, source_line, name)
    skipped.result = CollectionResult.SKIP

    manager = CollectionPointResultManager(output_path)
    manager.add_result(skipped)
|
246
|
+
|
247
|
+
|
187
248
|
@report_telemetry(params_list=["column_type_dict"])
|
188
249
|
def _collect_dataframe_checkpoint_mode_schema(
|
189
250
|
checkpoint_name: str,
|
@@ -14,6 +14,7 @@
|
|
14
14
|
# limitations under the License.
|
15
15
|
import inspect
|
16
16
|
import os
|
17
|
+
import re
|
17
18
|
import tempfile
|
18
19
|
|
19
20
|
from typing import Optional
|
@@ -84,7 +85,10 @@ def get_collection_point_source_file_path() -> str:
|
|
84
85
|
|
85
86
|
"""
|
86
87
|
try:
|
87
|
-
|
88
|
+
stack_frame = _get_stack_frame()
|
89
|
+
if not stack_frame:
|
90
|
+
return UNKNOWN_SOURCE_FILE
|
91
|
+
collection_point_file_path = stack_frame.filename
|
88
92
|
is_temporal_file_path = _is_temporal_path(collection_point_file_path)
|
89
93
|
if is_temporal_file_path:
|
90
94
|
ipynb_file_path_collection = _get_ipynb_file_path_collection()
|
@@ -100,15 +104,18 @@ def get_collection_point_source_file_path() -> str:
|
|
100
104
|
|
101
105
|
|
102
106
|
def get_collection_point_line_of_code() -> int:
|
103
|
-
"""Find the line of code of the source file where collection point
|
107
|
+
"""Find the line of code of the source file where collection point is.
|
104
108
|
|
105
109
|
Returns:
|
106
110
|
int: The line of code in the source file where the collection point is located.
|
107
111
|
|
108
112
|
"""
|
109
113
|
try:
|
110
|
-
|
111
|
-
|
114
|
+
stack_frame = _get_stack_frame()
|
115
|
+
if not stack_frame:
|
116
|
+
return UNKNOWN_LINE_OF_CODE
|
117
|
+
collection_point_file_path = stack_frame.filename
|
118
|
+
collection_point_line_of_code = stack_frame.lineno
|
112
119
|
is_temporal_file_path = _is_temporal_path(collection_point_file_path)
|
113
120
|
if is_temporal_file_path:
|
114
121
|
collection_point_line_of_code = UNKNOWN_LINE_OF_CODE
|
@@ -124,6 +131,21 @@ def _is_temporal_path(path: str) -> bool:
|
|
124
131
|
return is_temporal_path
|
125
132
|
|
126
133
|
|
134
|
+
def _get_stack_frame() -> Optional[inspect.FrameInfo]:
    """Find the stack frame whose current source line calls a collect function.

    Only the seven innermost frames of the current call stack are inspected.
    They are scanned from the outermost of those seven towards this function,
    and the first frame whose current source line contains the text
    ``collect_dataframe_checkpoint`` is returned. Because the pattern is a
    plain substring search, ``xcollect_dataframe_checkpoint`` matches as well.

    Returns:
        Optional[inspect.FrameInfo]: The matching frame, or ``None`` when none
        of the inspected frames has a matching source line.

    """
    collect_frame_regex = r"(collect_dataframe_checkpoint)"

    # Scan outermost-first, so the outermost matching frame is returned.
    for frame in reversed(inspect.stack()[:7]):
        # code_context is None when no source is available and may also be an
        # empty list. The truthiness check covers both cases; the previous
        # `len(...) >= 0` test was always true and let an empty list reach
        # the [0] lookup.
        if frame.code_context and re.search(
            collect_frame_regex, frame.code_context[0]
        ):
            return frame
    return None
|
147
|
+
|
148
|
+
|
127
149
|
def _get_ipynb_file_path_collection() -> list[str]:
|
128
150
|
current_working_directory_path = get_io_file_manager().getcwd()
|
129
151
|
cwd_file_name_collection = get_io_file_manager().ls(current_working_directory_path)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: snowpark-checkpoints-collectors
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: Snowpark column and table statistics collection
|
5
5
|
Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
|
6
6
|
Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
|
@@ -101,6 +101,28 @@ def collect_dataframe_checkpoint(
|
|
101
101
|
- `output_path`: The output path to save the checkpoint, defaults to current working directory.
|
102
102
|
|
103
103
|
|
104
|
+
### Skip DataFrame Checkpoint Collection
|
105
|
+
|
106
|
+
|
107
|
+
|
108
|
+
```python
|
109
|
+
from pyspark.sql import DataFrame as SparkDataFrame
|
110
|
+
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
111
|
+
from typing import Optional
|
112
|
+
|
113
|
+
# Signature of the function
|
114
|
+
def xcollect_dataframe_checkpoint(
|
115
|
+
df: SparkDataFrame,
|
116
|
+
checkpoint_name: str,
|
117
|
+
sample: Optional[float] = None,
|
118
|
+
mode: Optional[CheckpointMode] = None,
|
119
|
+
output_path: Optional[str] = None,
|
120
|
+
) -> None:
|
121
|
+
...
|
122
|
+
```
|
123
|
+
|
124
|
+
The signature of the method is the same as that of `collect_dataframe_checkpoint`.
|
125
|
+
|
104
126
|
## Usage Example
|
105
127
|
|
106
128
|
### Schema mode
|
@@ -1,10 +1,10 @@
|
|
1
|
-
snowflake/snowpark_checkpoints_collector/__init__.py,sha256=
|
2
|
-
snowflake/snowpark_checkpoints_collector/__version__.py,sha256=
|
1
|
+
snowflake/snowpark_checkpoints_collector/__init__.py,sha256=g4NemuA6Mj4O2jkK0yLQ8sEV3owHiiJnBEz_OWvlW1I,1179
|
2
|
+
snowflake/snowpark_checkpoints_collector/__version__.py,sha256=uSRs7fRupFeQ-z3PtU_6qh6ry8YBaSAnEIAvLhJKUR8,632
|
3
3
|
snowflake/snowpark_checkpoints_collector/collection_common.py,sha256=ff5vYffrTRjoJXZQvVQBaOlegAUj_vXBbl1IZidz8Qo,4510
|
4
4
|
snowflake/snowpark_checkpoints_collector/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
|
5
|
-
snowflake/snowpark_checkpoints_collector/summary_stats_collector.py,sha256=
|
5
|
+
snowflake/snowpark_checkpoints_collector/summary_stats_collector.py,sha256=kRJpVRE9Iy_uqeIPT-__Aan-YLWxQbgSjkJ3w4LpvCc,17214
|
6
6
|
snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py,sha256=jZzx29WzrjH7C_6ZsBGoe4PxbW_oM4uIjySS1axIM34,1000
|
7
|
-
snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py,sha256=
|
7
|
+
snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py,sha256=XelL7LughZpKl1B_6bJoKOc_PqQg3UleX6zdgVXqTus,2926
|
8
8
|
snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py,sha256=EY6WIIXRbvkTYC4bQn7jFALHh7D2PirVoiLZ5Kq8dNs,2659
|
9
9
|
snowflake/snowpark_checkpoints_collector/column_collection/__init__.py,sha256=hpTh1V7hqBSHxNUqISwfxdz-NLD-7oZEMLXDUuRsoOU,783
|
10
10
|
snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py,sha256=Vav_vbiipHFIAdHxeQG4ZK1BAmWTi_18hBnVeIeXFRs,9670
|
@@ -34,10 +34,10 @@ snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py,sha25
|
|
34
34
|
snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py,sha256=r3IPnmDMb8151PTgE4YojOhWnxWGPLyBWlgFvvhOfRY,7314
|
35
35
|
snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py,sha256=Xc4k3JU6A96-79VFRR8NrNAUPeO3V1DEAhngg-hLlU4,1787
|
36
36
|
snowflake/snowpark_checkpoints_collector/utils/extra_config.py,sha256=3kVf6WVA-EuyMpTO3ycTlXMSCHtytGtT6wkV4U2Hyjw,5195
|
37
|
-
snowflake/snowpark_checkpoints_collector/utils/file_utils.py,sha256=
|
37
|
+
snowflake/snowpark_checkpoints_collector/utils/file_utils.py,sha256=5ztlNCv9GdSktUvtdfydv86cCFcmSXCdD4axZXJrOQQ,5125
|
38
38
|
snowflake/snowpark_checkpoints_collector/utils/logging_utils.py,sha256=yyi6X5DqKeTg0HRhvsH6ymYp2P0wbnyKIzI2RzrQS7k,2278
|
39
39
|
snowflake/snowpark_checkpoints_collector/utils/telemetry.py,sha256=ueN9vM8j5YNax7jMcnEj_UrgGkoeMv_hJHVKjN7hiJE,32161
|
40
|
-
snowpark_checkpoints_collectors-0.3.
|
41
|
-
snowpark_checkpoints_collectors-0.3.
|
42
|
-
snowpark_checkpoints_collectors-0.3.
|
43
|
-
snowpark_checkpoints_collectors-0.3.
|
40
|
+
snowpark_checkpoints_collectors-0.3.1.dist-info/METADATA,sha256=dPrRK0GO-5U3bQUntCpYyRfG3J_z-v6bV9hX2s8CxdM,6613
|
41
|
+
snowpark_checkpoints_collectors-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
42
|
+
snowpark_checkpoints_collectors-0.3.1.dist-info/licenses/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
|
43
|
+
snowpark_checkpoints_collectors-0.3.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|