snowpark-checkpoints-collectors 0.3.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/PKG-INFO +23 -1
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/README.md +22 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/__init__.py +6 -1
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/__version__.py +1 -1
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +1 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +65 -4
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +26 -4
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_file_utils.py +30 -16
- snowpark_checkpoints_collectors-0.3.1/test/unit/test_summary_stats_collector.py +100 -0
- snowpark_checkpoints_collectors-0.3.0/test/unit/test_summary_stats_collector.py +0 -70
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/.gitignore +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/LICENSE +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/pyproject.toml +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/snowpark-testdf-schema.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/io_utils/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/io_utils/io_default_strategy.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/io_utils/io_env_strategy.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/io_utils/io_file_manager.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/logging_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/.coveragerc +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/telemetry_compare_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_checkpoint_name.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_io_strategy.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collection_result_file.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_snow_connection_int.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/io_utils/test_default_strategy.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_checkpoint_name_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_collection_point_result.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_collection_point_result_manager.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_column_collection.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_extra_config.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_logger.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_logging_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_pandera_column_check_manager.py +0 -0
- {snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_snow_connection.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: snowpark-checkpoints-collectors
|
3
|
-
Version: 0.3.0
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: Snowpark column and table statistics collection
|
5
5
|
Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
|
6
6
|
Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
|
@@ -101,6 +101,28 @@ def collect_dataframe_checkpoint(
|
|
101
101
|
- `output_path`: The output path to save the checkpoint, defaults to current working directory.
|
102
102
|
|
103
103
|
|
104
|
+
### Skip DataFrame Checkpoint Collection
|
105
|
+
|
106
|
+
|
107
|
+
|
108
|
+
```python
|
109
|
+
from pyspark.sql import DataFrame as SparkDataFrame
|
110
|
+
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
111
|
+
from typing import Optional
|
112
|
+
|
113
|
+
# Signature of the function
|
114
|
+
def xcollect_dataframe_checkpoint(
|
115
|
+
df: SparkDataFrame,
|
116
|
+
checkpoint_name: str,
|
117
|
+
sample: Optional[float] = None,
|
118
|
+
mode: Optional[CheckpointMode] = None,
|
119
|
+
output_path: Optional[str] = None,
|
120
|
+
) -> None:
|
121
|
+
...
|
122
|
+
```
|
123
|
+
|
124
|
+
The signature of this method is the same as that of `collect_dataframe_checkpoint`.
|
125
|
+
|
104
126
|
## Usage Example
|
105
127
|
|
106
128
|
### Schema mode
|
@@ -54,6 +54,28 @@ def collect_dataframe_checkpoint(
|
|
54
54
|
- `output_path`: The output path to save the checkpoint, defaults to current working directory.
|
55
55
|
|
56
56
|
|
57
|
+
### Skip DataFrame Checkpoint Collection
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
```python
|
62
|
+
from pyspark.sql import DataFrame as SparkDataFrame
|
63
|
+
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
64
|
+
from typing import Optional
|
65
|
+
|
66
|
+
# Signature of the function
|
67
|
+
def xcollect_dataframe_checkpoint(
|
68
|
+
df: SparkDataFrame,
|
69
|
+
checkpoint_name: str,
|
70
|
+
sample: Optional[float] = None,
|
71
|
+
mode: Optional[CheckpointMode] = None,
|
72
|
+
output_path: Optional[str] = None,
|
73
|
+
) -> None:
|
74
|
+
...
|
75
|
+
```
|
76
|
+
|
77
|
+
The signature of this method is the same as that of `collect_dataframe_checkpoint`.
|
78
|
+
|
57
79
|
## Usage Example
|
58
80
|
|
59
81
|
### Schema mode
|
@@ -22,9 +22,14 @@ logging.getLogger(__name__).addHandler(logging.NullHandler())
|
|
22
22
|
|
23
23
|
# ruff: noqa: E402
|
24
24
|
|
25
|
-
__all__ = [
|
25
|
+
__all__ = [
|
26
|
+
"collect_dataframe_checkpoint",
|
27
|
+
"CheckpointMode",
|
28
|
+
"xcollect_dataframe_checkpoint",
|
29
|
+
]
|
26
30
|
|
27
31
|
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
28
32
|
from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
|
29
33
|
collect_dataframe_checkpoint,
|
34
|
+
xcollect_dataframe_checkpoint,
|
30
35
|
)
|
@@ -117,11 +117,10 @@ def collect_dataframe_checkpoint(
|
|
117
117
|
"Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
|
118
118
|
)
|
119
119
|
if not is_checkpoint_enabled(normalized_checkpoint_name):
|
120
|
-
|
121
|
-
"Checkpoint '
|
122
|
-
|
120
|
+
raise Exception(
|
121
|
+
f"Checkpoint '{normalized_checkpoint_name}' is disabled. Please enable it in the checkpoints.json file.",
|
122
|
+
"In case you want to skip it, use the xcollect_dataframe_checkpoint method instead.",
|
123
123
|
)
|
124
|
-
return
|
125
124
|
|
126
125
|
LOGGER.info("Starting to collect checkpoint '%s'", normalized_checkpoint_name)
|
127
126
|
LOGGER.debug("DataFrame size: %s rows", df.count())
|
@@ -184,6 +183,68 @@ def collect_dataframe_checkpoint(
|
|
184
183
|
collection_point_result_manager.add_result(collection_point_result)
|
185
184
|
|
186
185
|
|
186
|
+
@log
def xcollect_dataframe_checkpoint(
    df: SparkDataFrame,
    checkpoint_name: str,
    sample: Optional[float] = None,
    mode: Optional[CheckpointMode] = None,
    output_path: Optional[str] = None,
) -> None:
    """Skip the collection of metadata for a DataFrame checkpoint.

    Takes the same parameters as ``collect_dataframe_checkpoint`` so a call
    can be switched between the two by renaming it. Nothing is collected from
    ``df``: the checkpoint name is normalized and validated, a warning is
    logged, and a result flagged as ``CollectionResult.SKIP`` is registered
    through ``CollectionPointResultManager``.

    Args:
        df (SparkDataFrame): The input Spark DataFrame. Not read by this function.
        checkpoint_name (str): The name of the checkpoint.
        sample (float, optional): Accepted for signature compatibility only;
            not used by this function.
        mode (CheckpointMode, optional): Accepted for signature compatibility
            only; not used and not validated by this function.
        output_path (str, optional): Passed to ``CollectionPointResultManager``,
            which stores the skipped result.

    Raises:
        Exception: Invalid checkpoint name. Checkpoint names must only contain alphanumeric characters,
            underscores and dollar signs.

    """
    normalized_checkpoint_name = checkpoint_name_utils.normalize_checkpoint_name(
        checkpoint_name
    )
    if normalized_checkpoint_name != checkpoint_name:
        LOGGER.warning(
            "Checkpoint name '%s' was normalized to '%s'",
            checkpoint_name,
            normalized_checkpoint_name,
        )

    if not checkpoint_name_utils.is_valid_checkpoint_name(normalized_checkpoint_name):
        raise Exception(
            f"Invalid checkpoint name: {normalized_checkpoint_name}. "
            "Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
        )

    LOGGER.warning(
        "Checkpoint '%s' is disabled. Skipping collection.",
        normalized_checkpoint_name,
    )

    # Record where the skipped checkpoint was requested so it still shows up
    # in the collection results, flagged as SKIP.
    collection_point_file_path = file_utils.get_collection_point_source_file_path()
    collection_point_line_of_code = file_utils.get_collection_point_line_of_code()
    collection_point_result = CollectionPointResult(
        collection_point_file_path,
        collection_point_line_of_code,
        normalized_checkpoint_name,
    )
    collection_point_result.result = CollectionResult.SKIP

    collection_point_result_manager = CollectionPointResultManager(output_path)
    collection_point_result_manager.add_result(collection_point_result)
|
246
|
+
|
247
|
+
|
187
248
|
@report_telemetry(params_list=["column_type_dict"])
|
188
249
|
def _collect_dataframe_checkpoint_mode_schema(
|
189
250
|
checkpoint_name: str,
|
@@ -14,6 +14,7 @@
|
|
14
14
|
# limitations under the License.
|
15
15
|
import inspect
|
16
16
|
import os
|
17
|
+
import re
|
17
18
|
import tempfile
|
18
19
|
|
19
20
|
from typing import Optional
|
@@ -84,7 +85,10 @@ def get_collection_point_source_file_path() -> str:
|
|
84
85
|
|
85
86
|
"""
|
86
87
|
try:
|
87
|
-
|
88
|
+
stack_frame = _get_stack_frame()
|
89
|
+
if not stack_frame:
|
90
|
+
return UNKNOWN_SOURCE_FILE
|
91
|
+
collection_point_file_path = stack_frame.filename
|
88
92
|
is_temporal_file_path = _is_temporal_path(collection_point_file_path)
|
89
93
|
if is_temporal_file_path:
|
90
94
|
ipynb_file_path_collection = _get_ipynb_file_path_collection()
|
@@ -100,15 +104,18 @@ def get_collection_point_source_file_path() -> str:
|
|
100
104
|
|
101
105
|
|
102
106
|
def get_collection_point_line_of_code() -> int:
|
103
|
-
"""Find the line of code of the source file where collection point
|
107
|
+
"""Find the line of code of the source file where collection point is.
|
104
108
|
|
105
109
|
Returns:
|
106
110
|
int: returns the line of code of the source file where collection point it is.
|
107
111
|
|
108
112
|
"""
|
109
113
|
try:
|
110
|
-
|
111
|
-
|
114
|
+
stack_frame = _get_stack_frame()
|
115
|
+
if not stack_frame:
|
116
|
+
return UNKNOWN_LINE_OF_CODE
|
117
|
+
collection_point_file_path = stack_frame.filename
|
118
|
+
collection_point_line_of_code = stack_frame.lineno
|
112
119
|
is_temporal_file_path = _is_temporal_path(collection_point_file_path)
|
113
120
|
if is_temporal_file_path:
|
114
121
|
collection_point_line_of_code = UNKNOWN_LINE_OF_CODE
|
@@ -124,6 +131,21 @@ def _is_temporal_path(path: str) -> bool:
|
|
124
131
|
return is_temporal_path
|
125
132
|
|
126
133
|
|
134
|
+
def _get_stack_frame() -> inspect.FrameInfo:
|
135
|
+
batch = inspect.stack()[:7]
|
136
|
+
batch.reverse()
|
137
|
+
collect_frame_regex = r"(collect_dataframe_checkpoint)"
|
138
|
+
|
139
|
+
for frame in batch:
|
140
|
+
if (
|
141
|
+
frame.code_context is not None
|
142
|
+
and len(frame.code_context) >= 0
|
143
|
+
and re.search(collect_frame_regex, frame.code_context[0])
|
144
|
+
):
|
145
|
+
return frame
|
146
|
+
return None
|
147
|
+
|
148
|
+
|
127
149
|
def _get_ipynb_file_path_collection() -> list[str]:
|
128
150
|
current_working_directory_path = get_io_file_manager().getcwd()
|
129
151
|
cwd_file_name_collection = get_io_file_manager().ls(current_working_directory_path)
|
@@ -54,25 +54,35 @@ def test_get_output_file_path_create():
|
|
54
54
|
|
55
55
|
|
56
56
|
def test_get_collection_point_source_file_path_scenario_python_source_file():
|
57
|
-
|
58
|
-
file_utils.
|
59
|
-
|
60
|
-
|
57
|
+
with mock.patch(
|
58
|
+
"snowflake.snowpark_checkpoints_collector.utils.file_utils._get_stack_frame",
|
59
|
+
return_value=mock.MagicMock(filename="abc.py", lineno=1),
|
60
|
+
):
|
61
|
+
collection_point_source_file_path = (
|
62
|
+
file_utils.get_collection_point_source_file_path()
|
63
|
+
)
|
64
|
+
assert collection_point_source_file_path != UNKNOWN_SOURCE_FILE
|
61
65
|
|
62
66
|
|
63
67
|
def test_get_collection_point_source_file_path_scenario_notebook_source_file():
|
64
|
-
with
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
68
|
+
with (
|
69
|
+
mock.patch(
|
70
|
+
"snowflake.snowpark_checkpoints_collector.utils.file_utils._is_temporal_path",
|
71
|
+
return_value=True,
|
72
|
+
),
|
73
|
+
mock.patch(
|
74
|
+
"snowflake.snowpark_checkpoints_collector.utils.file_utils._get_stack_frame",
|
75
|
+
return_value=mock.MagicMock(filename="abc.ipynb", lineno=1),
|
76
|
+
),
|
77
|
+
mock.patch(
|
69
78
|
"snowflake.snowpark_checkpoints_collector.utils.file_utils._get_ipynb_file_path_collection",
|
70
79
|
return_value=["abc.ipynb"],
|
71
|
-
)
|
72
|
-
|
73
|
-
|
74
|
-
)
|
75
|
-
|
80
|
+
),
|
81
|
+
):
|
82
|
+
collection_point_source_file_path = (
|
83
|
+
file_utils.get_collection_point_source_file_path()
|
84
|
+
)
|
85
|
+
assert collection_point_source_file_path != UNKNOWN_SOURCE_FILE
|
76
86
|
|
77
87
|
|
78
88
|
def test_get_collection_point_source_file_path_scenario_unknown_source_file():
|
@@ -102,8 +112,12 @@ def test_get_collection_point_source_file_path_scenario_exception():
|
|
102
112
|
|
103
113
|
|
104
114
|
def test_get_collection_point_line_of_code_scenario_python_source_file():
|
105
|
-
|
106
|
-
|
115
|
+
with mock.patch(
|
116
|
+
"snowflake.snowpark_checkpoints_collector.utils.file_utils._get_stack_frame",
|
117
|
+
return_value=mock.MagicMock(filename=__file__, lineno=1),
|
118
|
+
):
|
119
|
+
collection_point_line_of_code = file_utils.get_collection_point_line_of_code()
|
120
|
+
assert collection_point_line_of_code != UNKNOWN_LINE_OF_CODE
|
107
121
|
|
108
122
|
|
109
123
|
def test_get_collection_point_line_of_code_scenario_notebook_source_file():
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
import logging
|
17
|
+
import os
|
18
|
+
import tempfile
|
19
|
+
|
20
|
+
from datetime import datetime
|
21
|
+
from typing import get_type_hints
|
22
|
+
from unittest.mock import MagicMock, patch
|
23
|
+
|
24
|
+
import pytest
|
25
|
+
|
26
|
+
from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
|
27
|
+
collect_dataframe_checkpoint,
|
28
|
+
generate_parquet_for_spark_df,
|
29
|
+
xcollect_dataframe_checkpoint,
|
30
|
+
)
|
31
|
+
|
32
|
+
|
33
|
+
def test_generate_parquet_for_spark_df_exception():
    """generate_parquet_for_spark_df must raise when no parquet files are generated."""
    # The DataFrame is whatever the mocked session returns; the previous
    # hand-configured MagicMock was rebound before use and has been removed.
    spark = MagicMock()
    spark_df = spark.createDataFrame()
    parquet_directory = os.path.join(
        tempfile.gettempdir(),
        f"test_spark_df_checkpoint_{datetime.now().strftime('%Y%m%d%H%M%S')}",
    )

    with pytest.raises(Exception, match="No parquet files were generated."):
        generate_parquet_for_spark_df(spark_df, parquet_directory)
|
46
|
+
|
47
|
+
|
48
|
+
def test_collect_dataframe_checkpoint_disabled_checkpoint(
    caplog: pytest.LogCaptureFixture,
):
    """Test that collect_dataframe_checkpoint raises when the checkpoint is disabled."""
    pyspark_df = MagicMock()
    checkpoint_name = "my_checkpoint"
    module_name = "snowflake.snowpark_checkpoints_collector.summary_stats_collector"
    expected_exception_error_msg = "Checkpoint 'my_checkpoint' is disabled. Please enable it in the checkpoints.json file."
    expected_fix_suggestion_msg = "In case you want to skip it, use the xcollect_dataframe_checkpoint method instead."

    # pytest.raises makes the test fail when no exception is raised; the
    # previous try/except form passed silently in that case.
    with (
        caplog.at_level(
            level=logging.INFO,
            logger=module_name,
        ),
        patch(
            f"{module_name}.is_checkpoint_enabled",
            return_value=False,
        ) as mock_is_checkpoint_enabled,
        pytest.raises(Exception) as exc_info,
    ):
        collect_dataframe_checkpoint(pyspark_df, checkpoint_name)

    mock_is_checkpoint_enabled.assert_called_once_with(checkpoint_name)
    # The exception carries two positional args: the error and the suggested fix.
    error_msg, fix_suggestion_msg = exc_info.value.args
    assert error_msg == expected_exception_error_msg
    assert fix_suggestion_msg == expected_fix_suggestion_msg
|
75
|
+
|
76
|
+
|
77
|
+
def test_skip_collector_parameters_commutability():
    """Both collector entry points must declare identical parameter type hints."""

    def _parameter_hints(func):
        # Drop the return annotation so only the parameters are compared.
        hints = dict(get_type_hints(func))
        hints.pop("return", None)
        return hints

    expected = _parameter_hints(collect_dataframe_checkpoint)
    actual = _parameter_hints(xcollect_dataframe_checkpoint)
    assert (
        expected == actual
    ), "The parameters of collect_dataframe_checkpoint and xcollect_dataframe_checkpoint must be the same."
|
90
|
+
|
91
|
+
|
92
|
+
def test_skip_collector_return_type_commutability():
    """Both collector entry points must declare the same return annotation."""
    expected_return, skip_return = (
        get_type_hints(func).get("return")
        for func in (collect_dataframe_checkpoint, xcollect_dataframe_checkpoint)
    )
    assert (
        expected_return == skip_return
    ), "The return type of collect_dataframe_checkpoint and xcollect_dataframe_checkpoint must be the same."
|
@@ -1,70 +0,0 @@
|
|
1
|
-
# Copyright 2025 Snowflake Inc.
|
2
|
-
# SPDX-License-Identifier: Apache-2.0
|
3
|
-
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
-
# you may not use this file except in compliance with the License.
|
6
|
-
# You may obtain a copy of the License at
|
7
|
-
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
-
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
-
# See the License for the specific language governing permissions and
|
14
|
-
# limitations under the License.
|
15
|
-
|
16
|
-
import logging
|
17
|
-
import os
|
18
|
-
import tempfile
|
19
|
-
|
20
|
-
from datetime import datetime
|
21
|
-
from unittest.mock import MagicMock, patch
|
22
|
-
|
23
|
-
import pytest
|
24
|
-
|
25
|
-
from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
|
26
|
-
collect_dataframe_checkpoint,
|
27
|
-
generate_parquet_for_spark_df,
|
28
|
-
)
|
29
|
-
|
30
|
-
|
31
|
-
def test_generate_parquet_for_spark_df_exception():
|
32
|
-
spark = MagicMock()
|
33
|
-
spark_df = MagicMock()
|
34
|
-
spark_df.dtypes = []
|
35
|
-
spark_df.select = MagicMock()
|
36
|
-
spark_df = spark.createDataFrame()
|
37
|
-
parquet_directory = os.path.join(
|
38
|
-
tempfile.gettempdir(),
|
39
|
-
f"test_spark_df_checkpoint_{datetime.now().strftime('%Y%m%d%H%M%S')}",
|
40
|
-
)
|
41
|
-
|
42
|
-
with pytest.raises(Exception, match="No parquet files were generated."):
|
43
|
-
generate_parquet_for_spark_df(spark_df, parquet_directory)
|
44
|
-
|
45
|
-
|
46
|
-
def test_collect_dataframe_checkpoint_disabled_checkpoint(
|
47
|
-
caplog: pytest.LogCaptureFixture,
|
48
|
-
):
|
49
|
-
"""Test that collect_dataframe_checkpoint logs a message when the checkpoint is disabled."""
|
50
|
-
pyspark_df = MagicMock()
|
51
|
-
checkpoint_name = "my_checkpoint"
|
52
|
-
module_name = "snowflake.snowpark_checkpoints_collector.summary_stats_collector"
|
53
|
-
expected_log_msg = (
|
54
|
-
f"Checkpoint '{checkpoint_name}' is disabled. Skipping collection."
|
55
|
-
)
|
56
|
-
|
57
|
-
with (
|
58
|
-
caplog.at_level(
|
59
|
-
level=logging.INFO,
|
60
|
-
logger=module_name,
|
61
|
-
),
|
62
|
-
patch(
|
63
|
-
f"{module_name}.is_checkpoint_enabled",
|
64
|
-
return_value=False,
|
65
|
-
) as mock_is_checkpoint_enabled,
|
66
|
-
):
|
67
|
-
collect_dataframe_checkpoint(pyspark_df, checkpoint_name)
|
68
|
-
|
69
|
-
mock_is_checkpoint_enabled.assert_called_once_with(checkpoint_name)
|
70
|
-
assert expected_log_msg in caplog.messages
|
File without changes
|
{snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/CHANGELOG.md
RENAMED
File without changes
|
File without changes
|
{snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/pyproject.toml
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{snowpark_checkpoints_collectors-0.3.0 → snowpark_checkpoints_collectors-0.3.1}/test/.coveragerc
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|