snowpark-checkpoints-collectors 0.2.1__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/PKG-INFO +24 -1
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/README.md +22 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/pyproject.toml +1 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/__init__.py +6 -1
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/__version__.py +1 -1
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +1 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +4 -2
- snowpark_checkpoints_collectors-0.3.1/src/snowflake/snowpark_checkpoints_collector/io_utils/__init__.py +26 -0
- snowpark_checkpoints_collectors-0.3.1/src/snowflake/snowpark_checkpoints_collector/io_utils/io_default_strategy.py +61 -0
- snowpark_checkpoints_collectors-0.3.1/src/snowflake/snowpark_checkpoints_collector/io_utils/io_env_strategy.py +142 -0
- snowpark_checkpoints_collectors-0.3.1/src/snowflake/snowpark_checkpoints_collector/io_utils/io_file_manager.py +79 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +11 -9
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +72 -12
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +46 -1
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +35 -8
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +67 -28
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/telemetry_compare_utils.py +13 -1
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1.py +83 -2
- snowpark_checkpoints_collectors-0.3.1/test/integ/test_collect_df_mode_1_expected/test_io_strategy.json +1 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2.py +87 -4
- snowpark_checkpoints_collectors-0.3.1/test/unit/io_utils/test_default_strategy.py +308 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_extra_config.py +36 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_file_utils.py +30 -16
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_snow_connection.py +13 -16
- snowpark_checkpoints_collectors-0.3.1/test/unit/test_summary_stats_collector.py +100 -0
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_collect_invalid_mode_telemetry.json +0 -18
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_generate_parquet_for_spark_df_telemetry.json +0 -18
- snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_spark_df_mode_dataframe_telemetry.json +0 -18
- snowpark_checkpoints_collectors-0.2.1/test/unit/test_summary_stats_collector.py +0 -70
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/.gitignore +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/CHANGELOG.md +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/LICENSE +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/snowpark-testdf-schema.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/logging_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/.coveragerc +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_checkpoint_name.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collection_result_file.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_snow_connection_int.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_checkpoint_name_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_collection_point_result.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_collection_point_result_manager.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_column_collection.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_logger.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_logging_utils.py +0 -0
- {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_pandera_column_check_manager.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: snowpark-checkpoints-collectors
|
3
|
-
Version: 0.2.1
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: Snowpark column and table statistics collection
|
5
5
|
Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
|
6
6
|
Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
|
@@ -30,6 +30,7 @@ Requires-Dist: pandera[io]==0.20.4
|
|
30
30
|
Requires-Dist: snowflake-connector-python
|
31
31
|
Requires-Dist: snowflake-snowpark-python>=1.23.0
|
32
32
|
Provides-Extra: development
|
33
|
+
Requires-Dist: certifi==2025.1.31; extra == 'development'
|
33
34
|
Requires-Dist: coverage>=7.6.7; extra == 'development'
|
34
35
|
Requires-Dist: deepdiff>=8.0.0; extra == 'development'
|
35
36
|
Requires-Dist: hatchling==1.25.0; extra == 'development'
|
@@ -100,6 +101,28 @@ def collect_dataframe_checkpoint(
|
|
100
101
|
- `output_path`: The output path to save the checkpoint, defaults to current working directory.
|
101
102
|
|
102
103
|
|
104
|
+
### Skip DataFrame Checkpoint Collection
|
105
|
+
|
106
|
+
|
107
|
+
|
108
|
+
```python
|
109
|
+
from pyspark.sql import DataFrame as SparkDataFrame
|
110
|
+
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
111
|
+
from typing import Optional
|
112
|
+
|
113
|
+
# Signature of the function
|
114
|
+
def xcollect_dataframe_checkpoint(
|
115
|
+
df: SparkDataFrame,
|
116
|
+
checkpoint_name: str,
|
117
|
+
sample: Optional[float] = None,
|
118
|
+
mode: Optional[CheckpointMode] = None,
|
119
|
+
output_path: Optional[str] = None,
|
120
|
+
) -> None:
|
121
|
+
...
|
122
|
+
```
|
123
|
+
|
124
|
+
The signature of the method is the same as that of `collect_dataframe_checkpoint`.
|
125
|
+
|
103
126
|
## Usage Example
|
104
127
|
|
105
128
|
### Schema mode
|
@@ -54,6 +54,28 @@ def collect_dataframe_checkpoint(
|
|
54
54
|
- `output_path`: The output path to save the checkpoint, defaults to current working directory.
|
55
55
|
|
56
56
|
|
57
|
+
### Skip DataFrame Checkpoint Collection
|
58
|
+
|
59
|
+
|
60
|
+
|
61
|
+
```python
|
62
|
+
from pyspark.sql import DataFrame as SparkDataFrame
|
63
|
+
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
64
|
+
from typing import Optional
|
65
|
+
|
66
|
+
# Signature of the function
|
67
|
+
def xcollect_dataframe_checkpoint(
|
68
|
+
df: SparkDataFrame,
|
69
|
+
checkpoint_name: str,
|
70
|
+
sample: Optional[float] = None,
|
71
|
+
mode: Optional[CheckpointMode] = None,
|
72
|
+
output_path: Optional[str] = None,
|
73
|
+
) -> None:
|
74
|
+
...
|
75
|
+
```
|
76
|
+
|
77
|
+
The signature of the method is the same as that of `collect_dataframe_checkpoint`.
|
78
|
+
|
57
79
|
## Usage Example
|
58
80
|
|
59
81
|
### Schema mode
|
@@ -22,9 +22,14 @@ logging.getLogger(__name__).addHandler(logging.NullHandler())
|
|
22
22
|
|
23
23
|
# ruff: noqa: E402
|
24
24
|
|
25
|
-
__all__ = [
|
25
|
+
__all__ = [
|
26
|
+
"collect_dataframe_checkpoint",
|
27
|
+
"CheckpointMode",
|
28
|
+
"xcollect_dataframe_checkpoint",
|
29
|
+
]
|
26
30
|
|
27
31
|
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
|
28
32
|
from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
|
29
33
|
collect_dataframe_checkpoint,
|
34
|
+
xcollect_dataframe_checkpoint,
|
30
35
|
)
|
@@ -21,6 +21,9 @@ from typing import Optional
|
|
21
21
|
from snowflake.snowpark_checkpoints_collector.collection_result.model import (
|
22
22
|
CollectionPointResult,
|
23
23
|
)
|
24
|
+
from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
|
25
|
+
get_io_file_manager,
|
26
|
+
)
|
24
27
|
from snowflake.snowpark_checkpoints_collector.singleton import Singleton
|
25
28
|
from snowflake.snowpark_checkpoints_collector.utils import file_utils
|
26
29
|
|
@@ -70,5 +73,4 @@ class CollectionPointResultManager(metaclass=Singleton):
|
|
70
73
|
def _save_result(self) -> None:
|
71
74
|
result_collection_json = self.to_json()
|
72
75
|
LOGGER.info("Saving collection results to '%s'", self.output_file_path)
|
73
|
-
|
74
|
-
f.write(result_collection_json)
|
76
|
+
get_io_file_manager().write(self.output_file_path, result_collection_json)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
__all__ = ["EnvStrategy", "IOFileManager", "IODefaultStrategy"]
|
17
|
+
|
18
|
+
from snowflake.snowpark_checkpoints_collector.io_utils.io_env_strategy import (
|
19
|
+
EnvStrategy,
|
20
|
+
)
|
21
|
+
from snowflake.snowpark_checkpoints_collector.io_utils.io_default_strategy import (
|
22
|
+
IODefaultStrategy,
|
23
|
+
)
|
24
|
+
from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
|
25
|
+
IOFileManager,
|
26
|
+
)
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
import glob
|
17
|
+
import os
|
18
|
+
import shutil
|
19
|
+
|
20
|
+
from pathlib import Path
|
21
|
+
from typing import Optional
|
22
|
+
|
23
|
+
from snowflake.snowpark_checkpoints_collector.io_utils import EnvStrategy
|
24
|
+
|
25
|
+
|
26
|
+
class IODefaultStrategy(EnvStrategy):
|
27
|
+
def mkdir(self, path: str, exist_ok: bool = False) -> None:
|
28
|
+
os.makedirs(path, exist_ok=exist_ok)
|
29
|
+
|
30
|
+
def folder_exists(self, path: str) -> bool:
|
31
|
+
return os.path.isdir(path)
|
32
|
+
|
33
|
+
def file_exists(self, path: str) -> bool:
|
34
|
+
return os.path.isfile(path)
|
35
|
+
|
36
|
+
def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
|
37
|
+
mode = "w" if overwrite else "x"
|
38
|
+
with open(file_path, mode) as file:
|
39
|
+
file.write(file_content)
|
40
|
+
|
41
|
+
def read(
|
42
|
+
self, file_path: str, mode: str = "r", encoding: Optional[str] = None
|
43
|
+
) -> str:
|
44
|
+
with open(file_path, mode=mode, encoding=encoding) as file:
|
45
|
+
return file.read()
|
46
|
+
|
47
|
+
def read_bytes(self, file_path: str) -> bytes:
|
48
|
+
with open(file_path, mode="rb") as f:
|
49
|
+
return f.read()
|
50
|
+
|
51
|
+
def ls(self, path: str, recursive: bool = False) -> list[str]:
|
52
|
+
return glob.glob(path, recursive=recursive)
|
53
|
+
|
54
|
+
def getcwd(self) -> str:
|
55
|
+
return os.getcwd()
|
56
|
+
|
57
|
+
def remove_dir(self, path: str) -> None:
|
58
|
+
shutil.rmtree(path)
|
59
|
+
|
60
|
+
def telemetry_path_files(self, path: str) -> Path:
|
61
|
+
return Path(path)
|
@@ -0,0 +1,142 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
from abc import ABC, abstractmethod
|
17
|
+
from pathlib import Path
|
18
|
+
from typing import Optional
|
19
|
+
|
20
|
+
|
21
|
+
class EnvStrategy(ABC):
|
22
|
+
|
23
|
+
"""An abstract base class that defines methods for file and directory operations.
|
24
|
+
|
25
|
+
Subclasses should implement these methods to provide environment-specific behavior.
|
26
|
+
"""
|
27
|
+
|
28
|
+
@abstractmethod
|
29
|
+
def mkdir(self, path: str, exist_ok: bool = False) -> None:
|
30
|
+
"""Create a directory.
|
31
|
+
|
32
|
+
Args:
|
33
|
+
path: The name of the directory to create.
|
34
|
+
exist_ok: If False, an error is raised if the directory already exists.
|
35
|
+
|
36
|
+
"""
|
37
|
+
|
38
|
+
@abstractmethod
|
39
|
+
def folder_exists(self, path: str) -> bool:
|
40
|
+
"""Check if a folder exists.
|
41
|
+
|
42
|
+
Args:
|
43
|
+
path: The path to the folder.
|
44
|
+
|
45
|
+
Returns:
|
46
|
+
bool: True if the folder exists, False otherwise.
|
47
|
+
|
48
|
+
"""
|
49
|
+
|
50
|
+
@abstractmethod
|
51
|
+
def file_exists(self, path: str) -> bool:
|
52
|
+
"""Check if a file exists.
|
53
|
+
|
54
|
+
Args:
|
55
|
+
path: The path to the file.
|
56
|
+
|
57
|
+
Returns:
|
58
|
+
bool: True if the file exists, False otherwise.
|
59
|
+
|
60
|
+
"""
|
61
|
+
|
62
|
+
@abstractmethod
|
63
|
+
def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
|
64
|
+
"""Write content to a file.
|
65
|
+
|
66
|
+
Args:
|
67
|
+
file_path: The name of the file to write to.
|
68
|
+
file_content: The content to write to the file.
|
69
|
+
overwrite: If True, overwrite the file if it exists.
|
70
|
+
|
71
|
+
"""
|
72
|
+
|
73
|
+
@abstractmethod
|
74
|
+
def read(
|
75
|
+
self, file_path: str, mode: str = "r", encoding: Optional[str] = None
|
76
|
+
) -> str:
|
77
|
+
"""Read content from a file.
|
78
|
+
|
79
|
+
Args:
|
80
|
+
file_path: The path to the file to read from.
|
81
|
+
mode: The mode in which to open the file.
|
82
|
+
encoding: The encoding to use for reading the file.
|
83
|
+
|
84
|
+
Returns:
|
85
|
+
str: The content of the file.
|
86
|
+
|
87
|
+
"""
|
88
|
+
|
89
|
+
@abstractmethod
|
90
|
+
def read_bytes(self, file_path: str) -> bytes:
|
91
|
+
"""Read binary content from a file.
|
92
|
+
|
93
|
+
Args:
|
94
|
+
file_path: The path to the file to read from.
|
95
|
+
|
96
|
+
Returns:
|
97
|
+
bytes: The binary content of the file.
|
98
|
+
|
99
|
+
"""
|
100
|
+
|
101
|
+
@abstractmethod
|
102
|
+
def ls(self, path: str, recursive: bool = False) -> list[str]:
|
103
|
+
"""List the contents of a directory.
|
104
|
+
|
105
|
+
Args:
|
106
|
+
path: The path to the directory.
|
107
|
+
recursive: If True, list the contents recursively.
|
108
|
+
|
109
|
+
Returns:
|
110
|
+
list[str]: A list of the contents of the directory.
|
111
|
+
|
112
|
+
"""
|
113
|
+
|
114
|
+
@abstractmethod
|
115
|
+
def getcwd(self) -> str:
|
116
|
+
"""Get the current working directory.
|
117
|
+
|
118
|
+
Returns:
|
119
|
+
str: The current working directory.
|
120
|
+
|
121
|
+
"""
|
122
|
+
|
123
|
+
@abstractmethod
|
124
|
+
def remove_dir(self, path: str) -> None:
|
125
|
+
"""Remove a directory and all its contents.
|
126
|
+
|
127
|
+
Args:
|
128
|
+
path: The path to the directory to remove.
|
129
|
+
|
130
|
+
"""
|
131
|
+
|
132
|
+
@abstractmethod
|
133
|
+
def telemetry_path_files(self, path: str) -> Path:
|
134
|
+
"""Get the path to the telemetry files.
|
135
|
+
|
136
|
+
Args:
|
137
|
+
path: The path to the telemetry directory.
|
138
|
+
|
139
|
+
Returns:
|
140
|
+
Path: The path object representing the telemetry files.
|
141
|
+
|
142
|
+
"""
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# Copyright 2025 Snowflake Inc.
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
3
|
+
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
+
# you may not use this file except in compliance with the License.
|
6
|
+
# You may obtain a copy of the License at
|
7
|
+
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
+
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
+
# See the License for the specific language governing permissions and
|
14
|
+
# limitations under the License.
|
15
|
+
|
16
|
+
from pathlib import Path
|
17
|
+
from typing import Optional
|
18
|
+
|
19
|
+
from snowflake.snowpark_checkpoints_collector.io_utils import (
|
20
|
+
EnvStrategy,
|
21
|
+
IODefaultStrategy,
|
22
|
+
)
|
23
|
+
from snowflake.snowpark_checkpoints_collector.singleton import Singleton
|
24
|
+
|
25
|
+
|
26
|
+
class IOFileManager(metaclass=Singleton):
|
27
|
+
def __init__(self, strategy: Optional[EnvStrategy] = None):
|
28
|
+
self.strategy = strategy or IODefaultStrategy()
|
29
|
+
|
30
|
+
def mkdir(self, path: str, exist_ok: bool = False) -> None:
|
31
|
+
return self.strategy.mkdir(path, exist_ok)
|
32
|
+
|
33
|
+
def folder_exists(self, path: str) -> bool:
|
34
|
+
return self.strategy.folder_exists(path)
|
35
|
+
|
36
|
+
def file_exists(self, path: str) -> bool:
|
37
|
+
return self.strategy.file_exists(path)
|
38
|
+
|
39
|
+
def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
|
40
|
+
return self.strategy.write(file_path, file_content, overwrite)
|
41
|
+
|
42
|
+
def read(
|
43
|
+
self, file_path: str, mode: str = "r", encoding: Optional[str] = None
|
44
|
+
) -> str:
|
45
|
+
return self.strategy.read(file_path, mode, encoding)
|
46
|
+
|
47
|
+
def read_bytes(self, file_path: str) -> bytes:
|
48
|
+
return self.strategy.read_bytes(file_path)
|
49
|
+
|
50
|
+
def ls(self, path: str, recursive: bool = False) -> list[str]:
|
51
|
+
return self.strategy.ls(path, recursive)
|
52
|
+
|
53
|
+
def getcwd(self) -> str:
|
54
|
+
return self.strategy.getcwd()
|
55
|
+
|
56
|
+
def remove_dir(self, path: str) -> None:
|
57
|
+
return self.strategy.remove_dir(path)
|
58
|
+
|
59
|
+
def telemetry_path_files(self, path: str) -> Path:
|
60
|
+
return self.strategy.telemetry_path_files(path)
|
61
|
+
|
62
|
+
def set_strategy(self, strategy: EnvStrategy):
|
63
|
+
"""Set the strategy for file and directory operations.
|
64
|
+
|
65
|
+
Args:
|
66
|
+
strategy (EnvStrategy): The strategy to use for file and directory operations.
|
67
|
+
|
68
|
+
"""
|
69
|
+
self.strategy = strategy
|
70
|
+
|
71
|
+
|
72
|
+
def get_io_file_manager():
|
73
|
+
"""Get the singleton instance of IOFileManager.
|
74
|
+
|
75
|
+
Returns:
|
76
|
+
IOFileManager: The singleton instance of IOFileManager.
|
77
|
+
|
78
|
+
"""
|
79
|
+
return IOFileManager()
|
@@ -13,7 +13,7 @@
|
|
13
13
|
# See the License for the specific language governing permissions and
|
14
14
|
# limitations under the License.
|
15
15
|
|
16
|
-
import
|
16
|
+
import io
|
17
17
|
import logging
|
18
18
|
import os.path
|
19
19
|
import time
|
@@ -25,6 +25,9 @@ from snowflake.snowpark import Session
|
|
25
25
|
from snowflake.snowpark_checkpoints_collector.collection_common import (
|
26
26
|
DOT_PARQUET_EXTENSION,
|
27
27
|
)
|
28
|
+
from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
|
29
|
+
get_io_file_manager,
|
30
|
+
)
|
28
31
|
|
29
32
|
|
30
33
|
STAGE_NAME = "CHECKPOINT_STAGE"
|
@@ -130,11 +133,13 @@ class SnowConnection:
|
|
130
133
|
)
|
131
134
|
|
132
135
|
def filter_files(name: str):
|
133
|
-
return
|
136
|
+
return get_io_file_manager().file_exists(name) and (
|
137
|
+
filter_func(name) if filter_func else True
|
138
|
+
)
|
134
139
|
|
135
140
|
target_dir = os.path.join(input_path, "**", "*")
|
136
141
|
LOGGER.debug("Searching for files in '%s'", input_path)
|
137
|
-
files_collection =
|
142
|
+
files_collection = get_io_file_manager().ls(target_dir, recursive=True)
|
138
143
|
|
139
144
|
files = [file for file in files_collection if filter_files(file)]
|
140
145
|
files_count = len(files)
|
@@ -152,17 +157,14 @@ class SnowConnection:
|
|
152
157
|
if not os.path.isabs(file)
|
153
158
|
else str(Path(file).resolve())
|
154
159
|
)
|
155
|
-
# Snowflake required URI format for input in the put.
|
156
|
-
normalize_file_path = Path(file_full_path).as_uri()
|
157
160
|
new_file_path = file_full_path.replace(input_path, folder_name)
|
158
161
|
# as Posix to convert Windows dir to posix
|
159
162
|
new_file_path = Path(new_file_path).as_posix()
|
160
163
|
stage_file_path = STAGE_PATH_FORMAT.format(stage_name, new_file_path)
|
161
|
-
|
162
|
-
|
163
|
-
)
|
164
|
+
parquet_file = get_io_file_manager().read_bytes(file_full_path)
|
165
|
+
binary_parquet = io.BytesIO(parquet_file)
|
164
166
|
LOGGER.info("Loading file '%s' to %s", file_full_path, stage_file_path)
|
165
|
-
self.session.
|
167
|
+
self.session.file.put_stream(binary_parquet, stage_file_path)
|
166
168
|
|
167
169
|
def create_table_from_parquet(
|
168
170
|
self, table_name: str, stage_directory_path: str
|
@@ -12,12 +12,9 @@
|
|
12
12
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
13
|
# See the License for the specific language governing permissions and
|
14
14
|
# limitations under the License.
|
15
|
-
|
16
|
-
import glob
|
17
15
|
import json
|
18
16
|
import logging
|
19
17
|
import os
|
20
|
-
import shutil
|
21
18
|
|
22
19
|
from typing import Optional
|
23
20
|
|
@@ -54,6 +51,9 @@ from snowflake.snowpark_checkpoints_collector.column_collection import (
|
|
54
51
|
from snowflake.snowpark_checkpoints_collector.column_pandera_checks import (
|
55
52
|
PanderaColumnChecksManager,
|
56
53
|
)
|
54
|
+
from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
|
55
|
+
get_io_file_manager,
|
56
|
+
)
|
57
57
|
from snowflake.snowpark_checkpoints_collector.snow_connection_model import (
|
58
58
|
SnowConnection,
|
59
59
|
)
|
@@ -117,11 +117,10 @@ def collect_dataframe_checkpoint(
|
|
117
117
|
"Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
|
118
118
|
)
|
119
119
|
if not is_checkpoint_enabled(normalized_checkpoint_name):
|
120
|
-
|
121
|
-
"Checkpoint '
|
122
|
-
|
120
|
+
raise Exception(
|
121
|
+
f"Checkpoint '{normalized_checkpoint_name}' is disabled. Please enable it in the checkpoints.json file.",
|
122
|
+
"In case you want to skip it, use the xcollect_dataframe_checkpoint method instead.",
|
123
123
|
)
|
124
|
-
return
|
125
124
|
|
126
125
|
LOGGER.info("Starting to collect checkpoint '%s'", normalized_checkpoint_name)
|
127
126
|
LOGGER.debug("DataFrame size: %s rows", df.count())
|
@@ -184,6 +183,68 @@ def collect_dataframe_checkpoint(
|
|
184
183
|
collection_point_result_manager.add_result(collection_point_result)
|
185
184
|
|
186
185
|
|
186
|
+
@log
def xcollect_dataframe_checkpoint(
    df: SparkDataFrame,
    checkpoint_name: str,
    sample: Optional[float] = None,
    mode: Optional[CheckpointMode] = None,
    output_path: Optional[str] = None,
) -> None:
    """Record a checkpoint as skipped instead of collecting it.

    The checkpoint name is normalized and validated; no metadata is gathered
    from the DataFrame. A ``CollectionResult.SKIP`` entry is registered with
    the collection point result manager.

    Args:
        df (SparkDataFrame): The Spark DataFrame attached to the checkpoint.
            Not read by this function.
        checkpoint_name (str): The name of the checkpoint.
        sample (float, optional): Not read by this function.
        mode (CheckpointMode, optional): Not read by this function.
        output_path (str, optional): Passed unchanged to
            ``CollectionPointResultManager``.

    Raises:
        Exception: Invalid checkpoint name. Checkpoint names must only contain
            alphanumeric characters, underscores and dollar signs.

    """
    normalized_name = checkpoint_name_utils.normalize_checkpoint_name(checkpoint_name)
    if normalized_name != checkpoint_name:
        LOGGER.warning(
            "Checkpoint name '%s' was normalized to '%s'",
            checkpoint_name,
            normalized_name,
        )

    # Guard clause: reject names that are still invalid after normalization.
    if not checkpoint_name_utils.is_valid_checkpoint_name(normalized_name):
        raise Exception(
            f"Invalid checkpoint name: {normalized_name}. "
            "Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
        )

    LOGGER.warning(
        "Checkpoint '%s' is disabled. Skipping collection.",
        normalized_name,
    )

    # Resolve the call-site location in the same order as before (file first,
    # then line), directly from this function's frame.
    source_file_path = file_utils.get_collection_point_source_file_path()
    source_line_of_code = file_utils.get_collection_point_line_of_code()

    skipped_result = CollectionPointResult(
        source_file_path,
        source_line_of_code,
        normalized_name,
    )
    skipped_result.result = CollectionResult.SKIP

    result_manager = CollectionPointResultManager(output_path)
    result_manager.add_result(skipped_result)
|
246
|
+
|
247
|
+
|
187
248
|
@report_telemetry(params_list=["column_type_dict"])
|
188
249
|
def _collect_dataframe_checkpoint_mode_schema(
|
189
250
|
checkpoint_name: str,
|
@@ -321,8 +382,7 @@ def _generate_json_checkpoint_file(
|
|
321
382
|
output_directory_path = file_utils.get_output_directory_path(output_path)
|
322
383
|
checkpoint_file_path = os.path.join(output_directory_path, checkpoint_file_name)
|
323
384
|
LOGGER.info("Writing DataFrame JSON schema file to '%s'", checkpoint_file_path)
|
324
|
-
|
325
|
-
f.write(dataframe_schema_contract)
|
385
|
+
get_io_file_manager().write(checkpoint_file_path, dataframe_schema_contract)
|
326
386
|
|
327
387
|
|
328
388
|
@report_telemetry(params_list=["df"])
|
@@ -366,17 +426,17 @@ def generate_parquet_for_spark_df(spark_df: SparkDataFrame, output_path: str) ->
|
|
366
426
|
]
|
367
427
|
converted_df = spark_df.select(new_cols)
|
368
428
|
|
369
|
-
if
|
429
|
+
if get_io_file_manager().folder_exists(output_path):
|
370
430
|
LOGGER.warning(
|
371
431
|
"Output directory '%s' already exists. Deleting it...", output_path
|
372
432
|
)
|
373
|
-
|
433
|
+
get_io_file_manager().remove_dir(output_path)
|
374
434
|
|
375
435
|
LOGGER.info("Writing DataFrame to parquet files at '%s'", output_path)
|
376
436
|
converted_df.write.parquet(output_path, mode="overwrite")
|
377
437
|
|
378
438
|
target_dir = os.path.join(output_path, "**", f"*{DOT_PARQUET_EXTENSION}")
|
379
|
-
parquet_files =
|
439
|
+
parquet_files = get_io_file_manager().ls(target_dir, recursive=True)
|
380
440
|
parquet_files_count = len(parquet_files)
|
381
441
|
if parquet_files_count == 0:
|
382
442
|
raise Exception("No parquet files were generated.")
|