snowpark-checkpoints-collectors 0.2.1__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/PKG-INFO +24 -1
  2. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/README.md +22 -0
  3. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/pyproject.toml +1 -0
  4. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/__init__.py +6 -1
  5. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/__version__.py +1 -1
  6. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py +1 -0
  7. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +4 -2
  8. snowpark_checkpoints_collectors-0.3.1/src/snowflake/snowpark_checkpoints_collector/io_utils/__init__.py +26 -0
  9. snowpark_checkpoints_collectors-0.3.1/src/snowflake/snowpark_checkpoints_collector/io_utils/io_default_strategy.py +61 -0
  10. snowpark_checkpoints_collectors-0.3.1/src/snowflake/snowpark_checkpoints_collector/io_utils/io_env_strategy.py +142 -0
  11. snowpark_checkpoints_collectors-0.3.1/src/snowflake/snowpark_checkpoints_collector/io_utils/io_file_manager.py +79 -0
  12. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +11 -9
  13. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +72 -12
  14. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/extra_config.py +46 -1
  15. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/file_utils.py +35 -8
  16. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/telemetry.py +67 -28
  17. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/telemetry_compare_utils.py +13 -1
  18. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1.py +83 -2
  19. snowpark_checkpoints_collectors-0.3.1/test/integ/test_collect_df_mode_1_expected/test_io_strategy.json +1 -0
  20. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2.py +87 -4
  21. snowpark_checkpoints_collectors-0.3.1/test/unit/io_utils/test_default_strategy.py +308 -0
  22. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_extra_config.py +36 -0
  23. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_file_utils.py +30 -16
  24. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_snow_connection.py +13 -16
  25. snowpark_checkpoints_collectors-0.3.1/test/unit/test_summary_stats_collector.py +100 -0
  26. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_collect_invalid_mode_telemetry.json +0 -18
  27. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_generate_parquet_for_spark_df_telemetry.json +0 -18
  28. snowpark_checkpoints_collectors-0.2.1/test/integ/test_collect_df_mode_2_expected/test_spark_df_mode_dataframe_telemetry.json +0 -18
  29. snowpark_checkpoints_collectors-0.2.1/test/unit/test_summary_stats_collector.py +0 -70
  30. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/.gitignore +0 -0
  31. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/CHANGELOG.md +0 -0
  32. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/LICENSE +0 -0
  33. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/snowpark-testdf-schema.json +0 -0
  34. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_common.py +0 -0
  35. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py +0 -0
  36. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/__init__.py +0 -0
  37. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py +0 -0
  38. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py +0 -0
  39. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/array_column_collector.py +0 -0
  40. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/binary_column_collector.py +0 -0
  41. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/boolean_column_collector.py +0 -0
  42. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/column_collector_base.py +0 -0
  43. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/date_column_collector.py +0 -0
  44. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/day_time_interval_column_collector.py +0 -0
  45. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/decimal_column_collector.py +0 -0
  46. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/empty_column_collector.py +0 -0
  47. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/map_column_collector.py +0 -0
  48. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/null_column_collector.py +0 -0
  49. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/numeric_column_collector.py +0 -0
  50. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/string_column_collector.py +0 -0
  51. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/struct_column_collector.py +0 -0
  52. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_column_collector.py +0 -0
  53. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py +0 -0
  54. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py +0 -0
  55. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py +0 -0
  56. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/singleton.py +0 -0
  57. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py +0 -0
  58. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py +0 -0
  59. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/src/snowflake/snowpark_checkpoints_collector/utils/logging_utils.py +0 -0
  60. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/.coveragerc +0 -0
  61. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_checkpoint_name.py +0 -0
  62. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values.json +0 -0
  63. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_all_column_types_with_null_values_telemetry.json +0 -0
  64. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type.json +0 -0
  65. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_dataframe_with_unsupported_pandera_column_type_telemetry.json +0 -0
  66. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values.json +0 -0
  67. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_null_values_telemetry.json +0 -0
  68. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values.json +0 -0
  69. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_df_with_only_null_values_telemetry.json +0 -0
  70. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column.json +0 -0
  71. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_object_column_telemetry.json +0 -0
  72. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema.json +0 -0
  73. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_empty_df_with_schema_telemetry.json +0 -0
  74. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df.json +0 -0
  75. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type.json +0 -0
  76. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_all_column_type_telemetry.json +0 -0
  77. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_1_expected/test_full_df_telemetry.json +0 -0
  78. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_parquet_directory _telemetry.json +0 -0
  79. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_checkpoint_mode_2_telemetry.json +0 -0
  80. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collect_df_mode_2_expected/test_collect_empty_dataframe_with_schema_telemetry.json +0 -0
  81. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_collection_result_file.py +0 -0
  82. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/integ/test_snow_connection_int.py +0 -0
  83. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_checkpoint_name_utils.py +0 -0
  84. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_collection_point_result.py +0 -0
  85. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_collection_point_result_manager.py +0 -0
  86. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_column_collection.py +0 -0
  87. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_logger.py +0 -0
  88. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_logging_utils.py +0 -0
  89. {snowpark_checkpoints_collectors-0.2.1 → snowpark_checkpoints_collectors-0.3.1}/test/unit/test_pandera_column_check_manager.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: snowpark-checkpoints-collectors
- Version: 0.2.1
+ Version: 0.3.1
  Summary: Snowpark column and table statistics collection
  Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
  Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -30,6 +30,7 @@ Requires-Dist: pandera[io]==0.20.4
  Requires-Dist: snowflake-connector-python
  Requires-Dist: snowflake-snowpark-python>=1.23.0
  Provides-Extra: development
+ Requires-Dist: certifi==2025.1.31; extra == 'development'
  Requires-Dist: coverage>=7.6.7; extra == 'development'
  Requires-Dist: deepdiff>=8.0.0; extra == 'development'
  Requires-Dist: hatchling==1.25.0; extra == 'development'
@@ -100,6 +101,28 @@ def collect_dataframe_checkpoint(
  - `output_path`: The output path to save the checkpoint, defaults to current working directory.


+ ### Skip DataFrame Checkpoint Collection
+
+
+
+ ```python
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+ from typing import Optional
+
+ # Signature of the function
+ def xcollect_dataframe_checkpoint(
+     df: SparkDataFrame,
+     checkpoint_name: str,
+     sample: Optional[float] = None,
+     mode: Optional[CheckpointMode] = None,
+     output_path: Optional[str] = None,
+ ) -> None:
+     ...
+ ```
+
+ The signature of the method is the same of `collect_dataframe_checkpoint`.
+
  ## Usage Example

  ### Schema mode
@@ -54,6 +54,28 @@ def collect_dataframe_checkpoint(
  - `output_path`: The output path to save the checkpoint, defaults to current working directory.


+ ### Skip DataFrame Checkpoint Collection
+
+
+
+ ```python
+ from pyspark.sql import DataFrame as SparkDataFrame
+ from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
+ from typing import Optional
+
+ # Signature of the function
+ def xcollect_dataframe_checkpoint(
+     df: SparkDataFrame,
+     checkpoint_name: str,
+     sample: Optional[float] = None,
+     mode: Optional[CheckpointMode] = None,
+     output_path: Optional[str] = None,
+ ) -> None:
+     ...
+ ```
+
+ The signature of the method is the same of `collect_dataframe_checkpoint`.
+
  ## Usage Example

  ### Schema mode
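
A minimal usage sketch of the new `xcollect_dataframe_checkpoint` entry point exported in this release. The SparkSession setup and sample data below are illustrative assumptions, not taken from the package docs:

```python
from pyspark.sql import SparkSession

from snowflake.snowpark_checkpoints_collector import (
    collect_dataframe_checkpoint,
    xcollect_dataframe_checkpoint,
)
from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode

# Illustrative setup: any Spark DataFrame works here.
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], schema="id int, code string")

# Collected checkpoint: schema and statistics are gathered and written out.
collect_dataframe_checkpoint(df, "demo_checkpoint", mode=CheckpointMode.SCHEMA)

# Skipped checkpoint: nothing is collected, but the checkpoint is still recorded
# with a SKIP result in the collection results file.
xcollect_dataframe_checkpoint(df, "demo_checkpoint_skipped", mode=CheckpointMode.SCHEMA)
```

The SKIP outcome comes from the new `CollectionResult.SKIP` value added in `collection_point_result.py` below.
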
@@ -60,6 +60,7 @@ development = [
  "pyarrow>=18.0.0",
  "deepdiff>=8.0.0",
  "pyspark>=3.5.0",
+ "certifi==2025.1.31",
  ]

  [project.urls]
@@ -22,9 +22,14 @@ logging.getLogger(__name__).addHandler(logging.NullHandler())

  # ruff: noqa: E402

- __all__ = ["collect_dataframe_checkpoint", "CheckpointMode"]
+ __all__ = [
+     "collect_dataframe_checkpoint",
+     "CheckpointMode",
+     "xcollect_dataframe_checkpoint",
+ ]

  from snowflake.snowpark_checkpoints_collector.collection_common import CheckpointMode
  from snowflake.snowpark_checkpoints_collector.summary_stats_collector import (
      collect_dataframe_checkpoint,
+     xcollect_dataframe_checkpoint,
  )
@@ -13,4 +13,4 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- __version__ = "0.2.1"
+ __version__ = "0.3.1"
@@ -30,6 +30,7 @@ CHECKPOINT_NAME_KEY = "checkpoint_name"
  class CollectionResult(Enum):
      FAIL = "FAIL"
      PASS = "PASS"
+     SKIP = "SKIP"


  class CollectionPointResult:
@@ -21,6 +21,9 @@ from typing import Optional
  from snowflake.snowpark_checkpoints_collector.collection_result.model import (
      CollectionPointResult,
  )
+ from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+     get_io_file_manager,
+ )
  from snowflake.snowpark_checkpoints_collector.singleton import Singleton
  from snowflake.snowpark_checkpoints_collector.utils import file_utils

@@ -70,5 +73,4 @@ class CollectionPointResultManager(metaclass=Singleton):
      def _save_result(self) -> None:
          result_collection_json = self.to_json()
          LOGGER.info("Saving collection results to '%s'", self.output_file_path)
-         with open(self.output_file_path, "w") as f:
-             f.write(result_collection_json)
+         get_io_file_manager().write(self.output_file_path, result_collection_json)
@@ -0,0 +1,26 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ __all__ = ["EnvStrategy", "IOFileManager", "IODefaultStrategy"]
+
+ from snowflake.snowpark_checkpoints_collector.io_utils.io_env_strategy import (
+     EnvStrategy,
+ )
+ from snowflake.snowpark_checkpoints_collector.io_utils.io_default_strategy import (
+     IODefaultStrategy,
+ )
+ from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+     IOFileManager,
+ )
@@ -0,0 +1,61 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import glob
+ import os
+ import shutil
+
+ from pathlib import Path
+ from typing import Optional
+
+ from snowflake.snowpark_checkpoints_collector.io_utils import EnvStrategy
+
+
+ class IODefaultStrategy(EnvStrategy):
+     def mkdir(self, path: str, exist_ok: bool = False) -> None:
+         os.makedirs(path, exist_ok=exist_ok)
+
+     def folder_exists(self, path: str) -> bool:
+         return os.path.isdir(path)
+
+     def file_exists(self, path: str) -> bool:
+         return os.path.isfile(path)
+
+     def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
+         mode = "w" if overwrite else "x"
+         with open(file_path, mode) as file:
+             file.write(file_content)
+
+     def read(
+         self, file_path: str, mode: str = "r", encoding: Optional[str] = None
+     ) -> str:
+         with open(file_path, mode=mode, encoding=encoding) as file:
+             return file.read()
+
+     def read_bytes(self, file_path: str) -> bytes:
+         with open(file_path, mode="rb") as f:
+             return f.read()
+
+     def ls(self, path: str, recursive: bool = False) -> list[str]:
+         return glob.glob(path, recursive=recursive)
+
+     def getcwd(self) -> str:
+         return os.getcwd()
+
+     def remove_dir(self, path: str) -> None:
+         shutil.rmtree(path)
+
+     def telemetry_path_files(self, path: str) -> Path:
+         return Path(path)
@@ -0,0 +1,142 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Optional
+
+
+ class EnvStrategy(ABC):
+
+     """An abstract base class that defines methods for file and directory operations.
+
+     Subclasses should implement these methods to provide environment-specific behavior.
+     """
+
+     @abstractmethod
+     def mkdir(self, path: str, exist_ok: bool = False) -> None:
+         """Create a directory.
+
+         Args:
+             path: The name of the directory to create.
+             exist_ok: If False, an error is raised if the directory already exists.
+
+         """
+
+     @abstractmethod
+     def folder_exists(self, path: str) -> bool:
+         """Check if a folder exists.
+
+         Args:
+             path: The path to the folder.
+
+         Returns:
+             bool: True if the folder exists, False otherwise.
+
+         """
+
+     @abstractmethod
+     def file_exists(self, path: str) -> bool:
+         """Check if a file exists.
+
+         Args:
+             path: The path to the file.
+
+         Returns:
+             bool: True if the file exists, False otherwise.
+
+         """
+
+     @abstractmethod
+     def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
+         """Write content to a file.
+
+         Args:
+             file_path: The name of the file to write to.
+             file_content: The content to write to the file.
+             overwrite: If True, overwrite the file if it exists.
+
+         """
+
+     @abstractmethod
+     def read(
+         self, file_path: str, mode: str = "r", encoding: Optional[str] = None
+     ) -> str:
+         """Read content from a file.
+
+         Args:
+             file_path: The path to the file to read from.
+             mode: The mode in which to open the file.
+             encoding: The encoding to use for reading the file.
+
+         Returns:
+             str: The content of the file.
+
+         """
+
+     @abstractmethod
+     def read_bytes(self, file_path: str) -> bytes:
+         """Read binary content from a file.
+
+         Args:
+             file_path: The path to the file to read from.
+
+         Returns:
+             bytes: The binary content of the file.
+
+         """
+
+     @abstractmethod
+     def ls(self, path: str, recursive: bool = False) -> list[str]:
+         """List the contents of a directory.
+
+         Args:
+             path: The path to the directory.
+             recursive: If True, list the contents recursively.
+
+         Returns:
+             list[str]: A list of the contents of the directory.
+
+         """
+
+     @abstractmethod
+     def getcwd(self) -> str:
+         """Get the current working directory.
+
+         Returns:
+             str: The current working directory.
+
+         """
+
+     @abstractmethod
+     def remove_dir(self, path: str) -> None:
+         """Remove a directory and all its contents.
+
+         Args:
+             path: The path to the directory to remove.
+
+         """
+
+     @abstractmethod
+     def telemetry_path_files(self, path: str) -> Path:
+         """Get the path to the telemetry files.
+
+         Args:
+             path: The path to the telemetry directory.
+
+         Returns:
+             Path: The path object representing the telemetry files.
+
+         """
@@ -0,0 +1,79 @@
+ # Copyright 2025 Snowflake Inc.
+ # SPDX-License-Identifier: Apache-2.0
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from pathlib import Path
+ from typing import Optional
+
+ from snowflake.snowpark_checkpoints_collector.io_utils import (
+     EnvStrategy,
+     IODefaultStrategy,
+ )
+ from snowflake.snowpark_checkpoints_collector.singleton import Singleton
+
+
+ class IOFileManager(metaclass=Singleton):
+     def __init__(self, strategy: Optional[EnvStrategy] = None):
+         self.strategy = strategy or IODefaultStrategy()
+
+     def mkdir(self, path: str, exist_ok: bool = False) -> None:
+         return self.strategy.mkdir(path, exist_ok)
+
+     def folder_exists(self, path: str) -> bool:
+         return self.strategy.folder_exists(path)
+
+     def file_exists(self, path: str) -> bool:
+         return self.strategy.file_exists(path)
+
+     def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
+         return self.strategy.write(file_path, file_content, overwrite)
+
+     def read(
+         self, file_path: str, mode: str = "r", encoding: Optional[str] = None
+     ) -> str:
+         return self.strategy.read(file_path, mode, encoding)
+
+     def read_bytes(self, file_path: str) -> bytes:
+         return self.strategy.read_bytes(file_path)
+
+     def ls(self, path: str, recursive: bool = False) -> list[str]:
+         return self.strategy.ls(path, recursive)
+
+     def getcwd(self) -> str:
+         return self.strategy.getcwd()
+
+     def remove_dir(self, path: str) -> None:
+         return self.strategy.remove_dir(path)
+
+     def telemetry_path_files(self, path: str) -> Path:
+         return self.strategy.telemetry_path_files(path)
+
+     def set_strategy(self, strategy: EnvStrategy):
+         """Set the strategy for file and directory operations.
+
+         Args:
+             strategy (EnvStrategy): The strategy to use for file and directory operations.
+
+         """
+         self.strategy = strategy
+
+
+ def get_io_file_manager():
+     """Get the singleton instance of IOFileManager.
+
+     Returns:
+         IOFileManager: The singleton instance of IOFileManager.
+
+     """
+     return IOFileManager()
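
The new `io_utils` package is a strategy pattern around filesystem access: `IOFileManager` is a singleton that delegates every call to an `EnvStrategy`, with `IODefaultStrategy` as the local-filesystem default, and `set_strategy` swaps the backend at runtime. A hypothetical custom strategy is sketched below; the class name, sandbox path, and override are illustrative assumptions, not part of the package:

```python
import os

from snowflake.snowpark_checkpoints_collector.io_utils import IODefaultStrategy
from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
    get_io_file_manager,
)


class SandboxWriteStrategy(IODefaultStrategy):
    """Redirects writes into a sandbox directory; everything else stays default."""

    def __init__(self, root: str):
        self.root = root
        os.makedirs(root, exist_ok=True)

    def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
        # Re-root the output path, then delegate to the default local-filesystem write.
        sandboxed = os.path.join(self.root, os.path.basename(file_path))
        super().write(sandboxed, file_content, overwrite)


# IOFileManager is a Singleton, so this swaps the strategy for all subsequent
# collector I/O in the process.
get_io_file_manager().set_strategy(SandboxWriteStrategy("/tmp/checkpoint_sandbox"))
```

Subclassing `IODefaultStrategy` inherits working local-filesystem behavior, so only the operations that need to change have to be overridden; implementing `EnvStrategy` directly requires providing all of its abstract methods.
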
@@ -13,7 +13,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- import glob
+ import io
  import logging
  import os.path
  import time
@@ -25,6 +25,9 @@ from snowflake.snowpark import Session
  from snowflake.snowpark_checkpoints_collector.collection_common import (
      DOT_PARQUET_EXTENSION,
  )
+ from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+     get_io_file_manager,
+ )


  STAGE_NAME = "CHECKPOINT_STAGE"
@@ -130,11 +133,13 @@
          )

          def filter_files(name: str):
-             return os.path.isfile(name) and (filter_func(name) if filter_func else True)
+             return get_io_file_manager().file_exists(name) and (
+                 filter_func(name) if filter_func else True
+             )

          target_dir = os.path.join(input_path, "**", "*")
          LOGGER.debug("Searching for files in '%s'", input_path)
-         files_collection = glob.glob(target_dir, recursive=True)
+         files_collection = get_io_file_manager().ls(target_dir, recursive=True)

          files = [file for file in files_collection if filter_files(file)]
          files_count = len(files)
@@ -152,17 +157,14 @@
                  if not os.path.isabs(file)
                  else str(Path(file).resolve())
              )
-             # Snowflake required URI format for input in the put.
-             normalize_file_path = Path(file_full_path).as_uri()
              new_file_path = file_full_path.replace(input_path, folder_name)
              # as Posix to convert Windows dir to posix
              new_file_path = Path(new_file_path).as_posix()
              stage_file_path = STAGE_PATH_FORMAT.format(stage_name, new_file_path)
-             put_statement = PUT_FILE_IN_STAGE_STATEMENT_FORMAT.format(
-                 normalize_file_path, stage_file_path
-             )
+             parquet_file = get_io_file_manager().read_bytes(file_full_path)
+             binary_parquet = io.BytesIO(parquet_file)
              LOGGER.info("Loading file '%s' to %s", file_full_path, stage_file_path)
-             self.session.sql(put_statement).collect()
+             self.session.file.put_stream(binary_parquet, stage_file_path)

      def create_table_from_parquet(
          self, table_name: str, stage_directory_path: str
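
The upload path in `load_files_to_stage` changes from issuing a `PUT` statement over a `file://` URI to reading the parquet bytes through the IO abstraction and streaming them with `Session.file.put_stream`. A standalone sketch of that call sequence, with made-up stage and file names for illustration:

```python
import io

from snowflake.snowpark import Session

from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
    get_io_file_manager,
)

# Assumes Snowflake connection parameters are available to the Session builder.
session = Session.builder.getOrCreate()

# Read the local parquet file through the strategy-backed file manager...
parquet_bytes = get_io_file_manager().read_bytes("/tmp/checkpoint/part-00000.parquet")

# ...and stream it to the stage, with no file-path-to-URI conversion needed.
session.file.put_stream(
    io.BytesIO(parquet_bytes),
    "@CHECKPOINT_STAGE/my_checkpoint/part-00000.parquet",
)
```

Streaming the bytes is presumably why the old `Path(...).as_uri()` normalization could be dropped, and it routes every local read through the swappable `EnvStrategy`.
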
@@ -12,12 +12,9 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
-
- import glob
  import json
  import logging
  import os
- import shutil

  from typing import Optional

@@ -54,6 +51,9 @@ from snowflake.snowpark_checkpoints_collector.column_collection import (
  from snowflake.snowpark_checkpoints_collector.column_pandera_checks import (
      PanderaColumnChecksManager,
  )
+ from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+     get_io_file_manager,
+ )
  from snowflake.snowpark_checkpoints_collector.snow_connection_model import (
      SnowConnection,
  )
@@ -117,11 +117,10 @@
              "Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
          )
      if not is_checkpoint_enabled(normalized_checkpoint_name):
-         LOGGER.info(
-             "Checkpoint '%s' is disabled. Skipping collection.",
-             normalized_checkpoint_name,
+         raise Exception(
+             f"Checkpoint '{normalized_checkpoint_name}' is disabled. Please enable it in the checkpoints.json file.",
+             "In case you want to skip it, use the xcollect_dataframe_checkpoint method instead.",
          )
-         return

      LOGGER.info("Starting to collect checkpoint '%s'", normalized_checkpoint_name)
      LOGGER.debug("DataFrame size: %s rows", df.count())
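
The hunk above turns a disabled checkpoint from a silent skip into a hard error, so callers that relied on the old behavior need an explicit branch, typically using the `xcollect_dataframe_checkpoint` function added in the next hunk. A hypothetical migration sketch; the `enabled` flag is an application-level stand-in, not a package API:

```python
from snowflake.snowpark_checkpoints_collector import (
    collect_dataframe_checkpoint,
    xcollect_dataframe_checkpoint,
)


def checkpoint_df(df, name: str, enabled: bool) -> None:
    # In 0.2.1, collect_dataframe_checkpoint quietly returned when the checkpoint
    # was disabled in checkpoints.json; in 0.3.1 it raises, so the skip is explicit.
    if enabled:
        collect_dataframe_checkpoint(df, name)
    else:
        xcollect_dataframe_checkpoint(df, name)  # records the checkpoint as SKIP
```
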
@@ -184,6 +183,68 @@
      collection_point_result_manager.add_result(collection_point_result)


+ @log
+ def xcollect_dataframe_checkpoint(
+     df: SparkDataFrame,
+     checkpoint_name: str,
+     sample: Optional[float] = None,
+     mode: Optional[CheckpointMode] = None,
+     output_path: Optional[str] = None,
+ ) -> None:
+     """Skips the collection of metadata from a Dataframe checkpoint.
+
+     Args:
+         df (SparkDataFrame): The input Spark DataFrame to skip.
+         checkpoint_name (str): The name of the checkpoint.
+         sample (float, optional): Fraction of DataFrame to sample for schema inference.
+             Defaults to 1.0.
+         mode (CheckpointMode): The mode to execution the collection.
+             Defaults to CheckpointMode.Schema
+         output_path (str, optional): The output path to save the checkpoint.
+             Defaults to Current working Directory.
+
+     Raises:
+         Exception: Invalid mode value.
+         Exception: Invalid checkpoint name. Checkpoint names must only contain alphanumeric characters,
+             underscores and dollar signs.
+
+     """
+     normalized_checkpoint_name = checkpoint_name_utils.normalize_checkpoint_name(
+         checkpoint_name
+     )
+     if normalized_checkpoint_name != checkpoint_name:
+         LOGGER.warning(
+             "Checkpoint name '%s' was normalized to '%s'",
+             checkpoint_name,
+             normalized_checkpoint_name,
+         )
+     is_valid_checkpoint_name = checkpoint_name_utils.is_valid_checkpoint_name(
+         normalized_checkpoint_name
+     )
+     if not is_valid_checkpoint_name:
+         raise Exception(
+             f"Invalid checkpoint name: {normalized_checkpoint_name}. "
+             "Checkpoint names must only contain alphanumeric characters, underscores and dollar signs."
+         )
+
+     LOGGER.warning(
+         "Checkpoint '%s' is disabled. Skipping collection.",
+         normalized_checkpoint_name,
+     )
+
+     collection_point_file_path = file_utils.get_collection_point_source_file_path()
+     collection_point_line_of_code = file_utils.get_collection_point_line_of_code()
+     collection_point_result = CollectionPointResult(
+         collection_point_file_path,
+         collection_point_line_of_code,
+         normalized_checkpoint_name,
+     )
+
+     collection_point_result.result = CollectionResult.SKIP
+     collection_point_result_manager = CollectionPointResultManager(output_path)
+     collection_point_result_manager.add_result(collection_point_result)
+
+
  @report_telemetry(params_list=["column_type_dict"])
  def _collect_dataframe_checkpoint_mode_schema(
      checkpoint_name: str,
@@ -321,8 +382,7 @@
      output_directory_path = file_utils.get_output_directory_path(output_path)
      checkpoint_file_path = os.path.join(output_directory_path, checkpoint_file_name)
      LOGGER.info("Writing DataFrame JSON schema file to '%s'", checkpoint_file_path)
-     with open(checkpoint_file_path, "w") as f:
-         f.write(dataframe_schema_contract)
+     get_io_file_manager().write(checkpoint_file_path, dataframe_schema_contract)


  @report_telemetry(params_list=["df"])
@@ -366,17 +426,17 @@ def generate_parquet_for_spark_df(spark_df: SparkDataFrame, output_path: str) ->
      ]
      converted_df = spark_df.select(new_cols)

-     if os.path.exists(output_path):
+     if get_io_file_manager().folder_exists(output_path):
          LOGGER.warning(
              "Output directory '%s' already exists. Deleting it...", output_path
          )
-         shutil.rmtree(output_path)
+         get_io_file_manager().remove_dir(output_path)

      LOGGER.info("Writing DataFrame to parquet files at '%s'", output_path)
      converted_df.write.parquet(output_path, mode="overwrite")

      target_dir = os.path.join(output_path, "**", f"*{DOT_PARQUET_EXTENSION}")
-     parquet_files = glob.glob(target_dir, recursive=True)
+     parquet_files = get_io_file_manager().ls(target_dir, recursive=True)
      parquet_files_count = len(parquet_files)
      if parquet_files_count == 0:
          raise Exception("No parquet files were generated.")