snowpark-checkpoints-collectors 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_checkpoints_collector/__version__.py +1 -1
- snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py +4 -2
- snowflake/snowpark_checkpoints_collector/io_utils/__init__.py +26 -0
- snowflake/snowpark_checkpoints_collector/io_utils/io_default_strategy.py +61 -0
- snowflake/snowpark_checkpoints_collector/io_utils/io_env_strategy.py +142 -0
- snowflake/snowpark_checkpoints_collector/io_utils/io_file_manager.py +79 -0
- snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py +11 -9
- snowflake/snowpark_checkpoints_collector/summary_stats_collector.py +7 -8
- snowflake/snowpark_checkpoints_collector/utils/extra_config.py +46 -1
- snowflake/snowpark_checkpoints_collector/utils/file_utils.py +9 -4
- snowflake/snowpark_checkpoints_collector/utils/telemetry.py +67 -28
- {snowpark_checkpoints_collectors-0.2.1.dist-info → snowpark_checkpoints_collectors-0.3.0.dist-info}/METADATA +2 -1
- {snowpark_checkpoints_collectors-0.2.1.dist-info → snowpark_checkpoints_collectors-0.3.0.dist-info}/RECORD +15 -11
- {snowpark_checkpoints_collectors-0.2.1.dist-info → snowpark_checkpoints_collectors-0.3.0.dist-info}/WHEEL +0 -0
- {snowpark_checkpoints_collectors-0.2.1.dist-info → snowpark_checkpoints_collectors-0.3.0.dist-info}/licenses/LICENSE +0 -0
snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py
CHANGED
@@ -21,6 +21,9 @@ from typing import Optional
 from snowflake.snowpark_checkpoints_collector.collection_result.model import (
     CollectionPointResult,
 )
+from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+    get_io_file_manager,
+)
 from snowflake.snowpark_checkpoints_collector.singleton import Singleton
 from snowflake.snowpark_checkpoints_collector.utils import file_utils
 
@@ -70,5 +73,4 @@ class CollectionPointResultManager(metaclass=Singleton):
     def _save_result(self) -> None:
         result_collection_json = self.to_json()
         LOGGER.info("Saving collection results to '%s'", self.output_file_path)
-
-            f.write(result_collection_json)
+        get_io_file_manager().write(self.output_file_path, result_collection_json)

snowflake/snowpark_checkpoints_collector/io_utils/__init__.py
ADDED
@@ -0,0 +1,26 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ["EnvStrategy", "IOFileManager", "IODefaultStrategy"]
+
+from snowflake.snowpark_checkpoints_collector.io_utils.io_env_strategy import (
+    EnvStrategy,
+)
+from snowflake.snowpark_checkpoints_collector.io_utils.io_default_strategy import (
+    IODefaultStrategy,
+)
+from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+    IOFileManager,
+)

snowflake/snowpark_checkpoints_collector/io_utils/io_default_strategy.py
ADDED
@@ -0,0 +1,61 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+import shutil
+
+from pathlib import Path
+from typing import Optional
+
+from snowflake.snowpark_checkpoints_collector.io_utils import EnvStrategy
+
+
+class IODefaultStrategy(EnvStrategy):
+    def mkdir(self, path: str, exist_ok: bool = False) -> None:
+        os.makedirs(path, exist_ok=exist_ok)
+
+    def folder_exists(self, path: str) -> bool:
+        return os.path.isdir(path)
+
+    def file_exists(self, path: str) -> bool:
+        return os.path.isfile(path)
+
+    def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
+        mode = "w" if overwrite else "x"
+        with open(file_path, mode) as file:
+            file.write(file_content)
+
+    def read(
+        self, file_path: str, mode: str = "r", encoding: Optional[str] = None
+    ) -> str:
+        with open(file_path, mode=mode, encoding=encoding) as file:
+            return file.read()
+
+    def read_bytes(self, file_path: str) -> bytes:
+        with open(file_path, mode="rb") as f:
+            return f.read()
+
+    def ls(self, path: str, recursive: bool = False) -> list[str]:
+        return glob.glob(path, recursive=recursive)
+
+    def getcwd(self) -> str:
+        return os.getcwd()
+
+    def remove_dir(self, path: str) -> None:
+        shutil.rmtree(path)
+
+    def telemetry_path_files(self, path: str) -> Path:
+        return Path(path)

snowflake/snowpark_checkpoints_collector/io_utils/io_env_strategy.py
ADDED
@@ -0,0 +1,142 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Optional
+
+
+class EnvStrategy(ABC):
+
+    """An abstract base class that defines methods for file and directory operations.
+
+    Subclasses should implement these methods to provide environment-specific behavior.
+    """
+
+    @abstractmethod
+    def mkdir(self, path: str, exist_ok: bool = False) -> None:
+        """Create a directory.
+
+        Args:
+            path: The name of the directory to create.
+            exist_ok: If False, an error is raised if the directory already exists.
+
+        """
+
+    @abstractmethod
+    def folder_exists(self, path: str) -> bool:
+        """Check if a folder exists.
+
+        Args:
+            path: The path to the folder.
+
+        Returns:
+            bool: True if the folder exists, False otherwise.
+
+        """
+
+    @abstractmethod
+    def file_exists(self, path: str) -> bool:
+        """Check if a file exists.
+
+        Args:
+            path: The path to the file.
+
+        Returns:
+            bool: True if the file exists, False otherwise.
+
+        """
+
+    @abstractmethod
+    def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
+        """Write content to a file.
+
+        Args:
+            file_path: The name of the file to write to.
+            file_content: The content to write to the file.
+            overwrite: If True, overwrite the file if it exists.
+
+        """
+
+    @abstractmethod
+    def read(
+        self, file_path: str, mode: str = "r", encoding: Optional[str] = None
+    ) -> str:
+        """Read content from a file.
+
+        Args:
+            file_path: The path to the file to read from.
+            mode: The mode in which to open the file.
+            encoding: The encoding to use for reading the file.
+
+        Returns:
+            str: The content of the file.
+
+        """
+
+    @abstractmethod
+    def read_bytes(self, file_path: str) -> bytes:
+        """Read binary content from a file.
+
+        Args:
+            file_path: The path to the file to read from.
+
+        Returns:
+            bytes: The binary content of the file.
+
+        """
+
+    @abstractmethod
+    def ls(self, path: str, recursive: bool = False) -> list[str]:
+        """List the contents of a directory.
+
+        Args:
+            path: The path to the directory.
+            recursive: If True, list the contents recursively.
+
+        Returns:
+            list[str]: A list of the contents of the directory.
+
+        """
+
+    @abstractmethod
+    def getcwd(self) -> str:
+        """Get the current working directory.
+
+        Returns:
+            str: The current working directory.
+
+        """
+
+    @abstractmethod
+    def remove_dir(self, path: str) -> None:
+        """Remove a directory and all its contents.
+
+        Args:
+            path: The path to the directory to remove.
+
+        """
+
+    @abstractmethod
+    def telemetry_path_files(self, path: str) -> Path:
+        """Get the path to the telemetry files.
+
+        Args:
+            path: The path to the telemetry directory.
+
+        Returns:
+            Path: The path object representing the telemetry files.
+
+        """

snowflake/snowpark_checkpoints_collector/io_utils/io_file_manager.py
ADDED
@@ -0,0 +1,79 @@
+# Copyright 2025 Snowflake Inc.
+# SPDX-License-Identifier: Apache-2.0
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from typing import Optional
+
+from snowflake.snowpark_checkpoints_collector.io_utils import (
+    EnvStrategy,
+    IODefaultStrategy,
+)
+from snowflake.snowpark_checkpoints_collector.singleton import Singleton
+
+
+class IOFileManager(metaclass=Singleton):
+    def __init__(self, strategy: Optional[EnvStrategy] = None):
+        self.strategy = strategy or IODefaultStrategy()
+
+    def mkdir(self, path: str, exist_ok: bool = False) -> None:
+        return self.strategy.mkdir(path, exist_ok)
+
+    def folder_exists(self, path: str) -> bool:
+        return self.strategy.folder_exists(path)
+
+    def file_exists(self, path: str) -> bool:
+        return self.strategy.file_exists(path)
+
+    def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
+        return self.strategy.write(file_path, file_content, overwrite)
+
+    def read(
+        self, file_path: str, mode: str = "r", encoding: Optional[str] = None
+    ) -> str:
+        return self.strategy.read(file_path, mode, encoding)
+
+    def read_bytes(self, file_path: str) -> bytes:
+        return self.strategy.read_bytes(file_path)
+
+    def ls(self, path: str, recursive: bool = False) -> list[str]:
+        return self.strategy.ls(path, recursive)
+
+    def getcwd(self) -> str:
+        return self.strategy.getcwd()
+
+    def remove_dir(self, path: str) -> None:
+        return self.strategy.remove_dir(path)
+
+    def telemetry_path_files(self, path: str) -> Path:
+        return self.strategy.telemetry_path_files(path)
+
+    def set_strategy(self, strategy: EnvStrategy):
+        """Set the strategy for file and directory operations.
+
+        Args:
+            strategy (EnvStrategy): The strategy to use for file and directory operations.
+
+        """
+        self.strategy = strategy
+
+
+def get_io_file_manager():
+    """Get the singleton instance of IOFileManager.
+
+    Returns:
+        IOFileManager: The singleton instance of IOFileManager.
+
+    """
+    return IOFileManager()
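
Note on the new io_utils API: the collector now routes local file access through IOFileManager, a Singleton that delegates to an EnvStrategy (IODefaultStrategy unless overridden). The sketch below is illustrative only and not part of the package; the InMemoryWriteStrategy name is invented here, and it assumes snowpark-checkpoints-collectors 0.3.0 is installed. It shows how a caller could plug in a custom strategy via set_strategy() so that collector writes are captured in memory instead of written to disk.

# Hypothetical usage sketch for the io_utils strategy hook added in 0.3.0.
from snowflake.snowpark_checkpoints_collector.io_utils import IODefaultStrategy
from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
    get_io_file_manager,
)


class InMemoryWriteStrategy(IODefaultStrategy):
    # Capture writes in a dict instead of touching disk; every other
    # operation falls back to the default os/glob/shutil behavior.
    def __init__(self):
        self.files: dict[str, str] = {}

    def write(self, file_path: str, file_content: str, overwrite: bool = True) -> None:
        self.files[file_path] = file_content

    def file_exists(self, path: str) -> bool:
        return path in self.files or super().file_exists(path)


# IOFileManager is a Singleton, so every component that calls
# get_io_file_manager() from here on routes its IO through this strategy.
strategy = InMemoryWriteStrategy()
get_io_file_manager().set_strategy(strategy)

get_io_file_manager().write("checkpoint.json", '{"columns": []}')
print(strategy.files["checkpoint.json"])

The extra_config.py hunk later in this diff (_set_conf_io_strategy) forwards a non-default strategy like this one to snowpark-checkpoints-configuration as well, when that package is installed.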

snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py
CHANGED
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import
+import io
 import logging
 import os.path
 import time
@@ -25,6 +25,9 @@ from snowflake.snowpark import Session
 from snowflake.snowpark_checkpoints_collector.collection_common import (
     DOT_PARQUET_EXTENSION,
 )
+from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+    get_io_file_manager,
+)
 
 
 STAGE_NAME = "CHECKPOINT_STAGE"
@@ -130,11 +133,13 @@ class SnowConnection:
         )
 
         def filter_files(name: str):
-            return
+            return get_io_file_manager().file_exists(name) and (
+                filter_func(name) if filter_func else True
+            )
 
         target_dir = os.path.join(input_path, "**", "*")
         LOGGER.debug("Searching for files in '%s'", input_path)
-        files_collection =
+        files_collection = get_io_file_manager().ls(target_dir, recursive=True)
 
         files = [file for file in files_collection if filter_files(file)]
         files_count = len(files)
@@ -152,17 +157,14 @@ class SnowConnection:
                 if not os.path.isabs(file)
                 else str(Path(file).resolve())
             )
-            # Snowflake required URI format for input in the put.
-            normalize_file_path = Path(file_full_path).as_uri()
             new_file_path = file_full_path.replace(input_path, folder_name)
             # as Posix to convert Windows dir to posix
             new_file_path = Path(new_file_path).as_posix()
             stage_file_path = STAGE_PATH_FORMAT.format(stage_name, new_file_path)
-
-
-            )
+            parquet_file = get_io_file_manager().read_bytes(file_full_path)
+            binary_parquet = io.BytesIO(parquet_file)
             LOGGER.info("Loading file '%s' to %s", file_full_path, stage_file_path)
-            self.session.
+            self.session.file.put_stream(binary_parquet, stage_file_path)
 
     def create_table_from_parquet(
         self, table_name: str, stage_directory_path: str

snowflake/snowpark_checkpoints_collector/summary_stats_collector.py
CHANGED
@@ -12,12 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-import glob
 import json
 import logging
 import os
-import shutil
 
 from typing import Optional
 
@@ -54,6 +51,9 @@ from snowflake.snowpark_checkpoints_collector.column_collection import (
 from snowflake.snowpark_checkpoints_collector.column_pandera_checks import (
     PanderaColumnChecksManager,
 )
+from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+    get_io_file_manager,
+)
 from snowflake.snowpark_checkpoints_collector.snow_connection_model import (
     SnowConnection,
 )
@@ -321,8 +321,7 @@ def _generate_json_checkpoint_file(
     output_directory_path = file_utils.get_output_directory_path(output_path)
     checkpoint_file_path = os.path.join(output_directory_path, checkpoint_file_name)
     LOGGER.info("Writing DataFrame JSON schema file to '%s'", checkpoint_file_path)
-
-        f.write(dataframe_schema_contract)
+    get_io_file_manager().write(checkpoint_file_path, dataframe_schema_contract)
 
 
 @report_telemetry(params_list=["df"])
@@ -366,17 +365,17 @@ def generate_parquet_for_spark_df(spark_df: SparkDataFrame, output_path: str) ->
     ]
     converted_df = spark_df.select(new_cols)
 
-    if
+    if get_io_file_manager().folder_exists(output_path):
         LOGGER.warning(
             "Output directory '%s' already exists. Deleting it...", output_path
         )
-
+        get_io_file_manager().remove_dir(output_path)
 
     LOGGER.info("Writing DataFrame to parquet files at '%s'", output_path)
     converted_df.write.parquet(output_path, mode="overwrite")
 
     target_dir = os.path.join(output_path, "**", f"*{DOT_PARQUET_EXTENSION}")
-    parquet_files =
+    parquet_files = get_io_file_manager().ls(target_dir, recursive=True)
     parquet_files_count = len(parquet_files)
     if parquet_files_count == 0:
         raise Exception("No parquet files were generated.")

snowflake/snowpark_checkpoints_collector/utils/extra_config.py
CHANGED
@@ -22,13 +22,57 @@ from snowflake.snowpark_checkpoints_collector.collection_common import (
     SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR,
     CheckpointMode,
 )
+from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+    get_io_file_manager,
+)
 
 
 LOGGER = logging.getLogger(__name__)
 
 # noinspection DuplicatedCode
 def _get_checkpoint_contract_file_path() -> str:
-    return os.environ.get(
+    return os.environ.get(
+        SNOWFLAKE_CHECKPOINT_CONTRACT_FILE_PATH_ENV_VAR, get_io_file_manager().getcwd()
+    )
+
+
+def _set_conf_io_strategy() -> None:
+    try:
+        from snowflake.snowpark_checkpoints_collector.io_utils.io_default_strategy import (
+            IODefaultStrategy,
+        )
+        from snowflake.snowpark_checkpoints_configuration.io_utils.io_file_manager import (
+            EnvStrategy as ConfEnvStrategy,
+        )
+        from snowflake.snowpark_checkpoints_configuration.io_utils.io_file_manager import (
+            get_io_file_manager as get_conf_io_file_manager,
+        )
+
+        is_default_strategy = isinstance(
+            get_io_file_manager().strategy, IODefaultStrategy
+        )
+
+        if is_default_strategy:
+            return
+
+        class CustomConfEnvStrategy(ConfEnvStrategy):
+            def file_exists(self, path: str) -> bool:
+                return get_io_file_manager().file_exists(path)
+
+            def read(
+                self, file_path: str, mode: str = "r", encoding: Optional[str] = None
+            ) -> Optional[str]:
+                return get_io_file_manager().read(file_path, mode, encoding)
+
+            def getcwd(self) -> str:
+                return get_io_file_manager().getcwd()
+
+        get_conf_io_file_manager().set_strategy(CustomConfEnvStrategy())
+
+    except ImportError:
+        LOGGER.debug(
+            "snowpark-checkpoints-configuration is not installed. Cannot get a checkpoint metadata instance."
+        )
 
 
 # noinspection DuplicatedCode
@@ -38,6 +82,7 @@ def _get_metadata():
         CheckpointMetadata,
     )
 
+    _set_conf_io_strategy()
     path = _get_checkpoint_contract_file_path()
    LOGGER.debug("Loading checkpoint metadata from '%s'", path)
    metadata = CheckpointMetadata(path)

snowflake/snowpark_checkpoints_collector/utils/file_utils.py
CHANGED
@@ -25,6 +25,9 @@ from snowflake.snowpark_checkpoints_collector.collection_common import (
     UNKNOWN_LINE_OF_CODE,
     UNKNOWN_SOURCE_FILE,
 )
+from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+    get_io_file_manager,
+)
 
 
 def get_output_file_path(out_path: Optional[str] = None) -> str:
@@ -63,11 +66,13 @@ def get_output_directory_path(output_path: Optional[str] = None) -> str:
         str: returns the output directory path.
 
     """
-    current_working_directory_path =
+    current_working_directory_path = (
+        output_path if output_path else get_io_file_manager().getcwd()
+    )
     checkpoints_output_directory_path = os.path.join(
         current_working_directory_path, SNOWPARK_CHECKPOINTS_OUTPUT_DIRECTORY_NAME
     )
-
+    get_io_file_manager().mkdir(checkpoints_output_directory_path, exist_ok=True)
     return checkpoints_output_directory_path
 
 
@@ -120,8 +125,8 @@ def _is_temporal_path(path: str) -> bool:
 
 
 def _get_ipynb_file_path_collection() -> list[str]:
-    current_working_directory_path =
-    cwd_file_name_collection =
+    current_working_directory_path = get_io_file_manager().getcwd()
+    cwd_file_name_collection = get_io_file_manager().ls(current_working_directory_path)
     ipynb_file_path_collection = []
     for file_name in cwd_file_name_collection:
         is_ipynb_file = file_name.endswith(DOT_IPYNB_EXTENSION)

snowflake/snowpark_checkpoints_collector/utils/telemetry.py
CHANGED
@@ -19,16 +19,34 @@ from sys import platform
 from typing import Any, Callable, Optional, TypeVar
 from uuid import getnode
 
-from snowflake.connector import (
-    SNOWFLAKE_CONNECTOR_VERSION,
-    time_util,
-)
-from snowflake.connector.constants import DIRS as SNOWFLAKE_DIRS
-from snowflake.connector.network import SnowflakeRestful
+from snowflake.connector.description import PLATFORM as CONNECTOR_PLATFORM
 from snowflake.connector.telemetry import TelemetryClient
 from snowflake.snowpark import VERSION as SNOWPARK_VERSION
 from snowflake.snowpark import dataframe as snowpark_dataframe
 from snowflake.snowpark.session import Session
+from snowflake.snowpark_checkpoints_collector.io_utils.io_file_manager import (
+    get_io_file_manager,
+)
+
+
+try:
+    """
+    The following imports are used to log telemetry events in the Snowflake Connector.
+    """
+    from snowflake.connector import (
+        SNOWFLAKE_CONNECTOR_VERSION,
+        time_util,
+    )
+    from snowflake.connector.constants import DIRS as SNOWFLAKE_DIRS
+    from snowflake.connector.network import SnowflakeRestful
+except Exception:
+    """
+    Set default import values for the Snowflake Connector when using snowpark-checkpoints in stored procedures.
+    """
+    SNOWFLAKE_CONNECTOR_VERSION = ""
+    time_util = None
+    SNOWFLAKE_DIRS = ""
+    SnowflakeRestful = None
 
 
 try:
@@ -81,7 +99,7 @@ class TelemetryManager(TelemetryClient):
             path: path to write telemetry.
 
         """
-
+        get_io_file_manager().mkdir(str(path), exist_ok=True)
         self.sc_folder_path = path
 
     def sc_log_error(
@@ -189,7 +207,7 @@ class TelemetryManager(TelemetryClient):
 
        """
        try:
-
+            get_io_file_manager().mkdir(str(self.sc_folder_path), exist_ok=True)
            for event in batch:
                message = event.get("message")
                if message is not None:
@@ -199,8 +217,7 @@ class TelemetryManager(TelemetryClient):
                        f'_telemetry_{message.get("type")}.json'
                    )
                    json_content = self._sc_validate_folder_space(event)
-
-                        json_file.write(json_content)
+                    get_io_file_manager().write(str(file_path), json_content)
        except Exception:
            pass
 
@@ -227,10 +244,10 @@ class TelemetryManager(TelemetryClient):
         if not self.sc_is_enabled or self.sc_is_testing or not self._rest:
             return
         batch = []
-        for file in self.sc_folder_path
-
-
-
+        for file in get_io_file_manager().ls(f"{self.sc_folder_path}/*.json"):
+            json_content = get_io_file_manager().read(file)
+            data_dict = json.loads(json_content)
+            batch.append(data_dict)
         if batch == []:
             return
         body = {"logs": batch}
@@ -242,14 +259,17 @@ class TelemetryManager(TelemetryClient):
             timeout=5,
         )
         if ret.get("success"):
-            for
+            for file_path in get_io_file_manager().ls(f"{self.sc_folder_path}/*.json"):
+                file = get_io_file_manager().telemetry_path_files(file_path)
                 file.unlink()
 
     def _sc_is_telemetry_testing(self) -> bool:
        is_testing = os.getenv("SNOWPARK_CHECKPOINTS_TELEMETRY_TESTING") == "true"
        if is_testing:
            local_telemetry_path = (
-                Path(
+                Path(get_io_file_manager().getcwd())
+                / "snowpark-checkpoints-output"
+                / "telemetry"
            )
            self.set_sc_output_path(local_telemetry_path)
            self.sc_is_enabled = True
@@ -348,7 +368,7 @@ def _get_metadata() -> dict:
     }
 
 
-def _get_version() -> str:
+def _get_version() -> Optional[str]:
     """Get the version of the package.
 
     Returns:
@@ -359,11 +379,10 @@ def _get_version() -> str:
         directory_levels_up = 1
         project_root = Path(__file__).resolve().parents[directory_levels_up]
         version_file_path = project_root / VERSION_FILE_NAME
-
-
-
-
-            return version_match.group(1)
+        content = get_io_file_manager().read(str(version_file_path))
+        version_match = re.search(VERSION_VARIABLE_PATTERN, content, re.MULTILINE)
+        if version_match:
+            return version_match.group(1)
         return None
     except Exception:
         return None
@@ -379,7 +398,10 @@ def _get_folder_size(folder_path: Path) -> int:
         int: The size of the folder in bytes.
 
     """
-
+    sum_size = 0
+    for f in get_io_file_manager().ls(f"{folder_path}/*.json"):
+        sum_size += get_io_file_manager().telemetry_path_files(f).stat().st_size
+    return sum_size
 
 
 def _free_up_space(folder_path: Path, max_size: int) -> None:
@@ -390,9 +412,13 @@ def _free_up_space(folder_path: Path, max_size: int) -> None:
         max_size (int): The maximum allowed size of the folder in bytes.
 
     """
-    files = sorted(
+    files = sorted(
+        get_io_file_manager().ls(f"{folder_path}/*.json"),
+        key=lambda f: f.stat().st_mtime,
+    )
     current_size = _get_folder_size(folder_path)
-    for
+    for file_path in files:
+        file = get_io_file_manager().telemetry_path_files(file_path)
         if current_size <= max_size:
             break
         current_size -= file.stat().st_size
@@ -471,12 +497,22 @@ def get_load_json(json_schema: str) -> dict:
 
     """
     try:
-
-
+        file_content = get_io_file_manager().read(json_schema, encoding="utf-8")
+        return json.loads(file_content)
     except (OSError, json.JSONDecodeError) as e:
         raise ValueError(f"Error reading JSON schema file: {e}") from None
 
 
+def _is_in_stored_procedure() -> bool:
+    """Check if the code is running in a stored procedure.
+
+    Returns:
+        bool: True if the code is running in a stored procedure, False otherwise.
+
+    """
+    return CONNECTOR_PLATFORM == "XP"
+
+
 def extract_parameters(
     func: Callable, args: tuple, kwargs: dict, params_list: Optional[list[str]]
 ) -> dict:
@@ -824,7 +860,10 @@ def report_telemetry(
             except Exception as err:
                 func_exception = err
 
-            if
+            if (
+                os.getenv("SNOWPARK_CHECKPOINTS_TELEMETRY_ENABLED") == "false"
+                or _is_in_stored_procedure()
+            ):
                 return result
             telemetry_event = None
             data = None
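
As the hunk above shows, report_telemetry now skips event collection both when telemetry is turned off through the environment and when the code runs inside a stored procedure (CONNECTOR_PLATFORM == "XP"). A minimal opt-out sketch, using only the environment variable name visible in this hunk; the variable has to be set before any decorated function is called, since the decorator reads it at call time:

import os

# Disable snowpark-checkpoints telemetry for this process; the
# report_telemetry decorator checks this variable and returns the wrapped
# function's result without recording an event.
os.environ["SNOWPARK_CHECKPOINTS_TELEMETRY_ENABLED"] = "false"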

{snowpark_checkpoints_collectors-0.2.1.dist-info → snowpark_checkpoints_collectors-0.3.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: snowpark-checkpoints-collectors
-Version: 0.2.1
+Version: 0.3.0
 Summary: Snowpark column and table statistics collection
 Project-URL: Bug Tracker, https://github.com/snowflakedb/snowpark-checkpoints/issues
 Project-URL: Source code, https://github.com/snowflakedb/snowpark-checkpoints/
@@ -30,6 +30,7 @@ Requires-Dist: pandera[io]==0.20.4
 Requires-Dist: snowflake-connector-python
 Requires-Dist: snowflake-snowpark-python>=1.23.0
 Provides-Extra: development
+Requires-Dist: certifi==2025.1.31; extra == 'development'
 Requires-Dist: coverage>=7.6.7; extra == 'development'
 Requires-Dist: deepdiff>=8.0.0; extra == 'development'
 Requires-Dist: hatchling==1.25.0; extra == 'development'

{snowpark_checkpoints_collectors-0.2.1.dist-info → snowpark_checkpoints_collectors-0.3.0.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 snowflake/snowpark_checkpoints_collector/__init__.py,sha256=GIESlH2W6g_qdcnyRqw9yjsvEkt0aniFvGixKlF4K7A,1096
-snowflake/snowpark_checkpoints_collector/__version__.py,sha256=
+snowflake/snowpark_checkpoints_collector/__version__.py,sha256=kbbDnlkY7JOLNHvfWYkCO_mOBOV9GniMGdxYoQpLhyg,632
 snowflake/snowpark_checkpoints_collector/collection_common.py,sha256=ff5vYffrTRjoJXZQvVQBaOlegAUj_vXBbl1IZidz8Qo,4510
 snowflake/snowpark_checkpoints_collector/singleton.py,sha256=7AgIHQBXVRvPBBCkmBplzkdrrm-xVWf_N8svzA2vF8E,836
-snowflake/snowpark_checkpoints_collector/summary_stats_collector.py,sha256=
+snowflake/snowpark_checkpoints_collector/summary_stats_collector.py,sha256=SD5MRF7zSDKXpxekMWdg5gO7ZcZr6Y548vkkKpG_jZs,14745
 snowflake/snowpark_checkpoints_collector/collection_result/model/__init__.py,sha256=jZzx29WzrjH7C_6ZsBGoe4PxbW_oM4uIjySS1axIM34,1000
 snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result.py,sha256=8xD9zGnFJ7Rz9RUXIys7JnV3kQD4mk8QwNOTxAihSjQ,2908
-snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py,sha256=
+snowflake/snowpark_checkpoints_collector/collection_result/model/collection_point_result_manager.py,sha256=EY6WIIXRbvkTYC4bQn7jFALHh7D2PirVoiLZ5Kq8dNs,2659
 snowflake/snowpark_checkpoints_collector/column_collection/__init__.py,sha256=hpTh1V7hqBSHxNUqISwfxdz-NLD-7oZEMLXDUuRsoOU,783
 snowflake/snowpark_checkpoints_collector/column_collection/column_collector_manager.py,sha256=Vav_vbiipHFIAdHxeQG4ZK1BAmWTi_18hBnVeIeXFRs,9670
 snowflake/snowpark_checkpoints_collector/column_collection/model/__init__.py,sha256=d0WNMeayDyUKYFLLaVAMIC5Qt-DoWoWgOjj2ygJaHWA,2919
@@ -26,14 +26,18 @@ snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_colum
 snowflake/snowpark_checkpoints_collector/column_collection/model/timestamp_ntz_column_collector.py,sha256=glUUnCLgTbGiPLpF2pSZ11KCgKSpHDRt5uhi1ZT9bxA,2578
 snowflake/snowpark_checkpoints_collector/column_pandera_checks/__init__.py,sha256=JNZPOYx8rUTONGz_d7xyfAvEC2_umHmGkJLoNSATLs4,793
 snowflake/snowpark_checkpoints_collector/column_pandera_checks/pandera_column_checks_manager.py,sha256=X1Mm37DKt-WZ5AegvoUA3itU1nBUxvhBxvjO85QqcGE,7893
+snowflake/snowpark_checkpoints_collector/io_utils/__init__.py,sha256=RhADOBizQJq4CoewWYleuZZErthjzRMHNagObTW-IsI,984
+snowflake/snowpark_checkpoints_collector/io_utils/io_default_strategy.py,sha256=RG5uL6OM5R55AmyKhrNGw78nlIvSLy9iGw0Rd7WdOl8,1967
+snowflake/snowpark_checkpoints_collector/io_utils/io_env_strategy.py,sha256=kJMbg2VOKNXXdkGCt_tMMLGEZ2aUl1_nie1qYvx5M-c,3770
+snowflake/snowpark_checkpoints_collector/io_utils/io_file_manager.py,sha256=M17EtANswD5gcgGnmT13OImO_W1uH4K3ewu2CXL9aes,2597
 snowflake/snowpark_checkpoints_collector/snow_connection_model/__init__.py,sha256=kLjZId-aGCljK7lF6yeEw-syEqeTOJDxdXfpv9YxvZA,755
-snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py,sha256=
+snowflake/snowpark_checkpoints_collector/snow_connection_model/snow_connection.py,sha256=r3IPnmDMb8151PTgE4YojOhWnxWGPLyBWlgFvvhOfRY,7314
 snowflake/snowpark_checkpoints_collector/utils/checkpoint_name_utils.py,sha256=Xc4k3JU6A96-79VFRR8NrNAUPeO3V1DEAhngg-hLlU4,1787
-snowflake/snowpark_checkpoints_collector/utils/extra_config.py,sha256=
-snowflake/snowpark_checkpoints_collector/utils/file_utils.py,sha256=
+snowflake/snowpark_checkpoints_collector/utils/extra_config.py,sha256=3kVf6WVA-EuyMpTO3ycTlXMSCHtytGtT6wkV4U2Hyjw,5195
+snowflake/snowpark_checkpoints_collector/utils/file_utils.py,sha256=C1gZmQHvLMgHMVc5kTTpcCaUPw5PtpajY_Uu18mMy6c,4515
 snowflake/snowpark_checkpoints_collector/utils/logging_utils.py,sha256=yyi6X5DqKeTg0HRhvsH6ymYp2P0wbnyKIzI2RzrQS7k,2278
-snowflake/snowpark_checkpoints_collector/utils/telemetry.py,sha256=
-snowpark_checkpoints_collectors-0.
-snowpark_checkpoints_collectors-0.
-snowpark_checkpoints_collectors-0.
-snowpark_checkpoints_collectors-0.
+snowflake/snowpark_checkpoints_collector/utils/telemetry.py,sha256=ueN9vM8j5YNax7jMcnEj_UrgGkoeMv_hJHVKjN7hiJE,32161
+snowpark_checkpoints_collectors-0.3.0.dist-info/METADATA,sha256=4nXrRjc1glZUTrb9J8brIHPzyrE43GRKNu7lrqfGMZU,6061
+snowpark_checkpoints_collectors-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+snowpark_checkpoints_collectors-0.3.0.dist-info/licenses/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
+snowpark_checkpoints_collectors-0.3.0.dist-info/RECORD,,

{snowpark_checkpoints_collectors-0.2.1.dist-info → snowpark_checkpoints_collectors-0.3.0.dist-info}/WHEEL
File without changes

{snowpark_checkpoints_collectors-0.2.1.dist-info → snowpark_checkpoints_collectors-0.3.0.dist-info}/licenses/LICENSE
File without changes