snowpark-connect 0.27.0__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/column_name_handler.py +3 -93
- snowflake/snowpark_connect/config.py +99 -1
- snowflake/snowpark_connect/dataframe_container.py +0 -6
- snowflake/snowpark_connect/expression/map_expression.py +22 -7
- snowflake/snowpark_connect/expression/map_sql_expression.py +22 -18
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +4 -26
- snowflake/snowpark_connect/expression/map_unresolved_function.py +12 -3
- snowflake/snowpark_connect/expression/map_unresolved_star.py +2 -3
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/relation/map_extension.py +14 -10
- snowflake/snowpark_connect/relation/map_join.py +62 -258
- snowflake/snowpark_connect/relation/map_relation.py +5 -1
- snowflake/snowpark_connect/relation/map_sql.py +353 -16
- snowflake/snowpark_connect/relation/write/map_write.py +171 -110
- snowflake/snowpark_connect/resources_initializer.py +20 -5
- snowflake/snowpark_connect/server.py +16 -17
- snowflake/snowpark_connect/utils/concurrent.py +4 -0
- snowflake/snowpark_connect/utils/describe_query_cache.py +57 -51
- snowflake/snowpark_connect/utils/identifiers.py +120 -0
- snowflake/snowpark_connect/utils/io_utils.py +21 -1
- snowflake/snowpark_connect/utils/scala_udf_utils.py +34 -43
- snowflake/snowpark_connect/utils/session.py +16 -26
- snowflake/snowpark_connect/utils/telemetry.py +53 -0
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/RECORD +34 -35
- snowflake/snowpark_connect/hidden_column.py +0 -39
- {snowpark_connect-0.27.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/top_level.txt +0 -0
@@ -6,20 +6,24 @@ import hashlib
 import inspect
 import random
 import re
-import threading
 import time
 from typing import Any

 from snowflake import snowpark
 from snowflake.connector.cursor import ResultMetadataV2
 from snowflake.snowpark._internal.server_connection import ServerConnection
+from snowflake.snowpark_connect.utils.concurrent import SynchronizedDict
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import telemetry

 DESCRIBE_CACHE_TTL_SECONDS = 15
 USE_DESCRIBE_QUERY_CACHE = True

-DDL_DETECTION_PATTERN = re.compile(r"
+DDL_DETECTION_PATTERN = re.compile(r"\s*(CREATE|ALTER|DROP)\b", re.IGNORECASE)
+PLAIN_CREATE_PATTERN = re.compile(
+    r"\s*CREATE\s+((LOCAL|GLOBAL)\s+)?(TRANSIENT\s+)?TABLE\b", re.IGNORECASE
+)
+
 # Pattern for simple constant queries like: SELECT 3 :: INT AS "3-80000030-0" FROM ( SELECT $1 AS "__DUMMY" FROM VALUES (NULL :: STRING))
 # Using exact spacing pattern from generated SQL for deterministic matching
 # Column ID format: {original_name}-{8_digit_hex_plan_id}-{column_index}
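A quick, self-contained check (not part of the package) of how the two new patterns split statements: a plain CREATE TABLE leaves the describe cache alone, while DDL that alters or drops existing objects clears it, matching the update_cache_for_query logic later in this diff.

import re

# Same patterns as introduced in the hunk above.
DDL_DETECTION_PATTERN = re.compile(r"\s*(CREATE|ALTER|DROP)\b", re.IGNORECASE)
PLAIN_CREATE_PATTERN = re.compile(
    r"\s*CREATE\s+((LOCAL|GLOBAL)\s+)?(TRANSIENT\s+)?TABLE\b", re.IGNORECASE
)

for query in (
    "CREATE TABLE t (c INT)",            # plain CREATE TABLE: cache is kept
    "CREATE TRANSIENT TABLE t (c INT)",  # still a plain create: cache is kept
    "ALTER TABLE t ADD COLUMN d INT",    # modifies an existing object: cache cleared
    "DROP VIEW v",                       # cache cleared
    "SELECT * FROM t",                   # not DDL: cache is kept
):
    clears = bool(DDL_DETECTION_PATTERN.search(query)) and not PLAIN_CREATE_PATTERN.search(query)
    print(f"{query!r} -> clears cache: {clears}")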
@@ -32,8 +36,7 @@ SIMPLE_CONSTANT_PATTERN = re.compile(

 class DescribeQueryCache:
     def __init__(self) -> None:
-        self._cache =
-        self._lock = threading.Lock()
+        self._cache = SynchronizedDict()

     @staticmethod
     def _hash_query(sql_query: str) -> str:
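SynchronizedDict comes from snowflake/snowpark_connect/utils/concurrent.py (also touched in this release), whose body is not shown in this diff. A minimal sketch of what such a wrapper usually provides, purely illustrative and not the package's actual implementation:

import threading

class SynchronizedDictSketch:
    """Illustrative dict wrapper whose basic operations hold a lock.

    This is an assumption about what a "synchronized dict" offers; the real
    SynchronizedDict in snowpark_connect.utils.concurrent may differ.
    """

    def __init__(self):
        self._data = {}
        self._lock = threading.Lock()

    def __getitem__(self, key):
        with self._lock:
            return self._data[key]

    def __setitem__(self, key, value):
        with self._lock:
            self._data[key] = value

    def __delitem__(self, key):
        with self._lock:
            del self._data[key]

    def __contains__(self, key):
        with self._lock:
            return key in self._data

    def clear(self):
        with self._lock:
            self._data.clear()

Folding the lock into the container is what lets the explicit lock field disappear from DescribeQueryCache itself.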
@@ -48,49 +51,49 @@ class DescribeQueryCache:
         return sql_query

     def get(self, sql_query: str) -> list[ResultMetadataV2] | None:
+        telemetry.report_describe_query_cache_lookup()
+
         cache_key = self._get_cache_key(sql_query)
         key = self._hash_query(cache_key)
         current_time = time.monotonic()

+        if key in self._cache:
+            result, timestamp = self._cache[key]
+            if current_time < timestamp + DESCRIBE_CACHE_TTL_SECONDS:
+                logger.debug(
+                    f"Returning query result from cache for query: {sql_query[:20]}"
+                )
+                self._cache[key] = (result, current_time)
+
+                # If this is a constant query, we need to transform the result metadata
+                # to match the actual query's column name
+                if cache_key != sql_query:  # Only transform if we normalized the key
+                    match = SIMPLE_CONSTANT_PATTERN.match(sql_query)
+                    if match:
+                        number, column_id = match.groups()
+                        expected_column_name = column_id
+
+                        # Transform the cached result to match this query's column name
+                        # There should only be one column in these constant queries
+                        metadata = result[0]
+                        new_metadata = ResultMetadataV2(
+                            name=expected_column_name,
+                            type_code=metadata.type_code,
+                            display_size=metadata.display_size,
+                            internal_size=metadata.internal_size,
+                            precision=metadata.precision,
+                            scale=metadata.scale,
+                            is_nullable=metadata.is_nullable,
+                        )
+
+                        telemetry.report_describe_query_cache_hit()
+                        return [new_metadata]
+
+                telemetry.report_describe_query_cache_hit()
+                return result
+            else:
+                telemetry.report_describe_query_cache_expired()
+                del self._cache[key]
         return None

     def put(self, sql_query: str, result: list[ResultMetadataV2] | None) -> None:
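Aside from the column-name rewriting for constant queries, get() above is a plain TTL cache: a hit refreshes the stored monotonic timestamp, and anything older than DESCRIBE_CACHE_TTL_SECONDS is evicted on lookup. A standalone sketch of just that expiry behavior:

import time

DESCRIBE_CACHE_TTL_SECONDS = 15

cache = {}  # key -> (result, timestamp)

def put(key, result):
    cache[key] = (result, time.monotonic())

def get(key):
    if key not in cache:
        return None
    result, timestamp = cache[key]
    if time.monotonic() < timestamp + DESCRIBE_CACHE_TTL_SECONDS:
        # Hit: refresh the timestamp so frequently used entries stay warm.
        cache[key] = (result, time.monotonic())
        return result
    # Expired: evict the stale entry and treat it as a miss.
    del cache[key]
    return None

put("DESCRIBE SELECT 1", ["<metadata>"])
print(get("DESCRIBE SELECT 1"))  # ['<metadata>'] while the 15-second TTL holds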
@@ -102,12 +105,18 @@ class DescribeQueryCache:

         logger.debug(f"Putting query into cache: {sql_query[:50]}...")

-            self._cache[key] = (result, time.monotonic())
+        self._cache[key] = (result, time.monotonic())

     def clear(self) -> None:
+        self._cache.clear()
+
+    def update_cache_for_query(self, query: str) -> None:
+        # Clear cache for DDL operations that modify existing objects (exclude CREATE TABLE)
+        if DDL_DETECTION_PATTERN.search(query) and not PLAIN_CREATE_PATTERN.search(
+            query
+        ):
+            self.clear()
+            telemetry.report_describe_query_cache_clear(query[:100])


 def instrument_session_for_describe_cache(session: snowpark.Session):
@@ -126,10 +135,7 @@ def instrument_session_for_describe_cache(session: snowpark.Session):
     if isinstance(cache_instance, DescribeQueryCache):
         cache = cache_instance

-        if DDL_DETECTION_PATTERN.search(query):
-            logger.debug(f"DDL detected, clearing describe query cache: '{query}'")
-            cache.clear()
+        cache.update_cache_for_query(query)

     def wrap_execute(wrapped_fn):
         def fn(query: str, **kwargs):
@@ -2,6 +2,7 @@
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
 import re
+from typing import Any, TypeVar

 from pyspark.errors import AnalysisException

@@ -117,3 +118,122 @@ def split_fully_qualified_spark_name(qualified_name: str | None) -> list[str]:
         parts.append("".join(token_chars))

     return parts
+
+
+# See https://docs.snowflake.com/en/sql-reference/identifiers-syntax for identifier syntax
+UNQUOTED_IDENTIFIER_REGEX = r"([a-zA-Z_])([a-zA-Z0-9_$]{0,254})"
+QUOTED_IDENTIFIER_REGEX = r'"((""|[^"]){0,255})"'
+VALID_IDENTIFIER_REGEX = f"(?:{UNQUOTED_IDENTIFIER_REGEX}|{QUOTED_IDENTIFIER_REGEX})"
+
+
+Self = TypeVar("Self", bound="FQN")
+
+
+class FQN:
+    """Represents an object identifier, supporting fully qualified names.
+
+    The instance supports builder pattern that allows updating the identifier with database and
+    schema from different sources.
+
+    Examples
+    ________
+    >>> fqn = FQN.from_string("my_schema.object").using_connection(conn)
+
+    >>> fqn = FQN.from_string("my_name").set_database("db").set_schema("foo")
+    """
+
+    def __init__(
+        self,
+        database: str | None,
+        schema: str | None,
+        name: str,
+        signature: str | None = None,
+    ) -> None:
+        self._database = database
+        self._schema = schema
+        self._name = name
+        self.signature = signature
+
+    @property
+    def database(self) -> str | None:
+        return self._database
+
+    @property
+    def schema(self) -> str | None:
+        return self._schema
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    @property
+    def prefix(self) -> str:
+        if self.database:
+            return f"{self.database}.{self.schema if self.schema else 'PUBLIC'}"
+        if self.schema:
+            return f"{self.schema}"
+        return ""
+
+    @property
+    def identifier(self) -> str:
+        if self.prefix:
+            return f"{self.prefix}.{self.name}"
+        return self.name
+
+    def __str__(self) -> str:
+        return self.identifier
+
+    def __eq__(self, other: Any) -> bool:
+        if not isinstance(other, FQN):
+            raise AnalysisException(f"{other} is not a valid FQN")
+        return self.identifier == other.identifier
+
+    @classmethod
+    def from_string(cls, identifier: str) -> Self:
+        """Take in an object name in the form [[database.]schema.]name and return a new :class:`FQN` instance.
+
+        Raises:
+            InvalidIdentifierError: If the object identifier does not meet identifier requirements.
+        """
+        qualifier_pattern = (
+            rf"(?:(?P<first_qualifier>{VALID_IDENTIFIER_REGEX})\.)?"
+            rf"(?:(?P<second_qualifier>{VALID_IDENTIFIER_REGEX})\.)?"
+            rf"(?P<name>{VALID_IDENTIFIER_REGEX})(?P<signature>\(.*\))?"
+        )
+        result = re.fullmatch(qualifier_pattern, identifier)
+
+        if result is None:
+            raise AnalysisException(f"{identifier} is not a valid identifier")
+
+        unqualified_name = result.group("name")
+        if result.group("second_qualifier") is not None:
+            database = result.group("first_qualifier")
+            schema = result.group("second_qualifier")
+        else:
+            database = None
+            schema = result.group("first_qualifier")
+
+        signature = None
+        if result.group("signature"):
+            signature = result.group("signature")
+        return cls(
+            name=unqualified_name, schema=schema, database=database, signature=signature
+        )
+
+    def set_database(self, database: str | None) -> Self:
+        if database:
+            self._database = database
+        return self
+
+    def set_schema(self, schema: str | None) -> Self:
+        if schema:
+            self._schema = schema
+        return self
+
+    def set_name(self, name: str) -> Self:
+        self._name = name
+        return self
+
+    def to_dict(self) -> dict[str, str | None]:
+        """Return the dictionary representation of the instance."""
+        return {"name": self.name, "schema": self.schema, "database": self.database}
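A short usage sketch of the parsing rules above, assuming the class behaves exactly as written (expected output shown as comments):

from snowflake.snowpark_connect.utils.identifiers import FQN

fqn = FQN.from_string('analytics."My Schema".events')
print(fqn.database)    # analytics
print(fqn.schema)      # "My Schema"
print(fqn.identifier)  # analytics."My Schema".events

# A single qualifier is treated as the schema, not the database.
print(FQN.from_string("my_schema.object").to_dict())
# {'name': 'object', 'schema': 'my_schema', 'database': None}

# A trailing parenthesized signature is captured separately.
print(FQN.from_string("db.s.my_udf(INT, STRING)").signature)  # (INT, STRING)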
@@ -1,10 +1,11 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
+import contextlib
 import functools

 from snowflake.snowpark import Session
+from snowflake.snowpark_connect.utils.identifiers import FQN


 @functools.cache
@@ -33,3 +34,22 @@ def file_format(
     ).collect()

     return file_format_name
+
+
+def get_table_type(
+    snowpark_table_name: str,
+    snowpark_session: Session,
+) -> str:
+    fqn = FQN.from_string(snowpark_table_name)
+    with contextlib.suppress(Exception):
+        if fqn.database is not None:
+            return snowpark_session.catalog.getTable(
+                table_name=fqn.name, schema=fqn.schema, database=fqn.database
+            ).table_type
+        elif fqn.schema is not None:
+            return snowpark_session.catalog.getTable(
+                table_name=fqn.name, schema=fqn.schema
+            ).table_type
+        else:
+            return snowpark_session.catalog.getTable(table_name=fqn.name).table_type
+    return "TABLE"
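Assuming get_table_type lands in snowflake/snowpark_connect/utils/io_utils.py, which the +21/-1 entry in the file list suggests, calling it would look roughly like this; the connection parameters are placeholders:

from snowflake.snowpark import Session
from snowflake.snowpark_connect.utils.io_utils import get_table_type

# Placeholder credentials; substitute a real Snowflake connection.
session = Session.builder.configs(
    {"account": "<account>", "user": "<user>", "password": "<password>"}
).create()

# Fully qualified, schema-qualified, and bare names all resolve through the
# catalog; any lookup error is suppressed and reported as a plain "TABLE".
print(get_table_type("MY_DB.MY_SCHEMA.ORDERS", session))
print(get_table_type("MY_SCHEMA.ORDERS", session))
print(get_table_type("ORDERS", session))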
@@ -171,12 +171,19 @@ class ScalaUDFDef:
         is_map_return = udf_func_return_type.startswith("Map")
         wrapper_return_type = "String" if is_map_return else udf_func_return_type

+        # For handling Seq type correctly, ensure that the wrapper function always uses Array as its input and
+        # return types (when required) and the wrapped function uses Seq.
+        udf_func_return_type = udf_func_return_type.replace("Array", "Seq")
+        is_seq_return = udf_func_return_type.startswith("Seq")
+
         # Need to call the map to JSON string converter when a map is returned by the user's function.
-            f"write(func({invocation_args}))"
+        if is_map_return:
+            invoke_udf_func = f"write(func({invocation_args}))"
+        elif is_seq_return:
+            # TODO: SNOW-2339385 Handle Array[T] return types correctly. Currently, only Seq[T] is supported.
+            invoke_udf_func = f"func({invocation_args}).toArray"
+        else:
+            invoke_udf_func = f"func({invocation_args})"

         # The lines of code below are required only when a Map is returned by the UDF. This is needed to serialize the
         # map output to a JSON string.
@@ -184,9 +191,9 @@ class ScalaUDFDef:
            ""
            if not is_map_return
            else """
-import
-import
-import
+import shaded_json4s._
+import shaded_json4s.native.Serialization._
+import shaded_json4s.native.Serialization
 """
        )
        map_return_formatter = (
@@ -199,22 +206,12 @@ import org.json4s.native.Serialization

         return f"""import org.apache.spark.sql.connect.common.UdfPacket
 {map_return_imports}
-import
-import java.nio.file.{{Files, Paths}}
+import com.snowflake.sas.scala.Utils

 object __RecreatedSparkUdf {{
 {map_return_formatter}
-  private lazy val func: ({udf_func_input_types}) => {udf_func_return_type} =
-    val fPath = importDirectory + "{self.name}.bin"
-    val bytes = Files.readAllBytes(Paths.get(fPath))
-    val ois = new ObjectInputStream(new ByteArrayInputStream(bytes))
-    try {{
-      ois.readObject().asInstanceOf[UdfPacket].function.asInstanceOf[({udf_func_input_types}) => {udf_func_return_type}]
-    }} finally {{
-      ois.close()
-    }}
-  }}
+  private lazy val func: ({udf_func_input_types}) => {udf_func_return_type} =
+    Utils.deserializeFunc("{self.name}.bin").asInstanceOf[({udf_func_input_types}) => {udf_func_return_type}]

   def __wrapperFunc({wrapper_arg_and_input_types_str}): {wrapper_return_type} = {{
     {invoke_udf_func}
@@ -299,29 +296,15 @@ def build_scala_udf_imports(session, payload, udf_name, is_map_return) -> List[s
         # Remove the stage path since it is not properly formatted.
         user_jars.append(row[0][row[0].find("/") :])

-    # Jars used when the return type is a Map.
-    map_jars = (
-        []
-        if not is_map_return
-        else [
-            f"{stage_resource_path}/json4s-core_2.12-3.7.0-M11.jar",
-            f"{stage_resource_path}/json4s-native_2.12-3.7.0-M11.jar",
-            f"{stage_resource_path}/paranamer-2.8.3.jar",
-        ]
-    )
-
     # Format the user jars to be used in the IMPORTS clause of the stored procedure.
-    return
-        + map_jars
-        + [f"{stage + jar}" for jar in user_jars]
-    )
+    return [
+        closure_binary_file,
+        f"{stage_resource_path}/spark-connect-client-jvm_2.12-3.5.6.jar",
+        f"{stage_resource_path}/spark-common-utils_2.12-3.5.6.jar",
+        f"{stage_resource_path}/spark-sql_2.12-3.5.6.jar",
+        f"{stage_resource_path}/json4s-ast_2.12-3.7.0-M11.jar",
+        f"{stage_resource_path}/sas-scala-udf_2.12-0.1.0.jar",
+    ] + [f"{stage + jar}" for jar in user_jars]


 def create_scala_udf(pciudf: ProcessCommonInlineUserDefinedFunction) -> ScalaUdf:
@@ -343,6 +326,14 @@ def create_scala_udf(pciudf: ProcessCommonInlineUserDefinedFunction) -> ScalaUdf
     Returns:
         A ScalaUdf object representing the created or cached Scala UDF.
     """
+    from snowflake.snowpark_connect.resources_initializer import (
+        wait_for_resource_initialization,
+    )
+
+    # Make sure that the resource initializer thread is completed before creating Scala UDFs since we depend on the jars
+    # uploaded by it.
+    wait_for_resource_initialization()
+
     from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session

     function_name = pciudf._function_name
@@ -8,7 +8,7 @@ from collections.abc import Sequence
 from typing import Any

 from snowflake import snowpark
-from snowflake.snowpark.exceptions import SnowparkClientException
+from snowflake.snowpark.exceptions import SnowparkClientException
 from snowflake.snowpark.session import _get_active_session
 from snowflake.snowpark_connect.constants import DEFAULT_CONNECTION_NAME
 from snowflake.snowpark_connect.utils.describe_query_cache import (
@@ -50,7 +50,10 @@ def _get_current_snowpark_session() -> snowpark.Session | None:


 def configure_snowpark_session(session: snowpark.Session):
     """Configure a snowpark session with required parameters and settings."""
-    from snowflake.snowpark_connect.config import
+    from snowflake.snowpark_connect.config import (
+        get_cte_optimization_enabled,
+        global_config,
+    )

     logger.info(f"Configuring session {session}")

@@ -77,6 +80,14 @@ def configure_snowpark_session(session: snowpark.Session):
     session.connection.arrow_number_to_decimal_setter = True
     session.custom_package_usage_config["enabled"] = True

+    # Configure CTE optimization based on session configuration
+    cte_optimization_enabled = get_cte_optimization_enabled()
+    session.cte_optimization_enabled = cte_optimization_enabled
+    logger.info(f"CTE optimization enabled: {cte_optimization_enabled}")
+
+    # Default query tag to be used unless overridden by user using AppName or spark.addTag()
+    query_tag = "SNOWPARK_CONNECT_QUERY"
+
     default_fallback_timezone = "UTC"
     if global_config.spark_sql_session_timeZone is None:
         try:
@@ -104,35 +115,14 @@ def configure_snowpark_session(session: snowpark.Session):
         "QUOTED_IDENTIFIERS_IGNORE_CASE": "false",
         "PYTHON_SNOWPARK_ENABLE_THREAD_SAFE_SESSION": "true",
         "PYTHON_SNOWPARK_USE_SCOPED_TEMP_OBJECTS": "false",  # this is required for creating udfs from sproc
+        "ENABLE_STRUCTURED_TYPES_IN_SNOWPARK_CONNECT_RESPONSE": "true",
+        "QUERY_TAG": f"'{query_tag}'",
     }

     session.sql(
         f"ALTER SESSION SET {', '.join([f'{k} = {v}' for k, v in session_params.items()])}"
     ).collect()

-    # Rolling ahead in preparation of GS release 9.22 (ETA 8/5/2025). Once 9.22 is past rollback risk, merge this
-    # parameter with other in the session_params dictionary above
-    try:
-        session.sql(
-            "ALTER SESSION SET ENABLE_STRUCTURED_TYPES_IN_SNOWPARK_CONNECT_RESPONSE=true"
-        ).collect()
-    except SnowparkSQLException:
-        logger.debug(
-            "ENABLE_STRUCTURED_TYPES_IN_SNOWPARK_CONNECT_RESPONSE is not defined"
-        )
-    try:
-        session.sql(
-            "ALTER SESSION SET ENABLE_STRUCTURED_TYPES_NATIVE_ARROW_FORMAT=true"
-        ).collect()
-    except SnowparkSQLException:
-        logger.debug("ENABLE_STRUCTURED_TYPES_NATIVE_ARROW_FORMAT is not defined")
-    try:
-        session.sql(
-            "ALTER SESSION SET ENABLE_STRUCTURED_TYPES_IN_CLIENT_RESPONSE=true"
-        ).collect()
-    except SnowparkSQLException:
-        logger.debug("ENABLE_STRUCTURED_TYPES_IN_CLIENT_RESPONSE is not defined")
-
     # Instrument the snowpark session to use a cache for describe queries.
     instrument_session_for_describe_cache(session)

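With the two new entries, the single ALTER SESSION statement built from session_params replaces the three best-effort statements removed above. A quick illustration of the generated SQL (the real output is one line; it is wrapped here for readability):

query_tag = "SNOWPARK_CONNECT_QUERY"
session_params = {
    "QUOTED_IDENTIFIERS_IGNORE_CASE": "false",
    "ENABLE_STRUCTURED_TYPES_IN_SNOWPARK_CONNECT_RESPONSE": "true",
    "QUERY_TAG": f"'{query_tag}'",
}

# Same join expression as in the hunk above.
print(f"ALTER SESSION SET {', '.join([f'{k} = {v}' for k, v in session_params.items()])}")
# ALTER SESSION SET QUOTED_IDENTIFIERS_IGNORE_CASE = false,
#   ENABLE_STRUCTURED_TYPES_IN_SNOWPARK_CONNECT_RESPONSE = true,
#   QUERY_TAG = 'SNOWPARK_CONNECT_QUERY'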
@@ -204,5 +194,5 @@ def set_query_tags(spark_tags: Sequence[str]) -> None:
     snowpark_session = get_or_create_snowpark_session()
     spark_tags_str = ",".join(sorted(spark_tags)) if spark_tags else None

-    if spark_tags_str != snowpark_session.query_tag:
+    if spark_tags_str and spark_tags_str != snowpark_session.query_tag:
         snowpark_session.query_tag = spark_tags_str
@@ -88,6 +88,7 @@ RECORDED_CONFIG_KEYS = {
     "spark.sql.session.localRelationCacheThreshold",
     "spark.sql.mapKeyDedupPolicy",
     "snowpark.connect.sql.passthrough",
+    "snowpark.connect.cte.optimization_enabled",
     "snowpark.connect.iceberg.external_volume",
     "snowpark.connect.sql.identifiers.auto-uppercase",
     "snowpark.connect.udtf.compatibility_mode",
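The new snowpark.connect.cte.optimization_enabled key feeds get_cte_optimization_enabled() and, per the session.py hunk above, ends up on session.cte_optimization_enabled. Presumably it can be set like any other conf from a Spark Connect client; a hedged sketch in which the connect URL and the string value are assumptions, not something this diff shows:

from pyspark.sql import SparkSession

# Assumes a Snowpark Connect server reachable at the usual Spark Connect port.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# Hypothetical toggle: record the CTE optimization preference for the session.
spark.conf.set("snowpark.connect.cte.optimization_enabled", "true")
print(spark.conf.get("snowpark.connect.cte.optimization_enabled"))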
@@ -426,6 +427,58 @@ class Telemetry:

         summary["internal_queries"] += 1

+    @safe
+    def report_describe_query_cache_lookup(self):
+        """Report a describe query cache lookup."""
+        if self._not_in_request():
+            return
+
+        summary = self._request_summary.get()
+
+        if "describe_cache_lookups" not in summary:
+            summary["describe_cache_lookups"] = 0
+
+        summary["describe_cache_lookups"] += 1
+
+    @safe
+    def report_describe_query_cache_hit(self):
+        """Report a describe query cache hit."""
+        if self._not_in_request():
+            return
+
+        summary = self._request_summary.get()
+
+        if "describe_cache_hits" not in summary:
+            summary["describe_cache_hits"] = 0
+
+        summary["describe_cache_hits"] += 1
+
+    @safe
+    def report_describe_query_cache_expired(self):
+        """Report a describe query cache expiration."""
+        if self._not_in_request():
+            return
+
+        summary = self._request_summary.get()
+
+        if "describe_cache_expired" not in summary:
+            summary["describe_cache_expired"] = 0
+
+        summary["describe_cache_expired"] += 1
+
+    @safe
+    def report_describe_query_cache_clear(self, query_prefix: str):
+        """Report a describe query cache clear."""
+        if self._not_in_request():
+            return
+
+        summary = self._request_summary.get()
+
+        if "describe_cache_clears" not in summary:
+            summary["describe_cache_clears"] = []
+
+        summary["describe_cache_clears"].append(query_prefix)
+
     @safe
     def report_udf_usage(self, udf_name: str):
         if self._not_in_request():
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: snowpark-connect
-Version: 0.27.0
+Version: 0.28.0
 Summary: Snowpark Connect for Spark
 Author: Snowflake, Inc
 License: Apache License, Version 2.0
@@ -16,7 +16,7 @@ Requires-Dist: jpype1
 Requires-Dist: protobuf<5.0,>=4.25.3
 Requires-Dist: s3fs>=2025.3.0
 Requires-Dist: snowflake.core<2,>=1.0.5
-Requires-Dist: snowflake-snowpark-python[pandas]<1.
+Requires-Dist: snowflake-snowpark-python[pandas]<1.40.0,==1.39.0
 Requires-Dist: sqlglot>=26.3.8
 Requires-Dist: jaydebeapi
 Requires-Dist: aiobotocore~=2.23.0