snowpark-connect 0.21.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/config.py +19 -14
- snowflake/snowpark_connect/error/error_utils.py +32 -0
- snowflake/snowpark_connect/error/exceptions.py +4 -0
- snowflake/snowpark_connect/expression/hybrid_column_map.py +192 -0
- snowflake/snowpark_connect/expression/literal.py +9 -12
- snowflake/snowpark_connect/expression/map_cast.py +20 -4
- snowflake/snowpark_connect/expression/map_expression.py +8 -1
- snowflake/snowpark_connect/expression/map_udf.py +4 -4
- snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +32 -5
- snowflake/snowpark_connect/expression/map_unresolved_function.py +269 -134
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +8 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +4 -2
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +127 -21
- snowflake/snowpark_connect/relation/map_aggregate.py +154 -18
- snowflake/snowpark_connect/relation/map_column_ops.py +59 -8
- snowflake/snowpark_connect/relation/map_extension.py +58 -24
- snowflake/snowpark_connect/relation/map_local_relation.py +8 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +3 -1
- snowflake/snowpark_connect/relation/map_row_ops.py +30 -1
- snowflake/snowpark_connect/relation/map_sql.py +40 -196
- snowflake/snowpark_connect/relation/map_udtf.py +4 -4
- snowflake/snowpark_connect/relation/read/map_read.py +2 -1
- snowflake/snowpark_connect/relation/read/map_read_json.py +12 -1
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +8 -1
- snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
- snowflake/snowpark_connect/relation/read/utils.py +7 -6
- snowflake/snowpark_connect/relation/utils.py +170 -1
- snowflake/snowpark_connect/relation/write/map_write.py +306 -87
- snowflake/snowpark_connect/server.py +34 -5
- snowflake/snowpark_connect/type_mapping.py +6 -2
- snowflake/snowpark_connect/utils/describe_query_cache.py +2 -9
- snowflake/snowpark_connect/utils/env_utils.py +55 -0
- snowflake/snowpark_connect/utils/session.py +21 -4
- snowflake/snowpark_connect/utils/telemetry.py +213 -61
- snowflake/snowpark_connect/utils/udxf_import_utils.py +14 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/__init__.py +0 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.py +36 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.pyi +156 -0
- snowflake/snowpark_decoder/dp_session.py +111 -0
- snowflake/snowpark_decoder/spark_decoder.py +76 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/RECORD +55 -44
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/top_level.txt +1 -0
- spark/__init__.py +0 -0
- spark/connect/__init__.py +0 -0
- spark/connect/envelope_pb2.py +31 -0
- spark/connect/envelope_pb2.pyi +46 -0
- snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
- {snowpark_connect-0.21.0.data → snowpark_connect-0.23.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.21.0.data → snowpark_connect-0.23.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.21.0.data → snowpark_connect-0.23.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/NOTICE-binary +0 -0
snowflake/snowpark_connect/server.py

@@ -83,6 +83,7 @@ from snowflake.snowpark_connect.utils.context import (
     set_session_id,
     set_spark_version,
 )
+from snowflake.snowpark_connect.utils.env_utils import get_int_from_env
 from snowflake.snowpark_connect.utils.interrupt import (
     interrupt_all_queries,
     interrupt_queries_with_tag,
@@ -700,11 +701,27 @@ def _serve(
         return
 
     server_options = [
-        (
-
+        (
+            "grpc.max_receive_message_length",
+            get_int_from_env(
+                "SNOWFLAKE_GRPC_MAX_MESSAGE_SIZE",
+                _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE,
+            ),
+        ),
+        (
+            "grpc.max_metadata_size",
+            get_int_from_env(
+                "SNOWFLAKE_GRPC_MAX_METADATA_SIZE",
+                _SPARK_CONNECT_GRPC_MAX_METADATA_SIZE,
+            ),
+        ),
         (
             "grpc.absolute_max_metadata_size",
-
+            get_int_from_env(
+                "SNOWFLAKE_GRPC_MAX_METADATA_SIZE",
+                _SPARK_CONNECT_GRPC_MAX_METADATA_SIZE,
+            )
+            * 2,
         ),
     ]
     server = grpc.server(
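The three gRPC server options above are now resolved from environment variables at server start, falling back to the module-level defaults. A minimal sketch of overriding them before the server comes up (the variable names come from the hunk above; the sizes and the start_session import path are illustrative assumptions):

    import os

    # Hypothetical override: raise the gRPC message/metadata limits (values in bytes)
    # before the Snowpark Connect server is started.
    os.environ["SNOWFLAKE_GRPC_MAX_MESSAGE_SIZE"] = str(64 * 1024 * 1024)  # 64 MiB
    os.environ["SNOWFLAKE_GRPC_MAX_METADATA_SIZE"] = str(1 * 1024 * 1024)  # 1 MiB

    from snowflake import snowpark_connect  # assumed public entry point

    snowpark_connect.start_session()  # _serve() resolves the options via get_int_from_env()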
@@ -812,8 +829,11 @@ class UnixDomainSocketChannelBuilder(ChannelBuilder):
     Spark Connect gRPC channel builder for Unix domain sockets
     """
 
-    def __init__(
-        url: str =
+    def __init__(
+        self, url: str = None, channelOptions: Optional[List[Tuple[str, Any]]] = None
+    ) -> None:
+        if url is None:
+            url = get_client_url()
         if url[:6] != "unix:/" or len(url) < 7:
             raise PySparkValueError(
                 error_class="INVALID_CONNECT_URL",
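The Unix-domain-socket channel builder now defaults its URL to get_client_url() and accepts explicit channel options. A hedged usage sketch (the class is defined in this module; the socket path is illustrative):

    # Default: resolve the server address via get_client_url().
    builder = UnixDomainSocketChannelBuilder()

    # Explicit Unix-domain URL; anything not starting with "unix:/" raises PySparkValueError.
    builder = UnixDomainSocketChannelBuilder("unix:/tmp/snowpark-connect.sock")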
@@ -981,6 +1001,7 @@ def start_session(
     stop_event: threading.Event = None,
     snowpark_session: Optional[snowpark.Session] = None,
     connection_parameters: Optional[Dict[str, str]] = None,
+    max_grpc_message_size: int = _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE,
 ) -> threading.Thread | None:
     """
     Starts Spark Connect server connected to Snowflake. No-op if the Server is already running.
@@ -1003,6 +1024,14 @@ def start_session(
         provided, the `snowpark_session` parameter must be None.
     """
     try:
+        # Changing the value of our global variable based on the grpc message size provided by the user.
+        global _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE
+        _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE = max_grpc_message_size
+
+        from pyspark.sql.connect.client import ChannelBuilder
+
+        ChannelBuilder.MAX_MESSAGE_LENGTH = max_grpc_message_size
+
         if os.environ.get("SPARK_ENV_LOADED"):
             raise RuntimeError(
                 "Snowpark Connect cannot be run inside of a Spark environment"
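start_session() can now raise the gRPC limit programmatically instead of via the environment; the value is written back to the module-level default and also patched onto pyspark's ChannelBuilder.MAX_MESSAGE_LENGTH so client and server agree. A sketch (the import path and the 128 MiB figure are assumptions):

    from snowflake import snowpark_connect  # assumed public entry point

    # Allow plans/results up to 128 MiB over the Spark Connect gRPC channel.
    snowpark_connect.start_session(max_grpc_message_size=128 * 1024 * 1024)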
snowflake/snowpark_connect/type_mapping.py

@@ -59,7 +59,7 @@ def _get_struct_type_class():
 
 
 @cache
-def
+def get_python_sql_utils_class():
     return jpype.JClass("org.apache.spark.sql.api.python.PythonSQLUtils")
 
 
@@ -70,7 +70,7 @@ def parse_ddl_with_spark_scala(ddl_string: str) -> pyspark.sql.types.DataType:
     This mimics pysparks.ddl parsing logic pyspark.sql.types._py_parse_datatype_string
     """
     struct_type_class = _get_struct_type_class()
-    python_sql_utils =
+    python_sql_utils = get_python_sql_utils_class()
 
     try:
         # DDL format, "fieldname datatype, fieldname datatype".
@@ -324,6 +324,8 @@ def cast_to_match_snowpark_type(
             return str(content)
         case snowpark.types.VariantType:
             return str(content)
+        case snowpark.types.TimestampType:
+            return str(content)
         case _:
             raise SnowparkConnectNotImplementedError(
                 f"Unsupported snowpark data type in casting: {data_type}"
@@ -779,6 +781,8 @@ def map_simple_types(simple_type: str) -> snowpark.types.DataType:
             return snowpark.types.TimestampType()
         case "timestamp_ntz":
             return snowpark.types.TimestampType(snowpark.types.TimestampTimeZone.NTZ)
+        case "timestamp_ltz":
+            return snowpark.types.TimestampType(snowpark.types.TimestampTimeZone.LTZ)
         case "day_time_interval":
             # this is not a column type in snowflake so there won't be a dataframe column
             # with this, for now this type won't make any sense
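With the new branch, "timestamp_ltz" now maps to a local-time-zone timestamp instead of falling through to the unsupported-type error. The two visible branches, written out as a plain dict for illustration (not the actual function):

    from snowflake.snowpark.types import TimestampType, TimestampTimeZone

    # Sketch of the timestamp branches of map_simple_types after this change.
    simple_timestamp_types = {
        "timestamp_ntz": TimestampType(TimestampTimeZone.NTZ),  # existing branch
        "timestamp_ltz": TimestampType(TimestampTimeZone.LTZ),  # new branch
    }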
snowflake/snowpark_connect/utils/describe_query_cache.py

@@ -131,21 +131,14 @@ def instrument_session_for_describe_cache(session: snowpark.Session):
             logger.debug(f"DDL detected, clearing describe query cache: '{query}'")
             cache.clear()
 
-    def report_query(qid: str, is_internal: bool) -> None:
-        if is_internal:
-            telemetry.report_internal_query()
-        elif qid:
-            telemetry.report_query_id(qid)
-
     def wrap_execute(wrapped_fn):
         def fn(query: str, **kwargs):
             update_cache_for_query(query)
-            is_internal = kwargs.get("_is_internal", False)
             try:
                 result = wrapped_fn(query, **kwargs)
-                report_query(result
+                telemetry.report_query(result, **kwargs)
             except Exception as e:
-                report_query(e
+                telemetry.report_query(e, **kwargs)
                 raise e
             return result
 
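Dropping the local report_query helper and the _is_internal bookkeeping means a single telemetry.report_query call now receives either the query result or the raised exception, together with the execute kwargs. A generic sketch of that wrapping pattern (report_fn stands in for the telemetry call; this is not the actual module):

    def wrap_execute(wrapped_fn, report_fn):
        def fn(query, **kwargs):
            try:
                result = wrapped_fn(query, **kwargs)
                report_fn(result, **kwargs)   # report the successful result
            except Exception as e:
                report_fn(e, **kwargs)        # report the failure, then re-raise
                raise
            return result
        return fn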
snowflake/snowpark_connect/utils/env_utils.py (new file)

@@ -0,0 +1,55 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+"""
+Environment variable utilities for Snowpark Connect.
+"""
+
+import os
+
+from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
+
+
+def get_int_from_env(env_var: str, default: int) -> int:
+    """
+    Safely get integer value from environment variable with fallback to default.
+
+    Args:
+        env_var: Environment variable name
+        default: Default integer value if env var is not set or invalid
+
+    Returns:
+        Integer value from environment variable or default
+
+    Raises:
+        TypeError: If default is not an integer
+
+    Examples:
+        >>> get_int_from_env("MAX_WORKERS", 10)
+        10
+        >>> os.environ["MAX_WORKERS"] = "20"
+        >>> get_int_from_env("MAX_WORKERS", 10)
+        20
+        >>> os.environ["MAX_WORKERS"] = "invalid"
+        >>> get_int_from_env("MAX_WORKERS", 10)  # logs warning, returns 10
+        10
+    """
+    # Validate that default is actually an integer
+    if not isinstance(default, int):
+        raise TypeError(
+            f"Default value must be an integer, got {type(default).__name__}: {default}"
+        )
+
+    value = os.getenv(env_var)
+    if value is None:
+        return default
+
+    try:
+        return int(value)
+    except ValueError:
+        logger.warning(
+            f"Invalid integer value for environment variable {env_var}: '{value}', "
+            f"using default: {default}"
+        )
+        return default
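get_int_from_env is the helper the server options in the earlier server.py hunk call into. A quick standalone check of its fallback behaviour (the import path matches the hunk that adds it; the 256 MiB default is illustrative):

    import os

    from snowflake.snowpark_connect.utils.env_utils import get_int_from_env

    os.environ["SNOWFLAKE_GRPC_MAX_MESSAGE_SIZE"] = "not-a-number"
    # Invalid values fall back to the supplied default and log a warning instead of raising.
    size = get_int_from_env("SNOWFLAKE_GRPC_MAX_MESSAGE_SIZE", 256 * 1024 * 1024)
    assert size == 256 * 1024 * 1024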
snowflake/snowpark_connect/utils/session.py

@@ -73,6 +73,27 @@ def configure_snowpark_session(session: snowpark.Session):
     session.connection.arrow_number_to_decimal_setter = True
     session.custom_package_usage_config["enabled"] = True
 
+    default_fallback_timezone = "UTC"
+    if global_config.spark_sql_session_timeZone is None:
+        try:
+            result = session.sql("SHOW PARAMETERS LIKE 'TIMEZONE'").collect()
+            if result and len(result) > 0:
+                value = result[0]["value"]
+                logger.warning(
+                    f"Using Snowflake session timezone parameter as fallback: {value}"
+                )
+            else:
+                value = default_fallback_timezone
+                logger.warning(
+                    f"Could not determine timezone from parameters, defaulting to {default_fallback_timezone}"
+                )
+        except Exception as e:
+            value = default_fallback_timezone
+            logger.warning(
+                f"Could not query Snowflake timezone parameter ({e}), defaulting to {default_fallback_timezone}"
+            )
+        global_config.spark_sql_session_timeZone = value
+
     session_params = {
         "TIMESTAMP_TYPE_MAPPING": "TIMESTAMP_LTZ",
         "TIMEZONE": f"'{global_config.spark_sql_session_timeZone}'",
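The SHOW PARAMETERS fallback above only runs when spark.sql.session.timeZone has not been set. Presumably, setting it explicitly on the Spark side keeps the session timezone deterministic and skips the extra round trip; a one-line sketch, assuming spark is an active Spark Connect session:

    # Standard PySpark configuration; the value is illustrative.
    spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")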
@@ -181,7 +202,3 @@ def set_query_tags(spark_tags: Sequence[str]) -> None:
 
     if spark_tags_str != snowpark_session.query_tag:
         snowpark_session.query_tag = spark_tags_str
-
-
-def get_python_udxf_import_files(session: snowpark.Session) -> str:
-    return ",".join([file for file in [*session._python_files, *session._import_files]])