snowpark-connect 0.20.2__py3-none-any.whl → 0.22.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of snowpark-connect has been flagged as a potentially problematic release.
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +3 -2
- snowflake/snowpark_connect/column_name_handler.py +6 -65
- snowflake/snowpark_connect/config.py +47 -17
- snowflake/snowpark_connect/dataframe_container.py +242 -0
- snowflake/snowpark_connect/error/error_utils.py +25 -0
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +13 -23
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +9 -5
- snowflake/snowpark_connect/expression/map_extension.py +2 -1
- snowflake/snowpark_connect/expression/map_udf.py +4 -4
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +8 -7
- snowflake/snowpark_connect/expression/map_unresolved_function.py +481 -170
- snowflake/snowpark_connect/expression/map_unresolved_star.py +8 -8
- snowflake/snowpark_connect/expression/map_update_fields.py +1 -1
- snowflake/snowpark_connect/expression/typer.py +6 -6
- snowflake/snowpark_connect/proto/control_pb2.py +17 -16
- snowflake/snowpark_connect/proto/control_pb2.pyi +17 -17
- snowflake/snowpark_connect/proto/control_pb2_grpc.py +12 -63
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +15 -14
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +19 -14
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +4 -0
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +27 -26
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +74 -68
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +4 -0
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +5 -5
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +25 -17
- snowflake/snowpark_connect/relation/map_aggregate.py +170 -61
- snowflake/snowpark_connect/relation/map_catalog.py +2 -2
- snowflake/snowpark_connect/relation/map_column_ops.py +227 -145
- snowflake/snowpark_connect/relation/map_crosstab.py +25 -6
- snowflake/snowpark_connect/relation/map_extension.py +81 -56
- snowflake/snowpark_connect/relation/map_join.py +72 -63
- snowflake/snowpark_connect/relation/map_local_relation.py +35 -20
- snowflake/snowpark_connect/relation/map_map_partitions.py +24 -17
- snowflake/snowpark_connect/relation/map_relation.py +22 -16
- snowflake/snowpark_connect/relation/map_row_ops.py +232 -146
- snowflake/snowpark_connect/relation/map_sample_by.py +15 -8
- snowflake/snowpark_connect/relation/map_show_string.py +42 -5
- snowflake/snowpark_connect/relation/map_sql.py +141 -237
- snowflake/snowpark_connect/relation/map_stats.py +88 -39
- snowflake/snowpark_connect/relation/map_subquery_alias.py +13 -14
- snowflake/snowpark_connect/relation/map_udtf.py +10 -13
- snowflake/snowpark_connect/relation/read/map_read.py +8 -3
- snowflake/snowpark_connect/relation/read/map_read_csv.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_json.py +19 -8
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_socket.py +7 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +25 -16
- snowflake/snowpark_connect/relation/read/map_read_text.py +7 -7
- snowflake/snowpark_connect/relation/read/reader_config.py +1 -0
- snowflake/snowpark_connect/relation/utils.py +11 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +15 -12
- snowflake/snowpark_connect/relation/write/map_write.py +259 -56
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +3 -2
- snowflake/snowpark_connect/server.py +43 -4
- snowflake/snowpark_connect/type_mapping.py +6 -23
- snowflake/snowpark_connect/utils/cache.py +27 -22
- snowflake/snowpark_connect/utils/context.py +33 -17
- snowflake/snowpark_connect/utils/describe_query_cache.py +2 -9
- snowflake/snowpark_connect/utils/{attribute_handling.py → identifiers.py} +47 -0
- snowflake/snowpark_connect/utils/session.py +41 -38
- snowflake/snowpark_connect/utils/telemetry.py +214 -63
- snowflake/snowpark_connect/utils/udxf_import_utils.py +14 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/__init__.py +0 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.py +36 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.pyi +156 -0
- snowflake/snowpark_decoder/dp_session.py +111 -0
- snowflake/snowpark_decoder/spark_decoder.py +76 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/METADATA +6 -4
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/RECORD +83 -69
- snowpark_connect-0.22.1.dist-info/licenses/LICENSE-binary +568 -0
- snowpark_connect-0.22.1.dist-info/licenses/NOTICE-binary +1533 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/top_level.txt +1 -0
- spark/__init__.py +0 -0
- spark/connect/__init__.py +0 -0
- spark/connect/envelope_pb2.py +31 -0
- spark/connect/envelope_pb2.pyi +46 -0
- snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/licenses/LICENSE.txt +0 -0
snowflake/snowpark_connect/analyze_plan/map_tree_string.py

@@ -15,8 +15,9 @@ def map_tree_string(
 ) -> proto_base.AnalyzePlanResponse:
     # TODO: tracking the difference with pyspark in SNOW-1853347
     tree_string = request.tree_string
-
-
+    snowpark_df_container = map_relation(tree_string.plan.root)
+    snowpark_df = snowpark_df_container.dataframe
+    column_map = snowpark_df_container.column_map

     snowpark_tree_string = snowpark_df._format_schema(
         level=tree_string.level if tree_string.HasField("level") else None,
snowflake/snowpark_connect/column_name_handler.py

@@ -12,14 +12,13 @@ from functools import cached_property

 from pyspark.errors.exceptions.base import AnalysisException

-from snowflake import snowpark
 from snowflake.snowpark import DataFrame
 from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     quote_name_without_upper_casing,
     unquote_if_quoted,
 )
 from snowflake.snowpark._internal.utils import quote_name
-from snowflake.snowpark.types import
+from snowflake.snowpark.types import StructType
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.utils.context import get_current_operation_scope

@@ -41,64 +40,6 @@ def set_schema_getter(df: DataFrame, get_schema: Callable[[], StructType]) -> None:
     df.__class__ = PatchedDataFrame


-def with_column_map(
-    result_df: snowpark.DataFrame,
-    spark_column_names: list[str],
-    snowpark_column_names: list[str],
-    snowpark_column_types: list[DataType] = None,
-    column_metadata: dict | None = None,
-    column_qualifiers: list[list[str]] | None = None,
-    parent_column_name_map: ColumnNameMap | None = None,
-) -> snowpark.DataFrame:
-    """
-    Build a mapping from the DataFrame's column names to the Spark column names.
-
-    This is used to track the original column names and handle column naming differences
-    between Spark and Snowpark.
-
-    The elements in result_df.columns and the elements in spark_column_names must be a one-to-one mapping.
-
-    Args:
-        result_df (snowpark.DataFrame): The DataFrame to map.
-        spark_column_names (list[str]): The Spark column names.
-        snowpark_column_names (list[str]): The Snowpark column names.
-        snowpark_column_types (list[DataType], optional): The Snowpark column types. **if provided df.schema will be overridden with inferred schema**
-        column_metadata (dict, optional): Metadata for the columns.
-        column_qualifiers (list[list[str]], optional): Qualifiers for the columns, used to handle table aliases or DataFrame aliases.
-        parent_column_name_map (ColumnNameMap, optional): A ColumnNameMap, that came from the dataframe used to create result_df (parent df)
-
-    Returns:
-        snowpark.DataFrame: The mapped DataFrame.
-    """
-    assert len(snowpark_column_names) == len(
-        spark_column_names
-    ), "Number of Spark column names must match number of columns in DataFrame"
-    result_df._column_map = ColumnNameMap(
-        spark_column_names,
-        snowpark_column_names,
-        column_metadata=column_metadata,
-        column_qualifiers=column_qualifiers,
-        parent_column_name_map=parent_column_name_map,
-    )
-    result_df._table_name = None
-
-    if snowpark_column_types is not None:
-        assert len(snowpark_column_names) == len(
-            snowpark_column_types
-        ), "Number of Snowpark column names and types must match"
-
-        set_schema_getter(
-            result_df,
-            lambda: StructType(
-                [
-                    StructField(n, t, _is_column=False)
-                    for n, t in zip(snowpark_column_names, snowpark_column_types)
-                ]
-            ),
-        )
-    return result_df
-
-
 def make_column_names_snowpark_compatible(
     names: list[str], plan_id: int, offset: int = 0
 ) -> list[str]:

@@ -189,7 +130,7 @@ class ColumnNameMap:
             column_qualifiers: Optional qualifiers for the columns, used to handle table aliases or DataFrame aliases.
             parent_column_name_map: parent ColumnNameMap
         """
-        self.columns = []
+        self.columns: list[ColumnNames] = []
         self.spark_to_col = defaultdict(list)
         self.uppercase_spark_to_col = defaultdict(list)
         self.snowpark_to_col = defaultdict(list)

@@ -602,11 +543,11 @@ class ColumnNameMap:
 class JoinColumnNameMap(ColumnNameMap):
     def __init__(
         self,
-
-
+        left_colmap: ColumnNameMap,
+        right_colmap: ColumnNameMap,
     ) -> None:
-        self.left_column_mapping: ColumnNameMap =
-        self.right_column_mapping: ColumnNameMap =
+        self.left_column_mapping: ColumnNameMap = left_colmap
+        self.right_column_mapping: ColumnNameMap = right_colmap

     def get_snowpark_column_name_from_spark_column_name(
         self,
snowflake/snowpark_connect/config.py

@@ -9,7 +9,7 @@ import re
 import sys
 import time
 from collections import defaultdict
-from copy import copy
+from copy import copy, deepcopy
 from typing import Any

 import jpype

@@ -33,7 +33,7 @@ from snowflake.snowpark_connect.version import VERSION as sas_version


 def str_to_bool(boolean_str: str) -> bool:
-    assert boolean_str in
+    assert boolean_str in (
         "True",
         "true",
         "False",

@@ -41,7 +41,7 @@ def str_to_bool(boolean_str: str) -> bool:
         "1",
         "0",
         "",  # This is the default value, equivalent to False.
-
+    ), f"Invalid boolean value: {boolean_str}"
     return boolean_str in ["True", "true", "1"]


@@ -131,6 +131,7 @@ class GlobalConfig:
         "spark.sql.caseSensitive": "false",
         "spark.sql.mapKeyDedupPolicy": "EXCEPTION",
         "spark.sql.ansi.enabled": "false",
+        "spark.sql.legacy.allowHashOnMapType": "false",
         "spark.sql.sources.default": "parquet",
         "spark.Catalog.databaseFilterInformationSchema": "false",
         "spark.sql.parser.quotedRegexColumnNames": "false",

@@ -145,6 +146,7 @@ class GlobalConfig:
         "spark.sql.crossJoin.enabled",
         "spark.sql.caseSensitive",
         "spark.sql.ansi.enabled",
+        "spark.sql.legacy.allowHashOnMapType",
         "spark.Catalog.databaseFilterInformationSchema",
         "spark.sql.parser.quotedRegexColumnNames",
     ]

@@ -166,6 +168,9 @@ class GlobalConfig:
         "snowpark.connect.udf.packages": lambda session, packages: session.add_packages(
             *packages.strip("[] ").split(",")
         ),
+        "snowpark.connect.udf.imports": lambda session, imports: parse_imports(
+            session, imports
+        ),
     }

     float_config_list = []

@@ -250,10 +255,10 @@ SESSION_CONFIG_KEY_WHITELIST = {
     "spark.sql.tvf.allowMultipleTableArguments.enabled",
     "snowpark.connect.sql.passthrough",
     "snowpark.connect.iceberg.external_volume",
-    "snowpark.connect.auto-uppercase
-    "snowpark.connect.auto-uppercase.dml",
+    "snowpark.connect.sql.identifiers.auto-uppercase",
     "snowpark.connect.udtf.compatibility_mode",
     "snowpark.connect.views.duplicate_column_names_handling_mode",
+    "enable_snowflake_extension_behavior",
 }
 AZURE_SAS_KEY = re.compile(
     r"^fs\.azure\.sas\.[^\.]+\.[^\.]+\.blob\.core\.windows\.net$"

@@ -271,17 +276,17 @@ class SessionConfig:
     """This class contains the session configuration for the Spark Server."""

     default_session_config = {
-        "snowpark.connect.auto-uppercase
-        "snowpark.connect.auto-uppercase.dml": "true",
+        "snowpark.connect.sql.identifiers.auto-uppercase": "all_except_columns",
         "snowpark.connect.sql.passthrough": "false",
         "snowpark.connect.udtf.compatibility_mode": "false",
         "snowpark.connect.views.duplicate_column_names_handling_mode": "rename",
         "spark.sql.execution.pythonUDTF.arrow.enabled": "false",
         "spark.sql.tvf.allowMultipleTableArguments.enabled": "true",
+        "enable_snowflake_extension_behavior": "false",
     }

     def __init__(self) -> None:
-        self.config =
+        self.config = deepcopy(self.default_session_config)

     def __getitem__(self, item: str) -> str:
         return self.get(item)

@@ -304,7 +309,13 @@ CONFIG_ALLOWED_VALUES: dict[str, tuple] = {
         "rename",
         "fail",
         "drop",
-    )
+    ),
+    "snowpark.connect.sql.identifiers.auto-uppercase": (
+        "all_except_columns",
+        "only_columns",
+        "all",
+        "none",
+    ),
 }

 # Set some default configuration that are necessary for the driver.
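The two old auto-uppercase flags are consolidated into a single key with an enumerated value. A minimal sketch of driving it from a PySpark client, assuming `spark` is a Spark Connect SparkSession backed by a snowpark-connect server; the key name and allowed values come from the hunk above, everything else is illustrative:

```python
# Minimal sketch, assuming `spark` is a Spark Connect SparkSession served by
# snowpark-connect. Key name and values are taken from the diff above.
spark.conf.set("snowpark.connect.sql.identifiers.auto-uppercase", "all")

# Allowed values per CONFIG_ALLOWED_VALUES: "all_except_columns" (the default),
# "only_columns", "all", "none".
print(spark.conf.get("snowpark.connect.sql.identifiers.auto-uppercase"))
```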
@@ -324,7 +335,7 @@ def route_config_proto(
     match op_type:
         case "set":
             logger.info("SET")
-
+            telemetry.report_config_set(config.operation.set.pairs)
             for pair in config.operation.set.pairs:
                 # Check if the value field is present, not present when invalid fields are set in conf.
                 if not pair.HasField("value"):

@@ -334,7 +345,6 @@ def route_config_proto(
                         f"Cannot set config '{pair.key}' to None"
                     )

-                telemetry.report_config_set(pair.key, pair.value)
                 set_config_param(
                     config.session_id, pair.key, pair.value, snowpark_session
                 )

@@ -342,14 +352,15 @@ def route_config_proto(
             return proto_base.ConfigResponse(session_id=config.session_id)
         case "unset":
             logger.info("UNSET")
+            telemetry.report_config_unset(config.operation.unset.keys)
             for key in config.operation.unset.keys:
-                telemetry.report_config_unset(key)
                 unset_config_param(config.session_id, key, snowpark_session)

             return proto_base.ConfigResponse(session_id=config.session_id)
         case "get":
             logger.info("GET")
             res = proto_base.ConfigResponse(session_id=config.session_id)
+            telemetry.report_config_get(config.operation.get.keys)
             for key in config.operation.get.keys:
                 pair = res.pairs.add()
                 pair.key = key

@@ -359,6 +370,9 @@ def route_config_proto(
             return res
         case "get_with_default":
             logger.info("GET_WITH_DEFAULT")
+            telemetry.report_config_get(
+                [pair.key for pair in config.operation.get_with_default.pairs]
+            )
             result_pairs = [
                 proto_base.KeyValue(
                     key=pair.key,

@@ -375,6 +389,7 @@ def route_config_proto(
         case "get_option":
             logger.info("GET_OPTION")
             res = proto_base.ConfigResponse(session_id=config.session_id)
+            telemetry.report_config_get(config.operation.get_option.keys)
             for key in config.operation.get_option.keys:
                 pair = res.pairs.add()
                 pair.key = key

@@ -403,6 +418,7 @@ def route_config_proto(
         case "is_modifiable":
             logger.info("IS_MODIFIABLE")
             res = proto_base.ConfigResponse(session_id=config.session_id)
+            telemetry.report_config_get(config.operation.is_modifiable.keys)
             for key in config.operation.is_modifiable.keys:
                 pair = res.pairs.add()
                 pair.key = key

@@ -533,7 +549,7 @@ def set_snowflake_parameters(
     value = global_config.default_static_global_config.get(key)

     snowpark_name = quote_name_without_upper_casing(value)
-    if
+    if auto_uppercase_non_column_identifiers():
         snowpark_name = snowpark_name.upper()

     # Create the schema on demand. Before creating it, however,

@@ -568,9 +584,23 @@ def get_boolean_session_config_param(name: str) -> bool:
     return str_to_bool(session_config[name])


-def
-
+def auto_uppercase_column_identifiers() -> bool:
+    session_config = sessions_config[get_session_id()]
+    return session_config[
+        "snowpark.connect.sql.identifiers.auto-uppercase"
+    ].lower() in ("all", "only_columns")
+
+
+def auto_uppercase_non_column_identifiers() -> bool:
+    session_config = sessions_config[get_session_id()]
+    return session_config[
+        "snowpark.connect.sql.identifiers.auto-uppercase"
+    ].lower() in ("all", "all_except_columns")
+

+def parse_imports(session: snowpark.Session, imports: str | None) -> None:
+    if not imports:
+        return

-
-
+    for udf_import in imports.strip("[] ").split(","):
+        session.add_import(udf_import)
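The new `snowpark.connect.udf.imports` option is routed through `parse_imports`, which strips the surrounding brackets, splits on commas, and calls `session.add_import` for each entry, mirroring the existing `snowpark.connect.udf.packages` handling. A hedged sketch of the expected client-side format; the file paths are hypothetical:

```python
# Illustrative only: the paths are hypothetical, and the key is assumed to be
# accepted the same way "snowpark.connect.udf.packages" is.
spark.conf.set(
    "snowpark.connect.udf.imports",
    "[/tmp/udf_helpers.py,/tmp/shared_utils.zip]",
)
# Server side, parse_imports() calls session.add_import() once per entry.
```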
snowflake/snowpark_connect/dataframe_container.py (new file)

@@ -0,0 +1,242 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+from snowflake import snowpark
+from snowflake.snowpark.types import StructField, StructType
+
+if TYPE_CHECKING:
+    from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
+
+
+class DataFrameContainer:
+    """
+    A container class that wraps a Snowpark DataFrame along with additional metadata.
+
+    This class provides a unified interface for managing Snowpark DataFrames along with
+    their column mappings, schema information, and metadata.
+    """
+
+    def __init__(
+        self,
+        dataframe: snowpark.DataFrame,
+        column_map: ColumnNameMap | None = None,
+        table_name: str | None = None,
+        alias: str | None = None,
+        cached_schema_getter: Callable[[], StructType] | None = None,
+    ) -> None:
+        """
+        Initialize a new DataFrameContainer.
+
+        Args:
+            dataframe: The underlying Snowpark DataFrame
+            column_map: Optional column name mapping
+            table_name: Optional table name for the DataFrame
+            alias: Optional alias for the DataFrame
+            cached_schema_getter: Optional function to get cached schema
+        """
+        self._dataframe = dataframe
+        self._column_map = self._create_default_column_map(column_map)
+        self._table_name = table_name
+        self._alias = alias
+
+        if cached_schema_getter is not None:
+            self._apply_cached_schema_getter(cached_schema_getter)
+
+    @classmethod
+    def create_with_column_mapping(
+        cls,
+        dataframe: snowpark.DataFrame,
+        spark_column_names: list[str],
+        snowpark_column_names: list[str],
+        snowpark_column_types: list | None = None,
+        column_metadata: dict | None = None,
+        column_qualifiers: list[list[str]] | None = None,
+        parent_column_name_map: ColumnNameMap | None = None,
+        table_name: str | None = None,
+        alias: str | None = None,
+        cached_schema_getter: Callable[[], StructType] | None = None,
+    ) -> DataFrameContainer:
+        """
+        Create a new container with complete column mapping configuration.
+
+        Args:
+            dataframe: The underlying Snowpark DataFrame
+            spark_column_names: List of Spark column names
+            snowpark_column_names: List of corresponding Snowpark column names
+            snowpark_column_types: Optional list of column types
+            column_metadata: Optional metadata dictionary
+            column_qualifiers: Optional column qualifiers
+            parent_column_name_map: Optional parent column name map
+            table_name: Optional table name
+            alias: Optional alias
+            cached_schema_getter: Optional function to get cached schema
+
+        Returns:
+            A new DataFrameContainer instance
+
+        Raises:
+            AssertionError: If column names and types don't match expected lengths
+        """
+        # Validate inputs
+        cls._validate_column_mapping_inputs(
+            spark_column_names, snowpark_column_names, snowpark_column_types
+        )
+
+        column_map = cls._create_column_map(
+            spark_column_names,
+            snowpark_column_names,
+            column_metadata,
+            column_qualifiers,
+            parent_column_name_map,
+        )
+
+        # Determine the schema getter to use
+        final_schema_getter = None
+
+        if cached_schema_getter is not None:
+            # Use the provided schema getter
+            final_schema_getter = cached_schema_getter
+        elif snowpark_column_types is not None:
+            # Create schema from types and wrap in function
+            schema = cls._create_schema_from_types(
+                snowpark_column_names, snowpark_column_types
+            )
+            if schema is not None:
+
+                def get_schema():
+                    return schema
+
+                final_schema_getter = get_schema
+
+        return cls(
+            dataframe=dataframe,
+            column_map=column_map,
+            table_name=table_name,
+            alias=alias,
+            cached_schema_getter=final_schema_getter,
+        )
+
+    @property
+    def dataframe(self) -> snowpark.DataFrame:
+        """Get the underlying Snowpark DataFrame."""
+        # Ensure the DataFrame has the _column_map attribute for backward compatibility
+        # Some of the snowpark code needs references to _column_map
+        self._dataframe._column_map = self._column_map
+        return self._dataframe
+
+    @property
+    def column_map(self) -> ColumnNameMap:
+        """Get the column name mapping."""
+        return self._column_map
+
+    @column_map.setter
+    def column_map(self, value: ColumnNameMap) -> None:
+        """Set the column name mapping."""
+        self._column_map = value
+
+    @property
+    def table_name(self) -> str | None:
+        """Get the table name."""
+        return self._table_name
+
+    @table_name.setter
+    def table_name(self, value: str | None) -> None:
+        """Set the table name."""
+        self._table_name = value
+
+    @property
+    def alias(self) -> str | None:
+        """Get the alias name."""
+        return self._alias
+
+    @alias.setter
+    def alias(self, value: str | None) -> None:
+        """Set the alias name."""
+        self._alias = value
+
+    def _create_default_column_map(
+        self, column_map: ColumnNameMap | None
+    ) -> ColumnNameMap:
+        """Create a default column map if none provided."""
+        if column_map is not None:
+            return column_map
+
+        from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
+
+        return ColumnNameMap([], [])
+
+    def _apply_cached_schema_getter(
+        self, schema_getter: Callable[[], StructType]
+    ) -> None:
+        """Apply a cached schema getter to the dataframe."""
+        from snowflake.snowpark_connect.column_name_handler import set_schema_getter
+
+        set_schema_getter(self._dataframe, schema_getter)
+
+    @staticmethod
+    def _validate_column_mapping_inputs(
+        spark_column_names: list[str],
+        snowpark_column_names: list[str],
+        snowpark_column_types: list | None = None,
+    ) -> None:
+        """
+        Validate inputs for column mapping creation.
+
+        Raises:
+            AssertionError: If validation fails
+        """
+        assert len(snowpark_column_names) == len(
+            spark_column_names
+        ), "Number of Spark column names must match number of columns in DataFrame"
+
+        if snowpark_column_types is not None:
+            assert len(snowpark_column_names) == len(
+                snowpark_column_types
+            ), "Number of Snowpark column names and types must match"
+
+    @staticmethod
+    def _create_column_map(
+        spark_column_names: list[str],
+        snowpark_column_names: list[str],
+        column_metadata: dict | None = None,
+        column_qualifiers: list[list[str]] | None = None,
+        parent_column_name_map: ColumnNameMap | None = None,
+    ) -> ColumnNameMap:
+        """Create a ColumnNameMap with the provided configuration."""
+        from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
+
+        return ColumnNameMap(
+            spark_column_names,
+            snowpark_column_names,
+            column_metadata=column_metadata,
+            column_qualifiers=column_qualifiers,
+            parent_column_name_map=parent_column_name_map,
+        )
+
+    @staticmethod
+    def _create_schema_from_types(
+        snowpark_column_names: list[str],
+        snowpark_column_types: list | None,
+    ) -> StructType | None:
+        """
+        Create a StructType schema from column names and types.
+
+        Returns:
+            StructType if types are provided, None otherwise
+        """
+        if snowpark_column_types is None:
+            return None
+
+        return StructType(
+            [
+                StructField(name, column_type, _is_column=False)
+                for name, column_type in zip(
+                    snowpark_column_names, snowpark_column_types
+                )
+            ]
+        )
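DataFrameContainer replaces the removed `with_column_map` helper: instead of hanging `_column_map` and `_table_name` directly on the Snowpark DataFrame, relations now return a container that carries the DataFrame, its Spark-to-Snowpark column mapping, and an optional cached schema. A minimal usage sketch, assuming an existing Snowpark session `sp_session`; the column names and types below are made up:

```python
# Minimal sketch, assuming `sp_session` is an existing snowflake.snowpark.Session.
from snowflake.snowpark.types import LongType, StringType
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer

df = sp_session.create_dataframe([(1, "a"), (2, "b")], schema=["_1", "_2"])

container = DataFrameContainer.create_with_column_mapping(
    dataframe=df,
    spark_column_names=["id", "name"],       # names the Spark client sees
    snowpark_column_names=["_1", "_2"],      # names used in the Snowpark plan
    snowpark_column_types=[LongType(), StringType()],  # overrides schema inference
)

snowpark_df = container.dataframe   # re-attaches _column_map for legacy call sites
column_map = container.column_map   # ColumnNameMap tracking Spark <-> Snowpark names
```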
snowflake/snowpark_connect/error/error_utils.py

@@ -28,7 +28,9 @@ from pyspark.errors.exceptions.base import (
     PySparkException,
     PythonException,
     SparkRuntimeException,
+    UnsupportedOperationException,
 )
+from pyspark.errors.exceptions.connect import SparkConnectGrpcException
 from snowflake.core.exceptions import NotFoundError

 from snowflake.connector.errors import ProgrammingError

@@ -49,7 +51,9 @@ SPARK_PYTHON_TO_JAVA_EXCEPTION = {
     ArrayIndexOutOfBoundsException: "java.lang.ArrayIndexOutOfBoundsException",
     NumberFormatException: "java.lang.NumberFormatException",
     SparkRuntimeException: "org.apache.spark.SparkRuntimeException",
+    SparkConnectGrpcException: "pyspark.errors.exceptions.connect.SparkConnectGrpcException",
     PythonException: "org.apache.spark.api.python.PythonException",
+    UnsupportedOperationException: "java.lang.UnsupportedOperationException",
 }

 WINDOW_FUNCTION_ANALYSIS_EXCEPTION_SQL_ERROR_CODE = {1005, 2303}

@@ -68,6 +72,9 @@ init_multi_args_exception_pattern = (
 terminate_multi_args_exception_pattern = (
     r"terminate\(\) missing \d+ required positional argument"
 )
+snowpark_connect_exception_pattern = re.compile(
+    r"\[snowpark-connect-exception(?::(\w+))?\]\s*(.+?)'\s*is not recognized"
+)


 def contains_udtf_select(sql_string):

@@ -100,6 +107,19 @@ def _get_converted_known_sql_or_custom_exception(
         return SparkRuntimeException(
             message="Unexpected value for start in function slice: SQL array indices start at 1."
         )
+    match = snowpark_connect_exception_pattern.search(
+        ex.message if hasattr(ex, "message") else str(ex)
+    )
+    if match:
+        class_name = match.group(1)
+        message = match.group(2)
+        exception_class = (
+            globals().get(class_name, SparkConnectGrpcException)
+            if class_name
+            else SparkConnectGrpcException
+        )
+        return exception_class(message=message)
+
     if "select with no columns" in msg and contains_udtf_select(query):
         # We try our best to detect if the SQL string contains a UDTF call and the output schema is empty.
         return PythonException(message=f"[UDTF_RETURN_SCHEMA_MISMATCH] {ex.message}")
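The new `snowpark_connect_exception_pattern` lets the server recover a Spark exception class embedded in a Snowflake error message of the form `[snowpark-connect-exception:<ClassName>] ... is not recognized`. A small self-contained illustration of what the two capture groups return; the sample message is made up:

```python
import re

# Same pattern as in the hunk above.
pattern = re.compile(
    r"\[snowpark-connect-exception(?::(\w+))?\]\s*(.+?)'\s*is not recognized"
)

# Hypothetical error text carrying an embedded exception class name.
sample = "[snowpark-connect-exception:AnalysisException] Function 'foo' is not recognized"

m = pattern.search(sample)
if m:
    print(m.group(1))  # 'AnalysisException' -> looked up via globals() in the hunk
    print(m.group(2))  # "Function 'foo"     -> becomes the converted exception message
```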
@@ -131,6 +151,11 @@ def _get_converted_known_sql_or_custom_exception(
             message=f"[UDTF_EXEC_ERROR] User defined table function encountered an error in the terminate method: {ex.message}"
         )

+    if "failed to split string, provided pattern:" in msg:
+        return IllegalArgumentException(
+            message=f"Failed to split string using provided pattern. {ex.message}"
+        )
+
     if "100357" in msg and "wrong tuple size for returned value" in msg:
         return PythonException(
             message=f"[UDTF_RETURN_SCHEMA_MISMATCH] The number of columns in the result does not match the specified schema. {ex.message}"