snowpark-connect 0.20.2__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +3 -2
- snowflake/snowpark_connect/column_name_handler.py +6 -65
- snowflake/snowpark_connect/config.py +28 -14
- snowflake/snowpark_connect/dataframe_container.py +242 -0
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +13 -23
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +9 -5
- snowflake/snowpark_connect/expression/map_extension.py +2 -1
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +8 -7
- snowflake/snowpark_connect/expression/map_unresolved_function.py +279 -43
- snowflake/snowpark_connect/expression/map_unresolved_star.py +8 -8
- snowflake/snowpark_connect/expression/map_update_fields.py +1 -1
- snowflake/snowpark_connect/expression/typer.py +6 -6
- snowflake/snowpark_connect/proto/control_pb2.py +17 -16
- snowflake/snowpark_connect/proto/control_pb2.pyi +17 -17
- snowflake/snowpark_connect/proto/control_pb2_grpc.py +12 -63
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +15 -14
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +19 -14
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +27 -26
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +74 -68
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +5 -5
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +25 -17
- snowflake/snowpark_connect/relation/map_aggregate.py +72 -47
- snowflake/snowpark_connect/relation/map_catalog.py +2 -2
- snowflake/snowpark_connect/relation/map_column_ops.py +207 -144
- snowflake/snowpark_connect/relation/map_crosstab.py +25 -6
- snowflake/snowpark_connect/relation/map_extension.py +81 -56
- snowflake/snowpark_connect/relation/map_join.py +72 -63
- snowflake/snowpark_connect/relation/map_local_relation.py +35 -20
- snowflake/snowpark_connect/relation/map_map_partitions.py +21 -16
- snowflake/snowpark_connect/relation/map_relation.py +22 -16
- snowflake/snowpark_connect/relation/map_row_ops.py +232 -146
- snowflake/snowpark_connect/relation/map_sample_by.py +15 -8
- snowflake/snowpark_connect/relation/map_show_string.py +42 -5
- snowflake/snowpark_connect/relation/map_sql.py +155 -78
- snowflake/snowpark_connect/relation/map_stats.py +88 -39
- snowflake/snowpark_connect/relation/map_subquery_alias.py +13 -14
- snowflake/snowpark_connect/relation/map_udtf.py +6 -9
- snowflake/snowpark_connect/relation/read/map_read.py +8 -3
- snowflake/snowpark_connect/relation/read/map_read_csv.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_json.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_socket.py +7 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +25 -16
- snowflake/snowpark_connect/relation/read/map_read_text.py +7 -7
- snowflake/snowpark_connect/relation/utils.py +11 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +15 -12
- snowflake/snowpark_connect/relation/write/map_write.py +199 -40
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +3 -2
- snowflake/snowpark_connect/server.py +34 -4
- snowflake/snowpark_connect/type_mapping.py +2 -23
- snowflake/snowpark_connect/utils/cache.py +27 -22
- snowflake/snowpark_connect/utils/context.py +33 -17
- snowflake/snowpark_connect/utils/{attribute_handling.py → identifiers.py} +47 -0
- snowflake/snowpark_connect/utils/session.py +41 -34
- snowflake/snowpark_connect/utils/telemetry.py +1 -2
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/METADATA +5 -3
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/RECORD +67 -64
- snowpark_connect-0.21.0.dist-info/licenses/LICENSE-binary +568 -0
- snowpark_connect-0.21.0.dist-info/licenses/NOTICE-binary +1533 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/analyze_plan/map_tree_string.py
@@ -15,8 +15,9 @@ def map_tree_string(
 ) -> proto_base.AnalyzePlanResponse:
     # TODO: tracking the difference with pyspark in SNOW-1853347
     tree_string = request.tree_string
-    snowpark_df = map_relation(tree_string.plan.root)
-    column_map = snowpark_df._column_map
+    snowpark_df_container = map_relation(tree_string.plan.root)
+    snowpark_df = snowpark_df_container.dataframe
+    column_map = snowpark_df_container.column_map
 
     snowpark_tree_string = snowpark_df._format_schema(
         level=tree_string.level if tree_string.HasField("level") else None,
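The pattern in this hunk repeats across the whole release: map_relation now returns a DataFrameContainer rather than a bare Snowpark DataFrame, and callers unpack it. A minimal sketch of the new calling convention (plan_root is a placeholder for a Spark Connect relation proto; the attribute names are taken from the hunks in this diff):

    # Hypothetical caller, assuming this wheel's internals are importable.
    from snowflake.snowpark_connect.relation.map_relation import map_relation

    container = map_relation(plan_root)   # plan_root: placeholder relation proto
    snowpark_df = container.dataframe     # the wrapped snowpark.DataFrame
    column_map = container.column_map     # Spark-to-Snowpark column-name mapping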
snowflake/snowpark_connect/column_name_handler.py
@@ -12,14 +12,13 @@ from functools import cached_property
 
 from pyspark.errors.exceptions.base import AnalysisException
 
-from snowflake import snowpark
 from snowflake.snowpark import DataFrame
 from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     quote_name_without_upper_casing,
     unquote_if_quoted,
 )
 from snowflake.snowpark._internal.utils import quote_name
-from snowflake.snowpark.types import DataType, StructField, StructType
+from snowflake.snowpark.types import StructType
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.utils.context import get_current_operation_scope
 
@@ -41,64 +40,6 @@ def set_schema_getter(df: DataFrame, get_schema: Callable[[], StructType]) -> None:
     df.__class__ = PatchedDataFrame
 
 
-def with_column_map(
-    result_df: snowpark.DataFrame,
-    spark_column_names: list[str],
-    snowpark_column_names: list[str],
-    snowpark_column_types: list[DataType] = None,
-    column_metadata: dict | None = None,
-    column_qualifiers: list[list[str]] | None = None,
-    parent_column_name_map: ColumnNameMap | None = None,
-) -> snowpark.DataFrame:
-    """
-    Build a mapping from the DataFrame's column names to the Spark column names.
-
-    This is used to track the original column names and handle column naming differences
-    between Spark and Snowpark.
-
-    The elements in result_df.columns and the elements in spark_column_names must be a one-to-one mapping.
-
-    Args:
-        result_df (snowpark.DataFrame): The DataFrame to map.
-        spark_column_names (list[str]): The Spark column names.
-        snowpark_column_names (list[str]): The Snowpark column names.
-        snowpark_column_types (list[DataType], optional): The Snowpark column types. **if provided df.schema will be overridden with inferred schema**
-        column_metadata (dict, optional): Metadata for the columns.
-        column_qualifiers (list[list[str]], optional): Qualifiers for the columns, used to handle table aliases or DataFrame aliases.
-        parent_column_name_map (ColumnNameMap, optional): A ColumnNameMap, that came from the dataframe used to create result_df (parent df)
-
-    Returns:
-        snowpark.DataFrame: The mapped DataFrame.
-    """
-    assert len(snowpark_column_names) == len(
-        spark_column_names
-    ), "Number of Spark column names must match number of columns in DataFrame"
-    result_df._column_map = ColumnNameMap(
-        spark_column_names,
-        snowpark_column_names,
-        column_metadata=column_metadata,
-        column_qualifiers=column_qualifiers,
-        parent_column_name_map=parent_column_name_map,
-    )
-    result_df._table_name = None
-
-    if snowpark_column_types is not None:
-        assert len(snowpark_column_names) == len(
-            snowpark_column_types
-        ), "Number of Snowpark column names and types must match"
-
-        set_schema_getter(
-            result_df,
-            lambda: StructType(
-                [
-                    StructField(n, t, _is_column=False)
-                    for n, t in zip(snowpark_column_names, snowpark_column_types)
-                ]
-            ),
-        )
-    return result_df
-
-
 def make_column_names_snowpark_compatible(
     names: list[str], plan_id: int, offset: int = 0
 ) -> list[str]:
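The removed with_column_map helper is superseded by the new DataFrameContainer added in this release (see dataframe_container.py below). A hedged migration sketch, using only the signatures visible in this diff:

    # 0.20.2: mutate the Snowpark DataFrame in place
    df = with_column_map(df, spark_names, snowpark_names, snowpark_column_types=types)

    # 0.21.0: wrap the DataFrame instead
    from snowflake.snowpark_connect.dataframe_container import DataFrameContainer

    container = DataFrameContainer.create_with_column_mapping(
        dataframe=df,
        spark_column_names=spark_names,
        snowpark_column_names=snowpark_names,
        snowpark_column_types=types,
    )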
@@ -189,7 +130,7 @@ class ColumnNameMap:
         column_qualifiers: Optional qualifiers for the columns, used to handle table aliases or DataFrame aliases.
         parent_column_name_map: parent ColumnNameMap
         """
-        self.columns = []
+        self.columns: list[ColumnNames] = []
         self.spark_to_col = defaultdict(list)
         self.uppercase_spark_to_col = defaultdict(list)
         self.snowpark_to_col = defaultdict(list)
@@ -602,11 +543,11 @@ class ColumnNameMap:
 class JoinColumnNameMap(ColumnNameMap):
     def __init__(
         self,
-
-
+        left_colmap: ColumnNameMap,
+        right_colmap: ColumnNameMap,
     ) -> None:
-        self.left_column_mapping: ColumnNameMap =
-        self.right_column_mapping: ColumnNameMap =
+        self.left_column_mapping: ColumnNameMap = left_colmap
+        self.right_column_mapping: ColumnNameMap = right_colmap
 
     def get_snowpark_column_name_from_spark_column_name(
         self,
snowflake/snowpark_connect/config.py
@@ -9,7 +9,7 @@ import re
 import sys
 import time
 from collections import defaultdict
-from copy import copy
+from copy import copy, deepcopy
 from typing import Any
 
 import jpype
@@ -33,7 +33,7 @@ from snowflake.snowpark_connect.version import VERSION as sas_version
 
 
 def str_to_bool(boolean_str: str) -> bool:
-    assert boolean_str in [
+    assert boolean_str in (
         "True",
         "true",
         "False",
@@ -41,7 +41,7 @@ def str_to_bool(boolean_str: str) -> bool:
         "1",
         "0",
         "",  # This is the default value, equivalent to False.
-    ]
+    ), f"Invalid boolean value: {boolean_str}"
     return boolean_str in ["True", "true", "1"]
 
 
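Expected behavior of the tightened helper, per the accepted-value list above (a sketch, not taken from the package's tests):

    str_to_bool("true")   # True
    str_to_bool("0")      # False
    str_to_bool("")       # False -- the documented default
    str_to_bool("maybe")  # AssertionError: Invalid boolean value: maybe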
@@ -131,6 +131,7 @@ class GlobalConfig:
         "spark.sql.caseSensitive": "false",
         "spark.sql.mapKeyDedupPolicy": "EXCEPTION",
         "spark.sql.ansi.enabled": "false",
+        "spark.sql.legacy.allowHashOnMapType": "false",
         "spark.sql.sources.default": "parquet",
         "spark.Catalog.databaseFilterInformationSchema": "false",
         "spark.sql.parser.quotedRegexColumnNames": "false",
@@ -145,6 +146,7 @@ class GlobalConfig:
         "spark.sql.crossJoin.enabled",
         "spark.sql.caseSensitive",
         "spark.sql.ansi.enabled",
+        "spark.sql.legacy.allowHashOnMapType",
         "spark.Catalog.databaseFilterInformationSchema",
         "spark.sql.parser.quotedRegexColumnNames",
     ]
@@ -250,10 +252,10 @@ SESSION_CONFIG_KEY_WHITELIST = {
     "spark.sql.tvf.allowMultipleTableArguments.enabled",
     "snowpark.connect.sql.passthrough",
     "snowpark.connect.iceberg.external_volume",
-    "snowpark.connect.auto-uppercase.ddl",
-    "snowpark.connect.auto-uppercase.dml",
+    "snowpark.connect.sql.identifiers.auto-uppercase",
     "snowpark.connect.udtf.compatibility_mode",
     "snowpark.connect.views.duplicate_column_names_handling_mode",
+    "enable_snowflake_extension_behavior",
 }
 AZURE_SAS_KEY = re.compile(
     r"^fs\.azure\.sas\.[^\.]+\.[^\.]+\.blob\.core\.windows\.net$"
@@ -271,17 +273,17 @@ class SessionConfig:
     """This class contains the session configuration for the Spark Server."""
 
     default_session_config = {
-        "snowpark.connect.auto-uppercase.ddl": "true",
-        "snowpark.connect.auto-uppercase.dml": "true",
+        "snowpark.connect.sql.identifiers.auto-uppercase": "all_except_columns",
         "snowpark.connect.sql.passthrough": "false",
         "snowpark.connect.udtf.compatibility_mode": "false",
         "snowpark.connect.views.duplicate_column_names_handling_mode": "rename",
         "spark.sql.execution.pythonUDTF.arrow.enabled": "false",
         "spark.sql.tvf.allowMultipleTableArguments.enabled": "true",
+        "enable_snowflake_extension_behavior": "false",
     }
 
     def __init__(self) -> None:
-        self.config = copy(self.default_session_config)
+        self.config = deepcopy(self.default_session_config)
 
     def __getitem__(self, item: str) -> str:
         return self.get(item)
@@ -304,7 +306,13 @@ CONFIG_ALLOWED_VALUES: dict[str, tuple] = {
         "rename",
         "fail",
         "drop",
-    )
+    ),
+    "snowpark.connect.sql.identifiers.auto-uppercase": (
+        "all_except_columns",
+        "only_columns",
+        "all",
+        "none",
+    ),
 }
 
 # Set some default configuration that are necessary for the driver.
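The consolidated key replaces the old auto-uppercase ddl/dml pair and is on the session whitelist above, so it should be settable through the ordinary Spark config API. A hedged usage sketch, assuming a Spark Connect session backed by this server:

    spark.conf.set("snowpark.connect.sql.identifiers.auto-uppercase", "all")
    # Allowed values, per CONFIG_ALLOWED_VALUES above:
    #   "all_except_columns" (the 0.21.0 default), "only_columns", "all", "none"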
@@ -533,7 +541,7 @@ def set_snowflake_parameters(
         value = global_config.default_static_global_config.get(key)
 
         snowpark_name = quote_name_without_upper_casing(value)
-        if auto_uppercase_ddl():
+        if auto_uppercase_non_column_identifiers():
             snowpark_name = snowpark_name.upper()
 
         # Create the schema on demand. Before creating it, however,
@@ -568,9 +576,15 @@ def get_boolean_session_config_param(name: str) -> bool:
     return str_to_bool(session_config[name])
 
 
-def auto_uppercase_ddl() -> bool:
-    return get_boolean_session_config_param("snowpark.connect.auto-uppercase.ddl")
+def auto_uppercase_column_identifiers() -> bool:
+    session_config = sessions_config[get_session_id()]
+    return session_config[
+        "snowpark.connect.sql.identifiers.auto-uppercase"
+    ].lower() in ("all", "only_columns")
 
 
-def auto_uppercase_dml() -> bool:
-    return get_boolean_session_config_param("snowpark.connect.auto-uppercase.dml")
+def auto_uppercase_non_column_identifiers() -> bool:
+    session_config = sessions_config[get_session_id()]
+    return session_config[
+        "snowpark.connect.sql.identifiers.auto-uppercase"
+    ].lower() in ("all", "all_except_columns")
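How the four allowed modes map onto the two new predicates, derived directly from the functions above:

    MODE_EFFECTS = {
        # mode: (uppercase column identifiers, uppercase non-column identifiers)
        "all":                (True,  True),
        "only_columns":       (True,  False),
        "all_except_columns": (False, True),   # 0.21.0 default
        "none":               (False, False),
    }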
snowflake/snowpark_connect/dataframe_container.py (new file)
@@ -0,0 +1,242 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable
+
+from snowflake import snowpark
+from snowflake.snowpark.types import StructField, StructType
+
+if TYPE_CHECKING:
+    from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
+
+
+class DataFrameContainer:
+    """
+    A container class that wraps a Snowpark DataFrame along with additional metadata.
+
+    This class provides a unified interface for managing Snowpark DataFrames along with
+    their column mappings, schema information, and metadata.
+    """
+
+    def __init__(
+        self,
+        dataframe: snowpark.DataFrame,
+        column_map: ColumnNameMap | None = None,
+        table_name: str | None = None,
+        alias: str | None = None,
+        cached_schema_getter: Callable[[], StructType] | None = None,
+    ) -> None:
+        """
+        Initialize a new DataFrameContainer.
+
+        Args:
+            dataframe: The underlying Snowpark DataFrame
+            column_map: Optional column name mapping
+            table_name: Optional table name for the DataFrame
+            alias: Optional alias for the DataFrame
+            cached_schema_getter: Optional function to get cached schema
+        """
+        self._dataframe = dataframe
+        self._column_map = self._create_default_column_map(column_map)
+        self._table_name = table_name
+        self._alias = alias
+
+        if cached_schema_getter is not None:
+            self._apply_cached_schema_getter(cached_schema_getter)
+
+    @classmethod
+    def create_with_column_mapping(
+        cls,
+        dataframe: snowpark.DataFrame,
+        spark_column_names: list[str],
+        snowpark_column_names: list[str],
+        snowpark_column_types: list | None = None,
+        column_metadata: dict | None = None,
+        column_qualifiers: list[list[str]] | None = None,
+        parent_column_name_map: ColumnNameMap | None = None,
+        table_name: str | None = None,
+        alias: str | None = None,
+        cached_schema_getter: Callable[[], StructType] | None = None,
+    ) -> DataFrameContainer:
+        """
+        Create a new container with complete column mapping configuration.
+
+        Args:
+            dataframe: The underlying Snowpark DataFrame
+            spark_column_names: List of Spark column names
+            snowpark_column_names: List of corresponding Snowpark column names
+            snowpark_column_types: Optional list of column types
+            column_metadata: Optional metadata dictionary
+            column_qualifiers: Optional column qualifiers
+            parent_column_name_map: Optional parent column name map
+            table_name: Optional table name
+            alias: Optional alias
+            cached_schema_getter: Optional function to get cached schema
+
+        Returns:
+            A new DataFrameContainer instance
+
+        Raises:
+            AssertionError: If column names and types don't match expected lengths
+        """
+        # Validate inputs
+        cls._validate_column_mapping_inputs(
+            spark_column_names, snowpark_column_names, snowpark_column_types
+        )
+
+        column_map = cls._create_column_map(
+            spark_column_names,
+            snowpark_column_names,
+            column_metadata,
+            column_qualifiers,
+            parent_column_name_map,
+        )
+
+        # Determine the schema getter to use
+        final_schema_getter = None
+
+        if cached_schema_getter is not None:
+            # Use the provided schema getter
+            final_schema_getter = cached_schema_getter
+        elif snowpark_column_types is not None:
+            # Create schema from types and wrap in function
+            schema = cls._create_schema_from_types(
+                snowpark_column_names, snowpark_column_types
+            )
+            if schema is not None:
+
+                def get_schema():
+                    return schema
+
+                final_schema_getter = get_schema
+
+        return cls(
+            dataframe=dataframe,
+            column_map=column_map,
+            table_name=table_name,
+            alias=alias,
+            cached_schema_getter=final_schema_getter,
+        )
+
+    @property
+    def dataframe(self) -> snowpark.DataFrame:
+        """Get the underlying Snowpark DataFrame."""
+        # Ensure the DataFrame has the _column_map attribute for backward compatibility
+        # Some of the snowpark code needs references to _column_map
+        self._dataframe._column_map = self._column_map
+        return self._dataframe
+
+    @property
+    def column_map(self) -> ColumnNameMap:
+        """Get the column name mapping."""
+        return self._column_map
+
+    @column_map.setter
+    def column_map(self, value: ColumnNameMap) -> None:
+        """Set the column name mapping."""
+        self._column_map = value
+
+    @property
+    def table_name(self) -> str | None:
+        """Get the table name."""
+        return self._table_name
+
+    @table_name.setter
+    def table_name(self, value: str | None) -> None:
+        """Set the table name."""
+        self._table_name = value
+
+    @property
+    def alias(self) -> str | None:
+        """Get the alias name."""
+        return self._alias
+
+    @alias.setter
+    def alias(self, value: str | None) -> None:
+        """Set the alias name."""
+        self._alias = value
+
+    def _create_default_column_map(
+        self, column_map: ColumnNameMap | None
+    ) -> ColumnNameMap:
+        """Create a default column map if none provided."""
+        if column_map is not None:
+            return column_map
+
+        from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
+
+        return ColumnNameMap([], [])
+
+    def _apply_cached_schema_getter(
+        self, schema_getter: Callable[[], StructType]
+    ) -> None:
+        """Apply a cached schema getter to the dataframe."""
+        from snowflake.snowpark_connect.column_name_handler import set_schema_getter
+
+        set_schema_getter(self._dataframe, schema_getter)
+
+    @staticmethod
+    def _validate_column_mapping_inputs(
+        spark_column_names: list[str],
+        snowpark_column_names: list[str],
+        snowpark_column_types: list | None = None,
+    ) -> None:
+        """
+        Validate inputs for column mapping creation.
+
+        Raises:
+            AssertionError: If validation fails
+        """
+        assert len(snowpark_column_names) == len(
+            spark_column_names
+        ), "Number of Spark column names must match number of columns in DataFrame"
+
+        if snowpark_column_types is not None:
+            assert len(snowpark_column_names) == len(
+                snowpark_column_types
+            ), "Number of Snowpark column names and types must match"
+
+    @staticmethod
+    def _create_column_map(
+        spark_column_names: list[str],
+        snowpark_column_names: list[str],
+        column_metadata: dict | None = None,
+        column_qualifiers: list[list[str]] | None = None,
+        parent_column_name_map: ColumnNameMap | None = None,
+    ) -> ColumnNameMap:
+        """Create a ColumnNameMap with the provided configuration."""
+        from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
+
+        return ColumnNameMap(
+            spark_column_names,
+            snowpark_column_names,
+            column_metadata=column_metadata,
+            column_qualifiers=column_qualifiers,
+            parent_column_name_map=parent_column_name_map,
+        )
+
+    @staticmethod
+    def _create_schema_from_types(
+        snowpark_column_names: list[str],
+        snowpark_column_types: list | None,
+    ) -> StructType | None:
+        """
+        Create a StructType schema from column names and types.
+
+        Returns:
+            StructType if types are provided, None otherwise
+        """
+        if snowpark_column_types is None:
+            return None
+
+        return StructType(
+            [
+                StructField(name, column_type, _is_column=False)
+                for name, column_type in zip(
+                    snowpark_column_names, snowpark_column_types
+                )
+            ]
+        )
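A hedged usage sketch for the new container (the DataFrame variable and column names are invented for illustration; the calls are the ones defined above):

    from snowflake.snowpark_connect.dataframe_container import DataFrameContainer

    container = DataFrameContainer.create_with_column_mapping(
        dataframe=df,                          # an existing snowpark.DataFrame
        spark_column_names=["id", "name"],     # names as Spark sees them
        snowpark_column_names=['"ID"', '"NAME"'],
    )
    container.dataframe    # re-attaches _column_map to df for backward compatibility
    container.column_map   # the ColumnNameMap built from the two name lists
    container.alias = "t"  # table_name/alias are plain read-write properties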
snowflake/snowpark_connect/execute_plan/map_execution_command.py
@@ -1,21 +1,13 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
-
 from collections import Counter
 
 import pyspark.sql.connect.proto.base_pb2 as proto_base
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
-from snowflake.snowpark._internal.analyzer.analyzer_utils import (
-    quote_name_without_upper_casing,
-)
 from snowflake.snowpark_connect.column_name_handler import ColumnNames
-from snowflake.snowpark_connect.config import (
-    auto_uppercase_ddl,
-    global_config,
-    sessions_config,
-)
+from snowflake.snowpark_connect.config import global_config, sessions_config
 from snowflake.snowpark_connect.constants import SERVER_SIDE_SESSION_ID
 from snowflake.snowpark_connect.execute_plan.utils import pandas_to_arrow_batches_bytes
 from snowflake.snowpark_connect.expression import map_udf
@@ -24,24 +16,23 @@ from snowflake.snowpark_connect.relation.map_relation import map_relation
 from snowflake.snowpark_connect.relation.map_sql import map_sql_to_pandas_df
 from snowflake.snowpark_connect.relation.write.map_write import map_write, map_write_v2
 from snowflake.snowpark_connect.utils.context import get_session_id
+from snowflake.snowpark_connect.utils.identifiers import (
+    spark_to_sf_single_id,
+    spark_to_sf_single_id_with_unquoting,
+)
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
 
 
-def _spark_to_snowflake_single_id(name: str) -> str:
-    name = quote_name_without_upper_casing(name)
-    return name.upper() if auto_uppercase_ddl() else name
-
-
 def _create_column_rename_map(
     columns: list[ColumnNames], rename_duplicated: bool
 ) -> dict:
     if rename_duplicated is False:
         # if we are not renaming duplicated columns, we can just return the original names
         return {
-            col.snowpark_name: _spark_to_snowflake_single_id(col.spark_name)
+            col.snowpark_name: spark_to_sf_single_id(col.spark_name, is_column=True)
             for col in columns
         }
 
@@ -64,7 +55,7 @@ def _create_column_rename_map(
 
     if len(renamed_cols) == 0:
         return {
-            col.snowpark_name: _spark_to_snowflake_single_id(col.spark_name)
+            col.snowpark_name: spark_to_sf_single_id(col.spark_name, is_column=True)
             for col in not_renamed_cols
         }
 
@@ -95,12 +86,9 @@ def map_execution_command(
     match request.plan.command.WhichOneof("command_type"):
         case "create_dataframe_view":
             req = request.plan.command.create_dataframe_view
-            input_df = map_relation(req.input)
-
-            assert hasattr(
-                input_df, "_column_map"
-            ), "input_df does not have the _column_map attribute"
-            column_map = input_df._column_map
+            input_df_container = map_relation(req.input)
+            input_df = input_df_container.dataframe
+            column_map = input_df_container.column_map
 
             session_config = sessions_config[get_session_id()]
             duplicate_column_names_handling_mode = session_config[
@@ -133,7 +121,9 @@ def map_execution_command(
                 view_name = [global_config.spark_sql_globalTempDatabase, req.name]
             else:
                 view_name = [req.name]
-            view_name = [_spark_to_snowflake_single_id(part) for part in view_name]
+            view_name = [
+                spark_to_sf_single_id_with_unquoting(part) for part in view_name
+            ]
 
             if req.replace:
                 input_df.create_or_replace_temp_view(view_name)
snowflake/snowpark_connect/execute_plan/map_execution_root.py
@@ -20,6 +20,7 @@ from snowflake.snowpark._internal.utils import (
     create_or_update_statement_params_with_query_tag,
 )
 from snowflake.snowpark_connect.constants import SERVER_SIDE_SESSION_ID
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.execute_plan.utils import (
     arrow_table_to_arrow_bytes,
     pandas_to_arrow_batches_bytes,
@@ -89,13 +90,16 @@ def to_arrow_batch_iter(result_df: snowpark.DataFrame) -> Iterator[Table]:
 def map_execution_root(
     request: proto_base.ExecutePlanRequest,
 ) -> Iterator[proto_base.ExecutePlanResponse | QueryResult]:
-    result_df = map_relation(request.plan.root)
+    result: DataFrameContainer | pandas.DataFrame = map_relation(request.plan.root)
+    if isinstance(result, pandas.DataFrame):
+        result_df = result
+    else:
+        result_df = result.dataframe
+
     if isinstance(result_df, snowpark.DataFrame):
         snowpark_schema = result_df.schema
-        schema = snowpark_to_proto_type(
-            snowpark_schema, result_df._column_map, result_df
-        )
-        spark_columns = result_df._column_map.get_spark_columns()
+        schema = snowpark_to_proto_type(snowpark_schema, result.column_map, result_df)
+        spark_columns = result.column_map.get_spark_columns()
         if tcm.TCM_MODE:
             # TCM result handling:
             # - small result (only one batch): just return the executePlanResponse
snowflake/snowpark_connect/expression/map_extension.py
@@ -58,7 +58,8 @@ def map_extension(
     from snowflake.snowpark_connect.relation.map_relation import map_relation
 
     with push_evaluating_sql_scope():
-        df = map_relation(extension.subquery_expression.input)
+        df_container = map_relation(extension.subquery_expression.input)
+        df = df_container.dataframe
 
     queries = df.queries["queries"]
     if len(queries) != 1:
snowflake/snowpark_connect/expression/map_unresolved_attribute.py
@@ -17,15 +17,15 @@ from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.expression.typer import ExpressionTyper
 from snowflake.snowpark_connect.typed_column import TypedColumn
-from snowflake.snowpark_connect.utils.attribute_handling import (
-    split_fully_qualified_spark_name,
-)
 from snowflake.snowpark_connect.utils.context import (
     get_is_evaluating_sql,
     get_outer_dataframes,
     get_plan_id_map,
     resolve_lca_alias,
 )
+from snowflake.snowpark_connect.utils.identifiers import (
+    split_fully_qualified_spark_name,
+)
 
 SPARK_QUOTED = re.compile("^(`.*`)$", re.DOTALL)
 
@@ -46,11 +46,12 @@ def map_unresolved_attribute(
 
     if has_plan_id:
         plan_id = exp.unresolved_attribute.plan_id
-        target_df = get_plan_id_map(plan_id)
+        target_df_container = get_plan_id_map(plan_id)
+        target_df = target_df_container.dataframe
         assert (
             target_df is not None
         ), f"resolving an attribute of a unresolved dataframe {plan_id}"
-        column_mapping = target_df._column_map
+        column_mapping = target_df_container.column_map
         typer = ExpressionTyper(target_df)
 
         def get_col(snowpark_name):
@@ -146,8 +147,8 @@ def map_unresolved_attribute(
             name_parts[0], allow_non_exists=True
         )
         if snowpark_name is None:
-            for outer_df in get_outer_dataframes():
-                snowpark_name = outer_df._column_map.get_snowpark_column_name_from_spark_column_name(
+            for outer_df_container in get_outer_dataframes():
+                snowpark_name = outer_df_container.column_map.get_snowpark_column_name_from_spark_column_name(
                     name_parts[0], allow_non_exists=True
                 )
                 if snowpark_name is not None: