snowpark-connect 0.33.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/column_name_handler.py +42 -56
- snowflake/snowpark_connect/config.py +9 -0
- snowflake/snowpark_connect/expression/literal.py +12 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +6 -0
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +147 -63
- snowflake/snowpark_connect/expression/map_unresolved_function.py +31 -28
- snowflake/snowpark_connect/relation/map_aggregate.py +156 -255
- snowflake/snowpark_connect/relation/map_column_ops.py +14 -0
- snowflake/snowpark_connect/relation/map_join.py +364 -234
- snowflake/snowpark_connect/relation/map_sql.py +309 -150
- snowflake/snowpark_connect/relation/read/map_read.py +9 -1
- snowflake/snowpark_connect/relation/read/map_read_csv.py +19 -2
- snowflake/snowpark_connect/relation/read/map_read_json.py +3 -0
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +3 -0
- snowflake/snowpark_connect/relation/read/map_read_text.py +4 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
- snowflake/snowpark_connect/relation/read/utils.py +41 -0
- snowflake/snowpark_connect/relation/utils.py +4 -2
- snowflake/snowpark_connect/relation/write/map_write.py +65 -17
- snowflake/snowpark_connect/utils/context.py +0 -14
- snowflake/snowpark_connect/utils/expression_transformer.py +163 -0
- snowflake/snowpark_connect/utils/session.py +0 -4
- snowflake/snowpark_connect/utils/udf_helper.py +1 -0
- snowflake/snowpark_connect/utils/udtf_helper.py +3 -0
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/RECORD +35 -38
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +0 -16
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +0 -1281
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +0 -203
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +0 -202
- {snowpark_connect-0.33.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.33.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.33.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/map_join.py

@@ -2,6 +2,7 @@
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
 import dataclasses
+from collections.abc import Callable
 from enum import Enum
 from functools import reduce
 from typing import Optional
@@ -11,12 +12,12 @@ from pyspark.errors import AnalysisException

 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
+from snowflake.snowpark import DataFrame
 from snowflake.snowpark.types import StructField, StructType
 from snowflake.snowpark_connect.column_name_handler import (
     JoinColumnNameMap,
     make_unique_snowpark_name,
 )
-from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.constants import COLUMN_METADATA_COLLISION_KEY
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
@@ -60,9 +61,7 @@ class JoinInfo:
     join_type: str
     condition_type: ConditionType
     join_columns: Optional[list[str]]
-
-    def has_join_condition(self) -> bool:
-        return self.condition_type == ConditionType.JOIN_CONDITION
+    just_left_columns: bool

     def is_using_columns(self):
         return self.condition_type == ConditionType.USING_COLUMNS
@@ -77,196 +76,26 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
     left_container = filter_metadata_columns(left_container)
     right_container = filter_metadata_columns(right_container)

-
-
+    # if there are any conflicting snowpark columns, this is the time to rename them
+    left_container, right_container = _disambiguate_snowpark_columns(
+        left_container, right_container, rel
+    )

     join_info = _get_join_info(rel, left_container, right_container)
-    join_type = join_info.join_type
-
-    if join_info.has_join_condition():
-        left_columns = list(left_container.column_map.spark_to_col.keys())
-        right_columns = list(right_container.column_map.spark_to_col.keys())
-
-        # All PySpark join types are in the format of JOIN_TYPE_XXX.
-        # We remove the first 10 characters (JOIN_TYPE_) and replace all underscores with spaces to match the exception.
-        pyspark_join_type = relation_proto.Join.JoinType.Name(rel.join.join_type)[
-            10:
-        ].replace("_", " ")
-        with push_sql_scope(), push_evaluating_join_condition(
-            pyspark_join_type, left_columns, right_columns
-        ):
-            if left_container.alias is not None:
-                set_sql_plan_name(left_container.alias, rel.join.left.common.plan_id)
-            if right_container.alias is not None:
-                set_sql_plan_name(right_container.alias, rel.join.right.common.plan_id)
-            _, join_expression = map_single_column_expression(
-                rel.join.join_condition,
-                column_mapping=JoinColumnNameMap(
-                    left_container.column_map,
-                    right_container.column_map,
-                ),
-                typer=JoinExpressionTyper(left_input, right_input),
-            )
-        result: snowpark.DataFrame = left_input.join(
-            right=right_input,
-            on=join_expression.col,
-            how="inner" if join_info.join_type == "cross" else join_info.join_type,
-            lsuffix="_left",
-            rsuffix="_right",
-        )
-    elif join_info.is_using_columns():
-        # TODO: disambiguate snowpark columns for all join condition types
-        # disambiguation temporarily done only for using_columns/natural joins to reduce changes
-        left_container, right_container = _disambiguate_snowpark_columns(
-            left_container, right_container
-        )
-        left_input = left_container.dataframe
-        right_input = right_container.dataframe
-
-        join_columns = join_info.join_columns

-
-
-
-
-                container.column_map.get_snowpark_column_name_from_spark_column_name(
-                    column, allow_non_exists=True, return_first=True
-                )
-                is None
-            ):
-                exception = AnalysisException(
-                    USING_COLUMN_NOT_FOUND_ERROR.format(
-                        column, side, container.column_map.get_spark_columns()
-                    )
-                )
-                attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
-                raise exception
-
-        for col in join_columns:
-            _validate_using_column(col, left_container, "left")
-            _validate_using_column(col, right_container, "right")
-
-        # We cannot assume that Snowpark will have the same names for left and right columns,
-        # so we convert ["a", "b"] into (left["a"] == right["a"] & left["b"] == right["b"]),
-        # then drop right["a"] and right["b"].
-        snowpark_using_columns = [
-            (
-                left_input[
-                    left_container.column_map.get_snowpark_column_name_from_spark_column_name(
-                        spark_name, return_first=True
-                    )
-                ],
-                right_input[
-                    right_container.column_map.get_snowpark_column_name_from_spark_column_name(
-                        spark_name, return_first=True
-                    )
-                ],
+    match join_info.condition_type:
+        case ConditionType.JOIN_CONDITION:
+            result_container = _join_using_condition(
+                left_container, right_container, join_info, rel
             )
-
-
-
-            right=right_input,
-            on=reduce(
-                snowpark.Column.__and__,
-                (left == right for left, right in snowpark_using_columns),
-            ),
-            how=join_type,
-        )
-        # For outer joins, we need to preserve join keys from both sides using COALESCE
-        if join_type == "full_outer":
-            coalesced_columns = []
-            columns_to_drop = []
-            for i, (left_col, right_col) in enumerate(snowpark_using_columns):
-                # Use the original user-specified column name to preserve case sensitivity
-                original_column_name = rel.join.using_columns[i]
-                coalesced_col = snowpark_fn.coalesce(left_col, right_col).alias(
-                    original_column_name
-                )
-                coalesced_columns.append(coalesced_col)
-                columns_to_drop.extend([left_col, right_col])
-
-            other_columns = [
-                snowpark_fn.col(col_name)
-                for col_name in joined_df.columns
-                if col_name not in [col.getName() for col in columns_to_drop]
-            ]
-            result = joined_df.select(coalesced_columns + other_columns)
-        else:
-            result = joined_df.drop(*(right for _, right in snowpark_using_columns))
-    else:
-        if join_type != "cross" and not global_config.spark_sql_crossJoin_enabled:
-            exception = SparkException.implicit_cartesian_product("inner")
-            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
-            raise exception
-        # For outer joins without a condition, we need to use a TRUE condition
-        # to match Spark's behavior.
-        if join_type in ["left", "right", "full_outer"]:
-            result: snowpark.DataFrame = left_input.join(
-                right=right_input,
-                on=snowpark_fn.lit(True),
-                how=join_type,
-            )
-        else:
-            result: snowpark.DataFrame = left_input.join(
-                right=right_input,
-                how=join_type,
-            )
-
-    if join_type in ["leftanti", "leftsemi"]:
-        # Join types that only return columns from the left side:
-        # - LEFT SEMI JOIN: Returns left rows that have matches in right table (no right columns)
-        # - LEFT ANTI JOIN: Returns left rows that have NO matches in right table (no right columns)
-        # Both preserve only the columns from the left DataFrame without adding any columns from the right.
-        spark_cols_after_join: list[str] = left_container.column_map.get_spark_columns()
-        qualifiers = left_container.column_map.get_qualifiers()
-    else:
-        if not join_info.is_using_columns():
-            spark_cols_after_join: list[str] = (
-                left_container.column_map.get_spark_columns()
-                + right_container.column_map.get_spark_columns()
+        case ConditionType.USING_COLUMNS:
+            result_container = _join_using_columns(
+                left_container, right_container, join_info
             )
-
-
-
+        case _:
+            result_container = _join_unconditionally(
+                left_container, right_container, join_info
             )
-        else:
-            # get columns after join
-            joined_columns = left_container.column_map.get_columns_after_join(
-                right_container.column_map, join_info.join_columns
-            )
-            spark_cols_after_join: list[str] = [c.spark_name for c in joined_columns]
-            qualifiers: list[set[ColumnQualifier]] = [
-                c.qualifiers for c in joined_columns
-            ]
-
-    column_metadata = dict(left_container.column_map.column_metadata or {})
-    if right_container.column_map.column_metadata:
-        for key, value in right_container.column_map.column_metadata.items():
-            if key not in column_metadata:
-                column_metadata[key] = value
-            else:
-                # In case of collision, use snowpark's column's expr_id as prefix.
-                # this is a temporary solution until SNOW-1926440 is resolved.
-                try:
-                    snowpark_name = right_container.column_map.get_snowpark_column_name_from_spark_column_name(
-                        key
-                    )
-                    expr_id = right_input[snowpark_name]._expression.expr_id
-                    updated_key = COLUMN_METADATA_COLLISION_KEY.format(
-                        expr_id=expr_id, key=snowpark_name
-                    )
-                    column_metadata[updated_key] = value
-                except Exception:
-                    # ignore any errors that happens while fetching the metadata
-                    pass
-
-    result_container = DataFrameContainer.create_with_column_mapping(
-        dataframe=result,
-        spark_column_names=spark_cols_after_join,
-        snowpark_column_names=result.columns,
-        column_metadata=column_metadata,
-        column_qualifiers=qualifiers,
-    )

     # Fix for USING join column references with different plan IDs
     # After a USING join, references to the right dataframe's columns should resolve
@@ -283,47 +112,266 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
     # since both columns are replaced with a coalesced column
     if (
         join_info.is_using_columns()
-        and join_type == "full_outer"
+        and join_info.join_type == "full_outer"
         and rel.join.left.HasField("common")
         and rel.join.left.common.HasField("plan_id")
     ):
         left_plan_id = rel.join.left.common.plan_id
         set_plan_id_map(left_plan_id, result_container)

-
-
-
-
-
+    return result_container
+
+
+def _join_unconditionally(
+    left_container: DataFrameContainer,
+    right_container: DataFrameContainer,
+    info: JoinInfo,
+) -> DataFrameContainer:
+    if info.join_type != "cross" and not global_config.spark_sql_crossJoin_enabled:
+        exception = SparkException.implicit_cartesian_product("inner")
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
+
+    left_input = left_container.dataframe
+    right_input = right_container.dataframe
+    join_type = info.join_type
+
+    # For outer joins without a condition, we need to use a TRUE condition
+    # to match Spark's behavior.
+    result: snowpark.DataFrame = left_input.join(
+        right=right_input,
+        on=snowpark_fn.lit(True)
+        if join_type in ["left", "right", "full_outer"]
+        else None,
+        how=join_type,
+    )
+
+    columns = left_container.column_map.columns + right_container.column_map.columns
+    column_metadata = _combine_metadata(left_container, right_container)
+
+    if info.just_left_columns:
+        columns = left_container.column_map.columns
+        column_metadata = left_container.column_map.column_metadata
+        result = result.select(*left_container.column_map.get_snowpark_columns())
+
+    snowpark_columns = [c.snowpark_name for c in columns]
+
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=result,
+        spark_column_names=[c.spark_name for c in columns],
+        snowpark_column_names=snowpark_columns,
+        column_metadata=column_metadata,
+        column_qualifiers=[c.qualifiers for c in columns],
+        cached_schema_getter=_build_joined_schema(
+            snowpark_columns, left_input, right_input
+        ),
+    )
+
+
+def _join_using_columns(
+    left_container: DataFrameContainer,
+    right_container: DataFrameContainer,
+    info: JoinInfo,
+) -> DataFrameContainer:
+    join_columns = info.join_columns
+
+    def _validate_using_column(
+        column: str, container: DataFrameContainer, side: str
+    ) -> None:
+        if (
+            container.column_map.get_snowpark_column_name_from_spark_column_name(
+                column, allow_non_exists=True, return_first=True
+            )
+            is None
+        ):
+            exception = AnalysisException(
+                USING_COLUMN_NOT_FOUND_ERROR.format(
+                    column, side, container.column_map.get_spark_columns()
+                )
+            )
+            attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+            raise exception
+
+    for col in join_columns:
+        _validate_using_column(col, left_container, "left")
+        _validate_using_column(col, right_container, "right")
+
+    left_input = left_container.dataframe
+    right_input = right_container.dataframe
+
+    # The inputs will have different snowpark names for the same spark name,
+    # so we convert ["a", "b"] into (left["a"] == right["a"] & left["b"] == right["b"]),
+    # then drop right["a"] and right["b"].
+    snowpark_using_columns = [
+        (
+            snowpark_fn.col(
+                left_container.column_map.get_snowpark_column_name_from_spark_column_name(
+                    spark_name, return_first=True
+                )
+            ),
+            snowpark_fn.col(
+                right_container.column_map.get_snowpark_column_name_from_spark_column_name(
+                    spark_name, return_first=True
+                )
+            ),
         )
+        for spark_name in join_columns
+    ]
+
+    # this is a condition join, so it will contain left + right columns
+    # we need to postprocess this later to have a correct projection
+    joined_df = left_input.join(
+        right=right_input,
+        on=reduce(
+            snowpark.Column.__and__,
+            (left == right for left, right in snowpark_using_columns),
+        ),
+        how=info.join_type,
+    )
+
+    # figure out default column ordering after the join
+    columns = left_container.column_map.get_columns_after_join(
+        right_container.column_map, join_columns, info.join_type
+    )
+
+    # For outer joins, we need to preserve join keys from both sides using COALESCE
+    if info.join_type == "full_outer":
+        coalesced_columns = []
+        coalesced_column_names = []
+        for i, (left_col, right_col) in enumerate(snowpark_using_columns):
+            # spark uses the left side spark name
+            spark_name = columns[i].spark_name
+            new_snowpark_name = make_unique_snowpark_name(spark_name)
+            coalesced_col = snowpark_fn.coalesce(left_col, right_col).alias(
+                new_snowpark_name
+            )
+            coalesced_columns.append(coalesced_col)
+            coalesced_column_names.append((spark_name, new_snowpark_name))
+
+        # join columns need to be replaced, so we need the original names for schema lookup later
+        snowpark_names_for_schema_lookup = [c.snowpark_name for c in columns]
+
+        # we need to use the coalesced columns instead of the left-side join columns
+        columns = columns[len(join_columns) :]

-
-
-
-
+        non_join_columns = [snowpark_fn.col(c.snowpark_name) for c in columns]
+        result = joined_df.select(coalesced_columns + non_join_columns)
+
+        spark_names = [spark_name for spark_name, _ in coalesced_column_names] + [
+            c.spark_name for c in columns
+        ]
+        snowpark_names = [
+            snowpark_name for _, snowpark_name in coalesced_column_names
+        ] + [c.snowpark_name for c in columns]
+        qualifiers = ([set()] * len(join_columns)) + [c.qualifiers for c in columns]

-
-
-
+        return DataFrameContainer.create_with_column_mapping(
+            dataframe=result,
+            spark_column_names=spark_names,
+            snowpark_column_names=snowpark_names,
+            column_metadata=_combine_metadata(left_container, right_container),
+            column_qualifiers=qualifiers,
+            cached_schema_getter=_build_joined_schema(
+                snowpark_names_for_schema_lookup,
+                left_input,
+                right_input,
+                snowpark_names,
+            ),
         )

-
-
+    if info.just_left_columns:
+        # we just need the left columns
+        columns = columns[: len(left_container.column_map.columns)]
+        snowpark_columns = [c.snowpark_name for c in columns]
+        result = joined_df.select(*snowpark_columns)
+
         return DataFrameContainer.create_with_column_mapping(
-            dataframe=
-            spark_column_names=
-            snowpark_column_names=
-
+            dataframe=result,
+            spark_column_names=[c.spark_name for c in columns],
+            snowpark_column_names=snowpark_columns,
+            column_metadata=left_container.column_map.column_metadata,
+            column_qualifiers=[c.qualifiers for c in columns],
+            cached_schema_getter=_build_joined_schema(
+                snowpark_columns, left_input, right_input
             ),
-
-
-
-
-
+        )
+
+    snowpark_columns = [c.snowpark_name for c in columns]
+    result = joined_df.select(*snowpark_columns)
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=result,
+        spark_column_names=[c.spark_name for c in columns],
+        snowpark_column_names=snowpark_columns,
+        column_metadata=_combine_metadata(left_container, right_container),
+        column_qualifiers=[c.qualifiers for c in columns],
+        cached_schema_getter=_build_joined_schema(
+            snowpark_columns, left_input, right_input
+        ),
+    )
+
+
+def _join_using_condition(
+    left_container: DataFrameContainer,
+    right_container: DataFrameContainer,
+    info: JoinInfo,
+    rel: relation_proto.Relation,
+) -> DataFrameContainer:
+    left_columns = left_container.column_map.get_spark_columns()
+    right_columns = right_container.column_map.get_spark_columns()
+
+    left_input = left_container.dataframe
+    right_input = right_container.dataframe
+
+    # All PySpark join types are in the format of JOIN_TYPE_XXX.
+    # We remove the first 10 characters (JOIN_TYPE_) and replace all underscores with spaces to match the exception.
+    pyspark_join_type = relation_proto.Join.JoinType.Name(rel.join.join_type)[
+        10:
+    ].replace("_", " ")
+    with push_sql_scope(), push_evaluating_join_condition(
+        pyspark_join_type, left_columns, right_columns
+    ):
+        if left_container.alias is not None:
+            set_sql_plan_name(left_container.alias, rel.join.left.common.plan_id)
+        if right_container.alias is not None:
+            set_sql_plan_name(right_container.alias, rel.join.right.common.plan_id)
+        # resolve join condition expression
+        _, join_expression = map_single_column_expression(
+            rel.join.join_condition,
+            column_mapping=JoinColumnNameMap(
+                left_container.column_map,
+                right_container.column_map,
             ),
+            typer=JoinExpressionTyper(left_input, right_input),
         )

-
+    result: snowpark.DataFrame = left_input.join(
+        right=right_input,
+        on=join_expression.col,
+        how=info.join_type,
+    )
+
+    # column order is already correct, so we just take the left + right side list
+    columns = left_container.column_map.columns + right_container.column_map.columns
+    column_metadata = _combine_metadata(left_container, right_container)
+
+    if info.just_left_columns:
+        # we just need left-side columns
+        columns = left_container.column_map.columns
+        result = result.select(*[c.snowpark_name for c in columns])
+        column_metadata = left_container.column_map.column_metadata
+
+    snowpark_columns = [c.snowpark_name for c in columns]
+
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=result,
+        spark_column_names=[c.spark_name for c in columns],
+        snowpark_column_names=snowpark_columns,
+        column_metadata=column_metadata,
+        column_qualifiers=[c.qualifiers for c in columns],
+        cached_schema_getter=_build_joined_schema(
+            snowpark_columns, left_input, right_input
+        ),
+    )


 def _get_join_info(
@@ -372,6 +420,10 @@ def _get_join_info(
     has_join_condition = rel.join.HasField("join_condition")
     is_using_columns = bool(join_columns)

+    if join_type == "cross" and has_join_condition:
+        # if the user provided any condition, it's no longer a cross join
+        join_type = "inner"
+
     if has_join_condition:
         assert not is_using_columns

@@ -381,11 +433,17 @@ def _get_join_info(
     elif is_using_columns:
         condition_type = ConditionType.USING_COLUMNS

-    return
+    # Join types that only return columns from the left side:
+    # - LEFT SEMI JOIN: Returns left rows that have matches in right table (no right columns)
+    # - LEFT ANTI JOIN: Returns left rows that have NO matches in right table (no right columns)
+    # Both preserve only the columns from the left DataFrame without adding any columns from the right.
+    just_left_columns = join_type in ["leftanti", "leftsemi"]
+
+    return JoinInfo(join_type, condition_type, join_columns, just_left_columns)


 def _disambiguate_snowpark_columns(
-    left: DataFrameContainer, right: DataFrameContainer
+    left: DataFrameContainer, right: DataFrameContainer, rel: relation_proto.Relation
 ) -> tuple[DataFrameContainer, DataFrameContainer]:
     conflicting_snowpark_columns = left.column_map.get_conflicting_snowpark_columns(
         right.column_map
@@ -394,14 +452,24 @@ def _disambiguate_snowpark_columns(
     if not conflicting_snowpark_columns:
         return left, right

-
-
-
-
+    left_plan = rel.join.left.common.plan_id
+    right_plan = rel.join.right.common.plan_id
+
+    if left_plan == right_plan:
+        # don't overwrite plan_id map for self joins
+        right_plan = None
+
+    # rename and create new right container
+    # TODO: rename both sides after SNOW-2382499
+    return left, _disambiguate_container(
+        right, conflicting_snowpark_columns, right_plan
+    )


 def _disambiguate_container(
-    container: DataFrameContainer,
+    container: DataFrameContainer,
+    conflicting_snowpark_columns: set[str],
+    plan_id: Optional[int],
 ) -> DataFrameContainer:
     column_map = container.column_map
     disambiguated_columns = []
@@ -420,25 +488,87 @@ def _disambiguate_container(

     disambiguated_df = container.dataframe.select(*disambiguated_columns)

-    def
-
-
-
-
-
-
-
-            new_fields.append(
-                StructField(name, f.datatype, nullable=f.nullable, _is_column=True)
-            )
-        return StructType(new_fields)
+    def _schema_getter() -> StructType:
+        fields = container.dataframe.schema.fields
+        return StructType(
+            [
+                StructField(name, fields[i].datatype, fields[i].nullable)
+                for i, name in enumerate(disambiguated_snowpark_names)
+            ]
+        )

-
+    disambiguated_container = DataFrameContainer.create_with_column_mapping(
         dataframe=disambiguated_df,
         spark_column_names=column_map.get_spark_columns(),
         snowpark_column_names=disambiguated_snowpark_names,
         column_metadata=column_map.column_metadata,
         column_qualifiers=column_map.get_qualifiers(),
         table_name=container.table_name,
-        cached_schema_getter=
+        cached_schema_getter=_schema_getter,
     )
+
+    # since we just renamed some snowpark columns, we need to update the dataframe container for the given plan_id
+    # TODO: is there a better way to do this?
+    if plan_id is not None:
+        set_plan_id_map(plan_id, disambiguated_container)
+
+    return disambiguated_container
+
+
+def _combine_metadata(
+    left_container: DataFrameContainer, right_container: DataFrameContainer
+) -> dict:
+    column_metadata = dict(left_container.column_map.column_metadata or {})
+    if right_container.column_map.column_metadata:
+        for key, value in right_container.column_map.column_metadata.items():
+            if key not in column_metadata:
+                column_metadata[key] = value
+            else:
+                # In case of collision, use snowpark's column's expr_id as prefix.
+                # this is a temporary solution until SNOW-1926440 is resolved.
+                try:
+                    snowpark_name = right_container.column_map.get_snowpark_column_name_from_spark_column_name(
+                        key
+                    )
+                    expr_id = right_container.dataframe[
+                        snowpark_name
+                    ]._expression.expr_id
+                    updated_key = COLUMN_METADATA_COLLISION_KEY.format(
+                        expr_id=expr_id, key=snowpark_name
+                    )
+                    column_metadata[updated_key] = value
+                except Exception:
+                    # ignore any errors that happens while fetching the metadata
+                    pass
+    return column_metadata
+
+
+def _build_joined_schema(
+    snowpark_columns: list[str],
+    left_input: DataFrame,
+    right_input: DataFrame,
+    target_snowpark_columns: Optional[list[str]] = None,
+) -> Callable[[], StructType]:
+    """
+    Builds a lazy schema for the joined dataframe, based on the given snowpark_columns and input dataframes.
+    In case of full outer joins, we need a separate target_snowpark_columns, since join columns will have different
+    names in the output than in any input.
+    """
+
+    def _schema_getter() -> StructType:
+        all_fields = left_input.schema.fields + right_input.schema.fields
+        fields: dict[str, StructField] = {f.name: f for f in all_fields}
+        target_names = target_snowpark_columns or snowpark_columns
+
+        assert len(snowpark_columns) == len(target_names)
+
+        return StructType(
+            [
+                StructField(
+                    target_names[i], fields[name].datatype, fields[name].nullable
+                )
+                for i, name in enumerate(snowpark_columns)
+            ]
+        )
+
+    return _schema_getter