snowpark-connect 0.27.0__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/column_name_handler.py +3 -93
- snowflake/snowpark_connect/config.py +99 -1
- snowflake/snowpark_connect/dataframe_container.py +0 -6
- snowflake/snowpark_connect/expression/map_expression.py +22 -7
- snowflake/snowpark_connect/expression/map_sql_expression.py +22 -18
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +4 -26
- snowflake/snowpark_connect/expression/map_unresolved_function.py +12 -3
- snowflake/snowpark_connect/expression/map_unresolved_star.py +2 -3
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/relation/map_extension.py +14 -10
- snowflake/snowpark_connect/relation/map_join.py +62 -258
- snowflake/snowpark_connect/relation/map_relation.py +5 -1
- snowflake/snowpark_connect/relation/map_sql.py +353 -16
- snowflake/snowpark_connect/relation/write/map_write.py +171 -110
- snowflake/snowpark_connect/resources_initializer.py +20 -5
- snowflake/snowpark_connect/server.py +16 -17
- snowflake/snowpark_connect/utils/concurrent.py +4 -0
- snowflake/snowpark_connect/utils/describe_query_cache.py +57 -51
- snowflake/snowpark_connect/utils/identifiers.py +120 -0
- snowflake/snowpark_connect/utils/io_utils.py +21 -1
- snowflake/snowpark_connect/utils/scala_udf_utils.py +34 -43
- snowflake/snowpark_connect/utils/session.py +16 -26
- snowflake/snowpark_connect/utils/telemetry.py +53 -0
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/RECORD +34 -35
- snowflake/snowpark_connect/hidden_column.py +0 -39
- {snowpark_connect-0.27.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/map_join.py

@@ -1,18 +1,13 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
-
+
 from functools import reduce

 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
-from pyspark.errors.exceptions.base import AnalysisException

 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
-from snowflake.snowpark._internal.analyzer.analyzer_utils import (
-    quote_name_without_upper_casing,
-    unquote_if_quoted,
-)
 from snowflake.snowpark_connect.column_name_handler import JoinColumnNameMap
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.constants import COLUMN_METADATA_COLLISION_KEY
@@ -22,7 +17,6 @@ from snowflake.snowpark_connect.expression.map_expression import (
     map_single_column_expression,
 )
 from snowflake.snowpark_connect.expression.typer import JoinExpressionTyper
-from snowflake.snowpark_connect.hidden_column import HiddenColumn
 from snowflake.snowpark_connect.relation.map_relation import (
     NATURAL_JOIN_TYPE_BASE,
     map_relation,
@@ -30,6 +24,7 @@ from snowflake.snowpark_connect.relation.map_relation import (
 from snowflake.snowpark_connect.utils.context import (
     push_evaluating_join_condition,
     push_sql_scope,
+    set_plan_id_map,
     set_sql_plan_name,
 )
 from snowflake.snowpark_connect.utils.telemetry import (
@@ -38,9 +33,6 @@ from snowflake.snowpark_connect.utils.telemetry import (

 USING_COLUMN_NOT_FOUND_ERROR = "[UNRESOLVED_USING_COLUMN_FOR_JOIN] USING column `{0}` not found on the {1} side of the join. The {1}-side columns: {2}"

-DUPLICATED_JOIN_COL_LSUFFIX = "_left"
-DUPLICATED_JOIN_COL_RSUFFIX = "_right"
-

 def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
     left_container: DataFrameContainer = map_relation(rel.join.left)
@@ -82,13 +74,6 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:

     # This handles case sensitivity for using_columns
     case_corrected_right_columns: list[str] = []
-    hidden_columns = set()
-    # Propagate the hidden columns from left/right inputs to the result in case of chained joins
-    if left_container.column_map.hidden_columns:
-        hidden_columns.update(left_container.column_map.hidden_columns)
-
-    if right_container.column_map.hidden_columns:
-        hidden_columns.update(right_container.column_map.hidden_columns)

     if rel.join.HasField("join_condition"):
         assert not using_columns
@@ -120,8 +105,8 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
             right=right_input,
             on=join_expression.col,
             how=join_type,
-            lsuffix=DUPLICATED_JOIN_COL_LSUFFIX,
-            rsuffix=DUPLICATED_JOIN_COL_RSUFFIX,
+            lsuffix="_left",
+            rsuffix="_right",
         )
     elif using_columns:
         if any(
@@ -171,24 +156,12 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
             )
         )

-        using_columns_snowpark_names = (
-            left_container.column_map.get_snowpark_column_names_from_spark_column_names(
-                list(using_columns), return_first=True
-            )
-        )
-
-        using_columns_snowpark_types = [
-            left_container.dataframe.schema.fields[idx].datatype
-            for idx, col in enumerate(left_container.column_map.get_snowpark_columns())
-            if col in using_columns_snowpark_names
-        ]
-
         # Round trip the using columns through the column map to get the correct names
         # in order to support case sensitivity.
         # TODO: case_corrected_left_columns / case_corrected_right_columns may no longer be required as Snowpark dataframe preserves the column casing now.
-        case_corrected_left_columns = (
-            left_container.column_map.
-
+        case_corrected_left_columns = left_container.column_map.get_spark_column_names_from_snowpark_column_names(
+            left_container.column_map.get_snowpark_column_names_from_spark_column_names(
+                list(using_columns), return_first=True
             )
         )
         case_corrected_right_columns = right_container.column_map.get_spark_column_names_from_snowpark_column_names(
@@ -222,141 +195,28 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
                 (left == right for left, right in snowpark_using_columns),
             ),
             how=join_type,
-            rsuffix=DUPLICATED_JOIN_COL_RSUFFIX,
         )
-        # If we disambiguated the snowpark_using_columns during the join, we need to update 'snowpark_using_columns' to
-        # use the disambiguated names.
-        disambiguated_snowpark_using_columns = []
-
-        # Ignore disambiguation for LEFT SEMI JOIN and LEFT ANTI JOIN because they drop the right columns, so it'll never disambiguate.
-        if join_type in ["leftsemi", "leftanti"]:
-            disambiguated_snowpark_using_columns = snowpark_using_columns
-        else:
-            normalized_joined_columns = [
-                unquote_if_quoted(col) for col in joined_df.columns
-            ]
-            # snowpark_using_columns is a list of tuples of snowpark columns, joined_df.columns is a list of strings of column names
-            for (left, right) in snowpark_using_columns:
-                normalized_left_name = unquote_if_quoted(left.getName())
-                normalized_right_name = unquote_if_quoted(right.getName())
-
-                # are both left and right in joined_df? if not, it's been disambiguated
-                if (
-                    normalized_left_name in normalized_joined_columns
-                    and normalized_right_name in normalized_joined_columns
-                ):
-                    # we want to just add this
-                    disambiguated_snowpark_using_columns.append((left, right))
-                else:
-                    # we need to figure out the disambiguated names and add those - it only disambiguates if left == right
-                    disambiguated_left: snowpark.Column | None = None
-                    disambiguated_right: snowpark.Column | None = None
-
-                    for col in normalized_joined_columns:
-                        quoted_col = f'"{col}"'
-                        # get the column name and cross check it to see if it ends with the og name
-                        if col.endswith(normalized_left_name) and col.startswith("l_"):
-                            disambiguated_left = joined_df[quoted_col]
-                        elif col.endswith(normalized_right_name) and col.startswith(
-                            "r_"
-                        ):
-                            disambiguated_right = joined_df[quoted_col]
-
-                        # If we have both disambiguated columns, we can break out of the loop to save processing time
-                        if (
-                            disambiguated_left is not None
-                            and disambiguated_right is not None
-                        ):
-                            break
-                    if disambiguated_left is None or disambiguated_right is None:
-                        raise AnalysisException(
-                            f"Disambiguated columns not found for {normalized_left_name} and {normalized_right_name}."
-                        )
-                    disambiguated_snowpark_using_columns.append(
-                        (disambiguated_left, disambiguated_right)
-                    )
-
         # For outer joins, we need to preserve join keys from both sides using COALESCE
-        """
-        CHANGES:
-        - IF CASE
-            - Need to drop the using columns
-            - Need to create the hidden_columns DF with the using columns from right and left
-        - ELSE CASE
-            - Need to drop the right side using columns
-            - Need to create the hidden_columns DF with the using columns from right
-        """
         if join_type == "full_outer":
             coalesced_columns = []
-
+            columns_to_drop = []
+            for i, (left_col, right_col) in enumerate(snowpark_using_columns):
                 # Use the original user-specified column name to preserve case sensitivity
-
-
-
-
-                coalesced_col = snowpark_fn.coalesce(
-                    disambiguated_left_col, disambiguated_right_col
-                ).alias(left_col.get_name())
-                coalesced_columns.append(coalesced_col)
-
-                # Create HiddenColumn objects for each hidden column
-                hidden_left = HiddenColumn(
-                    hidden_snowpark_name=disambiguated_left_col.getName(),
-                    spark_name=case_corrected_left_columns[i],
-                    visible_snowpark_name=left_col.get_name(),
-                    qualifiers=left_container.column_map.get_qualifier_for_spark_column(
-                        case_corrected_left_columns[i]
-                    ),
-                    original_position=left_container.column_map.get_spark_columns().index(
-                        case_corrected_left_columns[i]
-                    ),
-                )
-
-                hidden_right = HiddenColumn(
-                    hidden_snowpark_name=disambiguated_right_col.getName(),
-                    spark_name=case_corrected_right_columns[i],
-                    visible_snowpark_name=left_col.get_name(),
-                    qualifiers=right_container.column_map.get_qualifier_for_spark_column(
-                        case_corrected_right_columns[i]
-                    ),
-                    original_position=right_container.column_map.get_spark_columns().index(
-                        case_corrected_right_columns[i]
-                    ),
-                )
-                hidden_columns.update(
-                    [
-                        hidden_left,
-                        hidden_right,
-                    ]
+                original_column_name = rel.join.using_columns[i]
+                coalesced_col = snowpark_fn.coalesce(left_col, right_col).alias(
+                    original_column_name
                 )
+                coalesced_columns.append(coalesced_col)
+                columns_to_drop.extend([left_col, right_col])

-            # All non-hidden columns (not including the coalesced columns)
             other_columns = [
                 snowpark_fn.col(col_name)
                 for col_name in joined_df.columns
-                if col_name not in [col.
+                if col_name not in [col.getName() for col in columns_to_drop]
             ]
             result = joined_df.select(coalesced_columns + other_columns)
-
         else:
             result = joined_df.drop(*(right for _, right in snowpark_using_columns))
-            # We never run into the disambiguation case unless it's a full outer join.
-            for i, (left_col, right_col) in enumerate(
-                disambiguated_snowpark_using_columns
-            ):
-                # Only right side columns are hidden
-                hidden_col = HiddenColumn(
-                    hidden_snowpark_name=right_col.getName(),
-                    spark_name=case_corrected_right_columns[i],
-                    visible_snowpark_name=left_col.getName(),
-                    qualifiers=right_container.column_map.get_qualifier_for_spark_column(
-                        case_corrected_right_columns[i]
-                    ),
-                    original_position=right_container.column_map.get_spark_columns().index(
-                        case_corrected_right_columns[i]
-                    ),
-                )
-                hidden_columns.add(hidden_col)
     else:
         if join_type != "cross" and not global_config.spark_sql_crossJoin_enabled:
             raise SparkException.implicit_cartesian_product("inner")
@@ -370,110 +230,35 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
         # - LEFT SEMI JOIN: Returns left rows that have matches in right table (no right columns)
         # - LEFT ANTI JOIN: Returns left rows that have NO matches in right table (no right columns)
         # Both preserve only the columns from the left DataFrame without adding any columns from the right.
-        spark_cols_after_join = left_container.column_map.get_spark_columns()
-        snowpark_cols_after_join = left_container.column_map.get_snowpark_columns()
-        snowpark_col_types = [
-            f.datatype for f in left_container.dataframe.schema.fields
-        ]
+        spark_cols_after_join: list[str] = left_container.column_map.get_spark_columns()
         qualifiers = left_container.column_map.get_qualifiers()
-    elif join_type == "full_outer" and using_columns:
-        # We want the coalesced columns to be first, followed by all the left and right columns (excluding using columns)
-        spark_cols_after_join: list[str] = []
-        snowpark_cols_after_join: list[str] = []
-        snowpark_col_types: list[str] = []
-
-        left_container_snowpark_columns = (
-            left_container.column_map.get_snowpark_columns()
-        )
-        right_container_snowpark_columns = (
-            right_container.column_map.get_snowpark_columns()
-        )
-
-        qualifiers = []
-        for i in range(len(case_corrected_left_columns)):
-            spark_cols_after_join.append(case_corrected_left_columns[i])
-            snowpark_cols_after_join.append(using_columns_snowpark_names[i])
-            snowpark_col_types.append(using_columns_snowpark_types[i])
-            qualifiers.append([])
-
-        # Handle adding left and right columns, excluding the using columns
-        for i, spark_col in enumerate(left_container.column_map.get_spark_columns()):
-            if (
-                spark_col not in case_corrected_left_columns
-                or spark_col in left_container.column_map.get_spark_columns()[:i]
-            ):
-                spark_cols_after_join.append(spark_col)
-                snowpark_cols_after_join.append(left_container_snowpark_columns[i])
-                qualifiers.append(
-                    left_container.column_map.get_qualifier_for_spark_column(spark_col)
-                )
-
-                snowpark_col_types.append(
-                    left_container.dataframe.schema.fields[i].datatype
-                )
-
-        for i, spark_col in enumerate(right_container.column_map.get_spark_columns()):
-            if (
-                spark_col not in case_corrected_right_columns
-                or spark_col in right_container.column_map.get_spark_columns()[:i]
-            ):
-                spark_cols_after_join.append(spark_col)
-                snowpark_cols_after_join.append(right_container_snowpark_columns[i])
-                qualifiers.append(
-                    right_container.column_map.get_qualifier_for_spark_column(spark_col)
-                )
-
-                snowpark_col_types.append(
-                    right_container.dataframe.schema.fields[i].datatype
-                )
-
     else:
-
-
-
-
+        # Add Spark columns and plan_ids from left DF
+        spark_cols_after_join: list[str] = list(
+            left_container.column_map.get_spark_columns()
+        ) + [
+            spark_col
+            for i, spark_col in enumerate(
+                right_container.column_map.get_spark_columns()
+            )
+            if spark_col not in case_corrected_right_columns
+            or spark_col
+            in right_container.column_map.get_spark_columns()[
+                :i
+            ]  # this is to make sure we only remove the column once
         ]

-        qualifiers = left_container.column_map.get_qualifiers()
-
-
-
-        for i, spark_col in enumerate(right_container.column_map.get_spark_columns()):
-            if (
-                spark_col not in case_corrected_right_columns
-                or spark_col in right_container.column_map.get_spark_columns()[:i]
-            ):
-                spark_cols_after_join.append(spark_col)
-                snowpark_cols_after_join.append(right_df_snowpark_columns[i])
-                snowpark_col_types.append(
-                    right_container.dataframe.schema.fields[i].datatype
-                )
-
-                qualifiers.append(
-                    right_container.column_map.get_qualifier_for_spark_column(spark_col)
-                )
-
-        snowpark_cols_after_join_deduplicated = []
-        snowpark_cols_after_join_counter = Counter(snowpark_cols_after_join)
-        seen_duplicated_columns = set()
-
-        for col in snowpark_cols_after_join:
-            if snowpark_cols_after_join_counter[col] == 2:
-                # This means that the same column exists twice in the joined df, likely due to a self-join and
-                # we need to lsuffix and rsuffix to the names of both columns, similar to what Snowpark did under the hood.
-
-                suffix = (
-                    DUPLICATED_JOIN_COL_RSUFFIX
-                    if col in seen_duplicated_columns
-                    else DUPLICATED_JOIN_COL_LSUFFIX
+        qualifiers = list(left_container.column_map.get_qualifiers()) + [
+            right_container.column_map.get_qualifier_for_spark_column(spark_col)
+            for i, spark_col in enumerate(
+                right_container.column_map.get_spark_columns()
             )
-
-
-
-
-
-                snowpark_cols_after_join_deduplicated.append(col)
+            if spark_col not in case_corrected_right_columns
+            or spark_col
+            in right_container.column_map.get_spark_columns()[
+                :i
+            ]  # this is to make sure we only remove the column once]
+        ]

     column_metadata = {}
     if left_container.column_map.column_metadata:
@@ -502,13 +287,33 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
     result_container = DataFrameContainer.create_with_column_mapping(
         dataframe=result,
         spark_column_names=spark_cols_after_join,
-        snowpark_column_names=
+        snowpark_column_names=result.columns,
         column_metadata=column_metadata,
         column_qualifiers=qualifiers,
-        hidden_columns=hidden_columns,
-        snowpark_column_types=snowpark_col_types,
     )

+    # Fix for USING join column references with different plan IDs
+    # After a USING join, references to the right dataframe's columns should resolve
+    # to the result dataframe that contains the merged columns
+    if (
+        using_columns
+        and rel.join.right.HasField("common")
+        and rel.join.right.common.HasField("plan_id")
+    ):
+        right_plan_id = rel.join.right.common.plan_id
+        set_plan_id_map(right_plan_id, result_container)
+
+    # For FULL OUTER joins, we also need to map the left dataframe's plan_id
+    # since both columns are replaced with a coalesced column
+    if (
+        using_columns
+        and join_type == "full_outer"
+        and rel.join.left.HasField("common")
+        and rel.join.left.common.HasField("plan_id")
+    ):
+        left_plan_id = rel.join.left.common.plan_id
+        set_plan_id_map(left_plan_id, result_container)
+
     if rel.join.using_columns:
         # When join 'using_columns', the 'join columns' should go first in result DF.
         idxs_to_shift = [
@@ -540,7 +345,6 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
         cached_schema_getter=lambda: snowpark.types.StructType(
             reorder(original_df.schema.fields)
         ),
-        hidden_columns=hidden_columns,
     )

     return result_container
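Taken together, the map_join changes remove the HiddenColumn/disambiguation machinery and build the Spark-visible result directly: for a full outer USING join the join keys are coalesced into a single column aliased to the user-specified name, and the new set_plan_id_map calls let column references made through the right (and, for full outer joins, the left) input dataframe resolve against the joined result. A minimal PySpark sketch of the client-visible behavior this targets (the session, dataframes, and column names below are invented for illustration; this is not snowpark-connect code):

# Illustrative PySpark sketch of USING-join semantics the rewritten map_join aligns with.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

left = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "l_val"])
right = spark.createDataFrame([(2, "x"), (3, "y")], ["id", "r_val"])

# USING-style join: the join column appears only once in the result.
inner = left.join(right, on="id", how="inner")
print(inner.columns)  # ['id', 'l_val', 'r_val']

# Full outer USING join: Spark exposes a single 'id' column that behaves like
# COALESCE(left.id, right.id); map_join now builds that coalesced column explicitly.
full = left.join(right, on="id", how="full_outer")
full.orderBy("id").show()

# Referencing a right-side column through the original dataframe still works after
# the join; the plan_id remapping in map_join supports this resolution path.
full.select(right["r_val"]).show()

Aliasing the coalesced key to rel.join.using_columns[i], rather than to a Snowpark-internal name, is what keeps the user's original column casing in the result schema.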
snowflake/snowpark_connect/relation/map_relation.py

@@ -15,6 +15,7 @@ from snowflake.snowpark_connect.utils.cache import (
 from snowflake.snowpark_connect.utils.context import (
     get_plan_id_map,
     get_session_id,
+    not_resolving_fun_args,
     push_map_partitions,
     push_operation_scope,
     set_is_aggregate_function,
@@ -150,7 +151,10 @@ def map_relation(
         case "drop_na":
             result = map_row_ops.map_dropna(rel)
         case "extension":
-            result = map_extension.map_extension(rel)
+            # Extensions can be passed as function args, and we need to reset the context here.
+            # Matters only for resolving alias expressions in the extensions rel.
+            with not_resolving_fun_args():
+                result = map_extension.map_extension(rel)
         case "fill_na":
             result = map_row_ops.map_fillna(rel)
         case "filter":