snowpark-connect 0.26.0__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/column_name_handler.py +3 -93
- snowflake/snowpark_connect/config.py +99 -4
- snowflake/snowpark_connect/dataframe_container.py +0 -6
- snowflake/snowpark_connect/expression/map_expression.py +31 -1
- snowflake/snowpark_connect/expression/map_sql_expression.py +22 -18
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +22 -26
- snowflake/snowpark_connect/expression/map_unresolved_function.py +28 -10
- snowflake/snowpark_connect/expression/map_unresolved_star.py +2 -3
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/relation/map_extension.py +7 -1
- snowflake/snowpark_connect/relation/map_join.py +62 -258
- snowflake/snowpark_connect/relation/map_map_partitions.py +36 -77
- snowflake/snowpark_connect/relation/map_relation.py +8 -2
- snowflake/snowpark_connect/relation/map_show_string.py +2 -0
- snowflake/snowpark_connect/relation/map_sql.py +413 -15
- snowflake/snowpark_connect/relation/write/map_write.py +195 -114
- snowflake/snowpark_connect/resources_initializer.py +20 -5
- snowflake/snowpark_connect/server.py +20 -18
- snowflake/snowpark_connect/utils/artifacts.py +4 -5
- snowflake/snowpark_connect/utils/concurrent.py +4 -0
- snowflake/snowpark_connect/utils/context.py +41 -1
- snowflake/snowpark_connect/utils/describe_query_cache.py +57 -51
- snowflake/snowpark_connect/utils/identifiers.py +120 -0
- snowflake/snowpark_connect/utils/io_utils.py +21 -1
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +86 -2
- snowflake/snowpark_connect/utils/scala_udf_utils.py +34 -43
- snowflake/snowpark_connect/utils/session.py +16 -26
- snowflake/snowpark_connect/utils/telemetry.py +53 -0
- snowflake/snowpark_connect/utils/udf_utils.py +66 -103
- snowflake/snowpark_connect/utils/udtf_helper.py +17 -7
- snowflake/snowpark_connect/version.py +2 -3
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/RECORD +41 -42
- snowflake/snowpark_connect/hidden_column.py +0 -39
- {snowpark_connect-0.26.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.26.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.26.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/top_level.txt +0 -0
@@ -34,7 +34,6 @@ def map_unresolved_star(
     column_mapping: ColumnNameMap,
     typer: ExpressionTyper,
 ) -> tuple[list[str], TypedColumn]:
-
     if exp.unresolved_star.HasField("unparsed_target"):
         unparsed_target = exp.unresolved_star.unparsed_target
         name_parts = split_fully_qualified_spark_name(unparsed_target)
@@ -103,7 +102,7 @@ def map_unresolved_star(
             prefix_candidate_str = f"{prefix_candidate_str}.{name_parts[i]}"
             prefix_candidate = (
                 column_mapping.get_snowpark_column_name_from_spark_column_name(
-                    prefix_candidate_str, allow_non_exists=True
+                    prefix_candidate_str, allow_non_exists=True
                 )
             )
             if prefix_candidate is None:
@@ -181,7 +180,7 @@ def map_unresolved_star_struct(
             prefix_candidate_str = f"{prefix_candidate_str}.{name_parts[i]}"
             prefix_candidate = (
                 column_mapping.get_snowpark_column_name_from_spark_column_name(
-                    prefix_candidate_str, allow_non_exists=True
+                    prefix_candidate_str, allow_non_exists=True
                 )
             )
             if prefix_candidate is None:
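For context, the hunks above resolve the prefix of a qualified star expression. A minimal PySpark sketch (illustrative only, not taken from this package) of the two shapes map_unresolved_star and map_unresolved_star_struct handle:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, (2, 3))], ["id", "s"]).alias("t")

df.select("t.*").show()  # alias-qualified star: prefix "t" resolved against the column map
df.select("s.*").show()  # struct star: prefix "s" expanded into the struct's fields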
Binary file (snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar): no textual diff shown.
@@ -429,12 +429,18 @@ def map_aggregate(
             if groupings:
                 # Normal GROUP BY with explicit grouping columns
                 result = input_df.group_by(groupings)
-
+            elif not is_group_by_all:
                 # No explicit GROUP BY - this is an aggregate over the entire table
                 # Use a dummy constant that will be excluded from the final result
                 result = input_df.with_column(
                     "__dummy_group__", snowpark_fn.lit(1)
                 ).group_by("__dummy_group__")
+            else:
+                # GROUP BY ALL with only one aggregate column
+                # Snowpark doesn't support GROUP BY ALL
+                # TODO: Change in future with Snowpark Supported arguments or API for GROUP BY ALL
+                result = input_df.group_by()
+
         case snowflake_proto.Aggregate.GROUP_TYPE_ROLLUP:
             result = input_df.rollup(groupings)
         case snowflake_proto.Aggregate.GROUP_TYPE_CUBE:
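The new else branch above covers GROUP BY ALL queries whose SELECT list contains only aggregates, for which Snowpark has no direct equivalent; the fallback is an empty group_by(), i.e. a global aggregation. A minimal sketch of that query shape (assuming a Spark 3.4+ session; not taken from this package):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.createDataFrame([(1, 10), (1, 20), (2, 30)], ["id", "v"]).createOrReplaceTempView("t")

# No non-aggregate columns in the SELECT list, so GROUP BY ALL has no grouping
# keys and collapses to a single global aggregate row.
spark.sql("SELECT sum(v) AS total, count(*) AS n FROM t GROUP BY ALL").show()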
@@ -1,18 +1,13 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
-
+
 from functools import reduce
 
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
-from pyspark.errors.exceptions.base import AnalysisException
 
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
-from snowflake.snowpark._internal.analyzer.analyzer_utils import (
-    quote_name_without_upper_casing,
-    unquote_if_quoted,
-)
 from snowflake.snowpark_connect.column_name_handler import JoinColumnNameMap
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.constants import COLUMN_METADATA_COLLISION_KEY
@@ -22,7 +17,6 @@ from snowflake.snowpark_connect.expression.map_expression import (
     map_single_column_expression,
 )
 from snowflake.snowpark_connect.expression.typer import JoinExpressionTyper
-from snowflake.snowpark_connect.hidden_column import HiddenColumn
 from snowflake.snowpark_connect.relation.map_relation import (
     NATURAL_JOIN_TYPE_BASE,
     map_relation,
@@ -30,6 +24,7 @@ from snowflake.snowpark_connect.relation.map_relation import (
 from snowflake.snowpark_connect.utils.context import (
     push_evaluating_join_condition,
     push_sql_scope,
+    set_plan_id_map,
     set_sql_plan_name,
 )
 from snowflake.snowpark_connect.utils.telemetry import (
@@ -38,9 +33,6 @@ from snowflake.snowpark_connect.utils.telemetry import (
 
 USING_COLUMN_NOT_FOUND_ERROR = "[UNRESOLVED_USING_COLUMN_FOR_JOIN] USING column `{0}` not found on the {1} side of the join. The {1}-side columns: {2}"
 
-DUPLICATED_JOIN_COL_LSUFFIX = "_left"
-DUPLICATED_JOIN_COL_RSUFFIX = "_right"
-
 
 def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
     left_container: DataFrameContainer = map_relation(rel.join.left)
@@ -82,13 +74,6 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
 
     # This handles case sensitivity for using_columns
     case_corrected_right_columns: list[str] = []
-    hidden_columns = set()
-    # Propagate the hidden columns from left/right inputs to the result in case of chained joins
-    if left_container.column_map.hidden_columns:
-        hidden_columns.update(left_container.column_map.hidden_columns)
-
-    if right_container.column_map.hidden_columns:
-        hidden_columns.update(right_container.column_map.hidden_columns)
 
     if rel.join.HasField("join_condition"):
         assert not using_columns
@@ -120,8 +105,8 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
             right=right_input,
            on=join_expression.col,
             how=join_type,
-            lsuffix=
-            rsuffix=
+            lsuffix="_left",
+            rsuffix="_right",
         )
     elif using_columns:
         if any(
@@ -171,24 +156,12 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
             )
         )
 
-        using_columns_snowpark_names = (
-            left_container.column_map.get_snowpark_column_names_from_spark_column_names(
-                list(using_columns), return_first=True
-            )
-        )
-
-        using_columns_snowpark_types = [
-            left_container.dataframe.schema.fields[idx].datatype
-            for idx, col in enumerate(left_container.column_map.get_snowpark_columns())
-            if col in using_columns_snowpark_names
-        ]
-
         # Round trip the using columns through the column map to get the correct names
         # in order to support case sensitivity.
         # TODO: case_corrected_left_columns / case_corrected_right_columns may no longer be required as Snowpark dataframe preserves the column casing now.
-        case_corrected_left_columns = (
-            left_container.column_map.
-
+        case_corrected_left_columns = left_container.column_map.get_spark_column_names_from_snowpark_column_names(
+            left_container.column_map.get_snowpark_column_names_from_spark_column_names(
+                list(using_columns), return_first=True
            )
        )
        case_corrected_right_columns = right_container.column_map.get_spark_column_names_from_snowpark_column_names(
@@ -222,141 +195,28 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
                 (left == right for left, right in snowpark_using_columns),
             ),
             how=join_type,
-            rsuffix=DUPLICATED_JOIN_COL_RSUFFIX,
         )
-        # If we disambiguated the snowpark_using_columns during the join, we need to update 'snowpark_using_columns' to
-        # use the disambiguated names.
-        disambiguated_snowpark_using_columns = []
-
-        # Ignore disambiguation for LEFT SEMI JOIN and LEFT ANTI JOIN because they drop the right columns, so it'll never disambiguate.
-        if join_type in ["leftsemi", "leftanti"]:
-            disambiguated_snowpark_using_columns = snowpark_using_columns
-        else:
-            normalized_joined_columns = [
-                unquote_if_quoted(col) for col in joined_df.columns
-            ]
-            # snowpark_using_columns is a list of tuples of snowpark columns, joined_df.columns is a list of strings of column names
-            for (left, right) in snowpark_using_columns:
-                normalized_left_name = unquote_if_quoted(left.getName())
-                normalized_right_name = unquote_if_quoted(right.getName())
-
-                # are both left and right in joined_df? if not, it's been disambiguated
-                if (
-                    normalized_left_name in normalized_joined_columns
-                    and normalized_right_name in normalized_joined_columns
-                ):
-                    # we want to just add this
-                    disambiguated_snowpark_using_columns.append((left, right))
-                else:
-                    # we need to figure out the disambiguated names and add those - it only disambiguates if left == right
-                    disambiguated_left: snowpark.Column | None = None
-                    disambiguated_right: snowpark.Column | None = None
-
-                    for col in normalized_joined_columns:
-                        quoted_col = f'"{col}"'
-                        # get the column name and cross check it to see if it ends with the og name
-                        if col.endswith(normalized_left_name) and col.startswith("l_"):
-                            disambiguated_left = joined_df[quoted_col]
-                        elif col.endswith(normalized_right_name) and col.startswith(
-                            "r_"
-                        ):
-                            disambiguated_right = joined_df[quoted_col]
-
-                        # If we have both disambiguated columns, we can break out of the loop to save processing time
-                        if (
-                            disambiguated_left is not None
-                            and disambiguated_right is not None
-                        ):
-                            break
-                    if disambiguated_left is None or disambiguated_right is None:
-                        raise AnalysisException(
-                            f"Disambiguated columns not found for {normalized_left_name} and {normalized_right_name}."
-                        )
-                    disambiguated_snowpark_using_columns.append(
-                        (disambiguated_left, disambiguated_right)
-                    )
-
         # For outer joins, we need to preserve join keys from both sides using COALESCE
-        """
-        CHANGES:
-        - IF CASE
-            - Need to drop the using columns
-            - Need to create the hidden_columns DF with the using columns from right and left
-        - ELSE CASE
-            - Need to drop the right side using columns
-            - Need to create the hidden_columns DF with the using columns from right
-        """
         if join_type == "full_outer":
             coalesced_columns = []
-
+            columns_to_drop = []
+            for i, (left_col, right_col) in enumerate(snowpark_using_columns):
                 # Use the original user-specified column name to preserve case sensitivity
-
-
-
-
-                coalesced_col = snowpark_fn.coalesce(
-                    disambiguated_left_col, disambiguated_right_col
-                ).alias(left_col.get_name())
-                coalesced_columns.append(coalesced_col)
-
-                # Create HiddenColumn objects for each hidden column
-                hidden_left = HiddenColumn(
-                    hidden_snowpark_name=disambiguated_left_col.getName(),
-                    spark_name=case_corrected_left_columns[i],
-                    visible_snowpark_name=left_col.get_name(),
-                    qualifiers=left_container.column_map.get_qualifier_for_spark_column(
-                        case_corrected_left_columns[i]
-                    ),
-                    original_position=left_container.column_map.get_spark_columns().index(
-                        case_corrected_left_columns[i]
-                    ),
-                )
-
-                hidden_right = HiddenColumn(
-                    hidden_snowpark_name=disambiguated_right_col.getName(),
-                    spark_name=case_corrected_right_columns[i],
-                    visible_snowpark_name=left_col.get_name(),
-                    qualifiers=right_container.column_map.get_qualifier_for_spark_column(
-                        case_corrected_right_columns[i]
-                    ),
-                    original_position=right_container.column_map.get_spark_columns().index(
-                        case_corrected_right_columns[i]
-                    ),
-                )
-                hidden_columns.update(
-                    [
-                        hidden_left,
-                        hidden_right,
-                    ]
+                original_column_name = rel.join.using_columns[i]
+                coalesced_col = snowpark_fn.coalesce(left_col, right_col).alias(
+                    original_column_name
                 )
+                coalesced_columns.append(coalesced_col)
+                columns_to_drop.extend([left_col, right_col])
 
-            # All non-hidden columns (not including the coalesced columns)
             other_columns = [
                 snowpark_fn.col(col_name)
                 for col_name in joined_df.columns
-                if col_name not in [col.
+                if col_name not in [col.getName() for col in columns_to_drop]
             ]
             result = joined_df.select(coalesced_columns + other_columns)
-
         else:
             result = joined_df.drop(*(right for _, right in snowpark_using_columns))
-            # We never run into the disambiguation case unless it's a full outer join.
-            for i, (left_col, right_col) in enumerate(
-                disambiguated_snowpark_using_columns
-            ):
-                # Only right side columns are hidden
-                hidden_col = HiddenColumn(
-                    hidden_snowpark_name=right_col.getName(),
-                    spark_name=case_corrected_right_columns[i],
-                    visible_snowpark_name=left_col.getName(),
-                    qualifiers=right_container.column_map.get_qualifier_for_spark_column(
-                        case_corrected_right_columns[i]
-                    ),
-                    original_position=right_container.column_map.get_spark_columns().index(
-                        case_corrected_right_columns[i]
-                    ),
-                )
-                hidden_columns.add(hidden_col)
     else:
         if join_type != "cross" and not global_config.spark_sql_crossJoin_enabled:
             raise SparkException.implicit_cartesian_product("inner")
@@ -370,110 +230,35 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
         # - LEFT SEMI JOIN: Returns left rows that have matches in right table (no right columns)
         # - LEFT ANTI JOIN: Returns left rows that have NO matches in right table (no right columns)
         # Both preserve only the columns from the left DataFrame without adding any columns from the right.
-        spark_cols_after_join = left_container.column_map.get_spark_columns()
-        snowpark_cols_after_join = left_container.column_map.get_snowpark_columns()
-        snowpark_col_types = [
-            f.datatype for f in left_container.dataframe.schema.fields
-        ]
+        spark_cols_after_join: list[str] = left_container.column_map.get_spark_columns()
         qualifiers = left_container.column_map.get_qualifiers()
-    elif join_type == "full_outer" and using_columns:
-        # We want the coalesced columns to be first, followed by all the left and right columns (excluding using columns)
-        spark_cols_after_join: list[str] = []
-        snowpark_cols_after_join: list[str] = []
-        snowpark_col_types: list[str] = []
-
-        left_container_snowpark_columns = (
-            left_container.column_map.get_snowpark_columns()
-        )
-        right_container_snowpark_columns = (
-            right_container.column_map.get_snowpark_columns()
-        )
-
-        qualifiers = []
-        for i in range(len(case_corrected_left_columns)):
-            spark_cols_after_join.append(case_corrected_left_columns[i])
-            snowpark_cols_after_join.append(using_columns_snowpark_names[i])
-            snowpark_col_types.append(using_columns_snowpark_types[i])
-            qualifiers.append([])
-
-        # Handle adding left and right columns, excluding the using columns
-        for i, spark_col in enumerate(left_container.column_map.get_spark_columns()):
-            if (
-                spark_col not in case_corrected_left_columns
-                or spark_col in left_container.column_map.get_spark_columns()[:i]
-            ):
-                spark_cols_after_join.append(spark_col)
-                snowpark_cols_after_join.append(left_container_snowpark_columns[i])
-                qualifiers.append(
-                    left_container.column_map.get_qualifier_for_spark_column(spark_col)
-                )
-
-                snowpark_col_types.append(
-                    left_container.dataframe.schema.fields[i].datatype
-                )
-
-        for i, spark_col in enumerate(right_container.column_map.get_spark_columns()):
-            if (
-                spark_col not in case_corrected_right_columns
-                or spark_col in right_container.column_map.get_spark_columns()[:i]
-            ):
-                spark_cols_after_join.append(spark_col)
-                snowpark_cols_after_join.append(right_container_snowpark_columns[i])
-                qualifiers.append(
-                    right_container.column_map.get_qualifier_for_spark_column(spark_col)
-                )
-
-                snowpark_col_types.append(
-                    right_container.dataframe.schema.fields[i].datatype
-                )
-
     else:
-
-
-
-
+        # Add Spark columns and plan_ids from left DF
+        spark_cols_after_join: list[str] = list(
+            left_container.column_map.get_spark_columns()
+        ) + [
+            spark_col
+            for i, spark_col in enumerate(
+                right_container.column_map.get_spark_columns()
+            )
+            if spark_col not in case_corrected_right_columns
+            or spark_col
+            in right_container.column_map.get_spark_columns()[
+                :i
+            ]  # this is to make sure we only remove the column once
        ]
 
-        qualifiers = left_container.column_map.get_qualifiers()
-
-
-
-        for i, spark_col in enumerate(right_container.column_map.get_spark_columns()):
-            if (
-                spark_col not in case_corrected_right_columns
-                or spark_col in right_container.column_map.get_spark_columns()[:i]
-            ):
-                spark_cols_after_join.append(spark_col)
-                snowpark_cols_after_join.append(right_df_snowpark_columns[i])
-                snowpark_col_types.append(
-                    right_container.dataframe.schema.fields[i].datatype
-                )
-
-                qualifiers.append(
-                    right_container.column_map.get_qualifier_for_spark_column(spark_col)
-                )
-
-        snowpark_cols_after_join_deduplicated = []
-        snowpark_cols_after_join_counter = Counter(snowpark_cols_after_join)
-        seen_duplicated_columns = set()
-
-        for col in snowpark_cols_after_join:
-            if snowpark_cols_after_join_counter[col] == 2:
-                # This means that the same column exists twice in the joined df, likely due to a self-join and
-                # we need to lsuffix and rsuffix to the names of both columns, similar to what Snowpark did under the hood.
-
-                suffix = (
-                    DUPLICATED_JOIN_COL_RSUFFIX
-                    if col in seen_duplicated_columns
-                    else DUPLICATED_JOIN_COL_LSUFFIX
+        qualifiers = list(left_container.column_map.get_qualifiers()) + [
+            right_container.column_map.get_qualifier_for_spark_column(spark_col)
+            for i, spark_col in enumerate(
+                right_container.column_map.get_spark_columns()
            )
-
-
-
-
-
-
-            snowpark_cols_after_join_deduplicated.append(col)
+            if spark_col not in case_corrected_right_columns
+            or spark_col
+            in right_container.column_map.get_spark_columns()[
+                :i
+            ]  # this is to make sure we only remove the column once]
+        ]
 
     column_metadata = {}
     if left_container.column_map.column_metadata:
@@ -502,13 +287,33 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
     result_container = DataFrameContainer.create_with_column_mapping(
         dataframe=result,
         spark_column_names=spark_cols_after_join,
-        snowpark_column_names=
+        snowpark_column_names=result.columns,
         column_metadata=column_metadata,
         column_qualifiers=qualifiers,
-        hidden_columns=hidden_columns,
-        snowpark_column_types=snowpark_col_types,
     )
 
+    # Fix for USING join column references with different plan IDs
+    # After a USING join, references to the right dataframe's columns should resolve
+    # to the result dataframe that contains the merged columns
+    if (
+        using_columns
+        and rel.join.right.HasField("common")
+        and rel.join.right.common.HasField("plan_id")
+    ):
+        right_plan_id = rel.join.right.common.plan_id
+        set_plan_id_map(right_plan_id, result_container)
+
+    # For FULL OUTER joins, we also need to map the left dataframe's plan_id
+    # since both columns are replaced with a coalesced column
+    if (
+        using_columns
+        and join_type == "full_outer"
+        and rel.join.left.HasField("common")
+        and rel.join.left.common.HasField("plan_id")
+    ):
+        left_plan_id = rel.join.left.common.plan_id
+        set_plan_id_map(left_plan_id, result_container)
+
     if rel.join.using_columns:
         # When join 'using_columns', the 'join columns' should go first in result DF.
         idxs_to_shift = [
@@ -540,7 +345,6 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
         cached_schema_getter=lambda: snowpark.types.StructType(
             reorder(original_df.schema.fields)
         ),
-        hidden_columns=hidden_columns,
     )
 
     return result_container
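To illustrate the rewritten full-outer USING branch: in Spark semantics a USING join exposes a single join column, and for a full outer join that column is COALESCE(left key, right key), followed by the remaining columns of both sides. A small PySpark sketch (illustrative only, not from this package):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "l_val"])
right = spark.createDataFrame([(2, "x"), (3, "y")], ["id", "r_val"])

# Result columns: id (coalesced join key), l_val, r_val.
# The row for id=3 exists only on the right, so the coalesce keeps its key.
left.join(right, on="id", how="full").show()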
@@ -8,28 +8,20 @@ from pyspark.sql.connect.proto.expressions_pb2 import CommonInlineUserDefinedFunction
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
 from snowflake.snowpark.types import StructType
-from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.constants import MAP_IN_ARROW_EVAL_TYPE
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 from snowflake.snowpark_connect.type_mapping import proto_to_snowpark_type
-from snowflake.snowpark_connect.utils.
-from snowflake.snowpark_connect.utils.
-
-
-    require_creating_udf_in_sproc,
-    udf_check,
-)
-from snowflake.snowpark_connect.utils.udf_utils import (
-    ProcessCommonInlineUserDefinedFunction,
+from snowflake.snowpark_connect.utils.context import map_partitions_depth
+from snowflake.snowpark_connect.utils.pandas_udtf_utils import (
+    create_pandas_udtf,
+    create_pandas_udtf_with_arrow,
 )
+from snowflake.snowpark_connect.utils.udf_helper import udf_check
 from snowflake.snowpark_connect.utils.udtf_helper import (
     create_pandas_udtf_in_sproc,
     require_creating_udtf_in_sproc,
 )
-from snowflake.snowpark_connect.utils.udxf_import_utils import (
-    get_python_udxf_import_files,
-)
 
 
 def map_map_partitions(
@@ -41,18 +33,10 @@ def map_map_partitions(
     This is a simple wrapper around the `mapInPandas` method in Snowpark.
     """
     input_container = map_relation(rel.map_partitions.input)
-    input_df = input_container.dataframe
     udf_proto = rel.map_partitions.func
     udf_check(udf_proto)
 
-
-    if (
-        udf_proto.WhichOneof("function") == "python_udf"
-        and udf_proto.python_udf.eval_type == MAP_IN_ARROW_EVAL_TYPE
-    ):
-        return _map_in_arrow_with_pandas_udtf(input_container, udf_proto)
-    else:
-        return _map_partitions_with_udf(input_df, udf_proto)
+    return _map_with_pandas_udtf(input_container, udf_proto)
 
 
 def _call_udtf(
@@ -71,12 +55,17 @@ def _call_udtf(
 
     udtf_columns = input_df.columns + [snowpark_fn.col("_DUMMY_PARTITION_KEY")]
 
-
-    snowpark_fn.
-        partition_by=[snowpark_fn.col("_DUMMY_PARTITION_KEY")]
-    )
+    tfc = snowpark_fn.call_table_function(udtf_name, *udtf_columns).over(
+        partition_by=[snowpark_fn.col("_DUMMY_PARTITION_KEY")]
    )
 
+    # Use map_partitions_depth only when mapping non nested map_partitions
+    # When mapping chained functions additional column casting is necessary
+    if map_partitions_depth() == 1:
+        result_df_with_dummy = input_df_with_dummy.join_table_function(tfc)
+    else:
+        result_df_with_dummy = input_df_with_dummy.select(tfc)
+
     output_cols = [field.name for field in return_type.fields]
 
     # Only return the output columns.
@@ -90,7 +79,7 @@
     )
 
 
-def
+def _map_with_pandas_udtf(
     input_df_container: DataFrameContainer,
     udf_proto: CommonInlineUserDefinedFunction,
 ) -> snowpark.DataFrame:
@@ -100,59 +89,29 @@ def _map_in_arrow_with_pandas_udtf(
     input_df = input_df_container.dataframe
     input_schema = input_df.schema
     spark_column_names = input_df_container.column_map.get_spark_columns()
-    return_type = proto_to_snowpark_type(
+    return_type = proto_to_snowpark_type(
+        udf_proto.python_udf.output_type
+        if udf_proto.WhichOneof("function") == "python_udf"
+        else udf_proto.scalar_scala_udf.outputType
+    )
+
+    # Check if this is mapInArrow (eval_type == 207)
+    map_in_arrow = (
+        udf_proto.WhichOneof("function") == "python_udf"
+        and udf_proto.python_udf.eval_type == MAP_IN_ARROW_EVAL_TYPE
+    )
     if require_creating_udtf_in_sproc(udf_proto):
         udtf_name = create_pandas_udtf_in_sproc(
             udf_proto, spark_column_names, input_schema, return_type
         )
     else:
-
-
-
-
+        if map_in_arrow:
+            map_udtf = create_pandas_udtf_with_arrow(
+                udf_proto, spark_column_names, input_schema, return_type
+            )
+        else:
+            map_udtf = create_pandas_udtf(
+                udf_proto, spark_column_names, input_schema, return_type
+            )
+        udtf_name = map_udtf.name
     return _call_udtf(udtf_name, input_df, return_type)
-
-
-def _map_partitions_with_udf(
-    input_df: snowpark.DataFrame, udf_proto
-) -> snowpark.DataFrame:
-    """
-    Original UDF-based approach for non-mapInArrow map_partitions cases.
-    """
-    input_column_names = input_df.columns
-    kwargs = {
-        "common_inline_user_defined_function": udf_proto,
-        "input_types": [f.datatype for f in input_df.schema.fields],
-        "called_from": "map_map_partitions",
-        "udf_name": "spark_map_partitions_udf",
-        "input_column_names": input_column_names,
-        "replace": True,
-        "return_type": proto_to_snowpark_type(
-            udf_proto.python_udf.output_type
-            if udf_proto.WhichOneof("function") == "python_udf"
-            else udf_proto.scalar_scala_udf.outputType
-        ),
-        "udf_packages": global_config.get("snowpark.connect.udf.packages", ""),
-        "udf_imports": get_python_udxf_import_files(input_df.session),
-    }
-
-    if require_creating_udf_in_sproc(udf_proto):
-        snowpark_udf = process_udf_in_sproc(**kwargs)
-    else:
-        udf_processor = ProcessCommonInlineUserDefinedFunction(**kwargs)
-        udf = udf_processor.create_udf()
-        snowpark_udf = SnowparkUDF(
-            name=udf.name,
-            input_types=udf._input_types,
-            return_type=udf._return_type,
-            original_return_type=None,
-        )
-    udf_column_name = "UDF_OUTPUT"
-    snowpark_columns = [snowpark_fn.col(name) for name in input_df.columns]
-    result = input_df.select(snowpark_fn.call_udf(snowpark_udf.name, *snowpark_columns))
-    return DataFrameContainer.create_with_column_mapping(
-        dataframe=result,
-        spark_column_names=[udf_column_name],
-        snowpark_column_names=[udf_column_name],
-        snowpark_column_types=[snowpark_udf.return_type],
-    )
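For reference, the client-side APIs this module maps are DataFrame.mapInPandas and DataFrame.mapInArrow; per the hunks above, both are now translated into a pandas UDTF partitioned by a dummy key on the Snowflake side. A minimal mapInPandas sketch (illustrative only, not from this package):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 2.0), (2, 3.5)], ["id", "v"])

def double_v(batches):
    # mapInPandas receives an iterator of pandas.DataFrame batches per partition
    for pdf in batches:
        pdf["v"] = pdf["v"] * 2
        yield pdf

df.mapInPandas(double_v, schema="id long, v double").show()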
@@ -15,6 +15,8 @@ from snowflake.snowpark_connect.utils.cache import (
 from snowflake.snowpark_connect.utils.context import (
     get_plan_id_map,
     get_session_id,
+    not_resolving_fun_args,
+    push_map_partitions,
     push_operation_scope,
     set_is_aggregate_function,
     set_plan_id_map,
@@ -149,7 +151,10 @@ def map_relation(
         case "drop_na":
             result = map_row_ops.map_dropna(rel)
         case "extension":
-
+            # Extensions can be passed as function args, and we need to reset the context here.
+            # Matters only for resolving alias expressions in the extensions rel.
+            with not_resolving_fun_args():
+                result = map_extension.map_extension(rel)
         case "fill_na":
             result = map_row_ops.map_fillna(rel)
         case "filter":
@@ -180,7 +185,8 @@ def map_relation(
             )
             return cached_df
         case "map_partitions":
-
+            with push_map_partitions():
+                result = map_map_partitions.map_map_partitions(rel)
         case "offset":
             result = map_row_ops.map_offset(rel)
         case "project":