snowpark-connect 0.33.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)
  1. snowflake/snowpark_connect/column_name_handler.py +42 -56
  2. snowflake/snowpark_connect/config.py +9 -0
  3. snowflake/snowpark_connect/expression/literal.py +12 -12
  4. snowflake/snowpark_connect/expression/map_sql_expression.py +6 -0
  5. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +147 -63
  6. snowflake/snowpark_connect/expression/map_unresolved_function.py +31 -28
  7. snowflake/snowpark_connect/relation/map_aggregate.py +156 -255
  8. snowflake/snowpark_connect/relation/map_column_ops.py +14 -0
  9. snowflake/snowpark_connect/relation/map_join.py +364 -234
  10. snowflake/snowpark_connect/relation/map_sql.py +309 -150
  11. snowflake/snowpark_connect/relation/read/map_read.py +9 -1
  12. snowflake/snowpark_connect/relation/read/map_read_csv.py +19 -2
  13. snowflake/snowpark_connect/relation/read/map_read_json.py +3 -0
  14. snowflake/snowpark_connect/relation/read/map_read_parquet.py +3 -0
  15. snowflake/snowpark_connect/relation/read/map_read_text.py +4 -0
  16. snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
  17. snowflake/snowpark_connect/relation/read/utils.py +41 -0
  18. snowflake/snowpark_connect/relation/utils.py +4 -2
  19. snowflake/snowpark_connect/relation/write/map_write.py +65 -17
  20. snowflake/snowpark_connect/utils/context.py +0 -14
  21. snowflake/snowpark_connect/utils/expression_transformer.py +163 -0
  22. snowflake/snowpark_connect/utils/session.py +0 -4
  23. snowflake/snowpark_connect/utils/udf_helper.py +1 -0
  24. snowflake/snowpark_connect/utils/udtf_helper.py +3 -0
  25. snowflake/snowpark_connect/version.py +1 -1
  26. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/METADATA +2 -2
  27. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/RECORD +35 -38
  28. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +0 -16
  29. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +0 -1281
  30. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +0 -203
  31. snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +0 -202
  32. {snowpark_connect-0.33.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-connect +0 -0
  33. {snowpark_connect-0.33.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-session +0 -0
  34. {snowpark_connect-0.33.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-submit +0 -0
  35. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/WHEEL +0 -0
  36. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE-binary +0 -0
  37. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
  38. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/NOTICE-binary +0 -0
  39. {snowpark_connect-0.33.0.dist-info → snowpark_connect-1.0.0.dist-info}/top_level.txt +0 -0
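
The expanded diff below is for snowflake/snowpark_connect/relation/map_join.py, where map_join is rewritten to dispatch on the shape of the incoming join relation instead of one long if/elif chain. A minimal PySpark sketch of the three shapes it distinguishes (the DataFrames left and right are assumed for illustration, not taken from the diff):

left.join(right, on=left["id"] == right["id"], how="inner")  # explicit join condition
left.join(right, on=["id"], how="left")                       # USING-style column list
left.crossJoin(right)                                          # no condition at all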
@@ -2,6 +2,7 @@
  # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
  #
  import dataclasses
+ from collections.abc import Callable
  from enum import Enum
  from functools import reduce
  from typing import Optional
@@ -11,12 +12,12 @@ from pyspark.errors import AnalysisException

  import snowflake.snowpark.functions as snowpark_fn
  from snowflake import snowpark
+ from snowflake.snowpark import DataFrame
  from snowflake.snowpark.types import StructField, StructType
  from snowflake.snowpark_connect.column_name_handler import (
      JoinColumnNameMap,
      make_unique_snowpark_name,
  )
- from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
  from snowflake.snowpark_connect.config import global_config
  from snowflake.snowpark_connect.constants import COLUMN_METADATA_COLLISION_KEY
  from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
@@ -60,9 +61,7 @@ class JoinInfo:
      join_type: str
      condition_type: ConditionType
      join_columns: Optional[list[str]]
-
-     def has_join_condition(self) -> bool:
-         return self.condition_type == ConditionType.JOIN_CONDITION
+     just_left_columns: bool

      def is_using_columns(self):
          return self.condition_type == ConditionType.USING_COLUMNS
@@ -77,196 +76,26 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
      left_container = filter_metadata_columns(left_container)
      right_container = filter_metadata_columns(right_container)

-     left_input: snowpark.DataFrame = left_container.dataframe
-     right_input: snowpark.DataFrame = right_container.dataframe
+     # if there are any conflicting snowpark columns, this is the time to rename them
+     left_container, right_container = _disambiguate_snowpark_columns(
+         left_container, right_container, rel
+     )

      join_info = _get_join_info(rel, left_container, right_container)
-     join_type = join_info.join_type
-
-     if join_info.has_join_condition():
-         left_columns = list(left_container.column_map.spark_to_col.keys())
-         right_columns = list(right_container.column_map.spark_to_col.keys())
-
-         # All PySpark join types are in the format of JOIN_TYPE_XXX.
-         # We remove the first 10 characters (JOIN_TYPE_) and replace all underscores with spaces to match the exception.
-         pyspark_join_type = relation_proto.Join.JoinType.Name(rel.join.join_type)[
-             10:
-         ].replace("_", " ")
-         with push_sql_scope(), push_evaluating_join_condition(
-             pyspark_join_type, left_columns, right_columns
-         ):
-             if left_container.alias is not None:
-                 set_sql_plan_name(left_container.alias, rel.join.left.common.plan_id)
-             if right_container.alias is not None:
-                 set_sql_plan_name(right_container.alias, rel.join.right.common.plan_id)
-             _, join_expression = map_single_column_expression(
-                 rel.join.join_condition,
-                 column_mapping=JoinColumnNameMap(
-                     left_container.column_map,
-                     right_container.column_map,
-                 ),
-                 typer=JoinExpressionTyper(left_input, right_input),
-             )
-         result: snowpark.DataFrame = left_input.join(
-             right=right_input,
-             on=join_expression.col,
-             how="inner" if join_info.join_type == "cross" else join_info.join_type,
-             lsuffix="_left",
-             rsuffix="_right",
-         )
-     elif join_info.is_using_columns():
-         # TODO: disambiguate snowpark columns for all join condition types
-         # disambiguation temporarily done only for using_columns/natural joins to reduce changes
-         left_container, right_container = _disambiguate_snowpark_columns(
-             left_container, right_container
-         )
-         left_input = left_container.dataframe
-         right_input = right_container.dataframe
-
-         join_columns = join_info.join_columns

-         def _validate_using_column(
-             column: str, container: DataFrameContainer, side: str
-         ) -> None:
-             if (
-                 container.column_map.get_snowpark_column_name_from_spark_column_name(
-                     column, allow_non_exists=True, return_first=True
-                 )
-                 is None
-             ):
-                 exception = AnalysisException(
-                     USING_COLUMN_NOT_FOUND_ERROR.format(
-                         column, side, container.column_map.get_spark_columns()
-                     )
-                 )
-                 attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
-                 raise exception
-
-         for col in join_columns:
-             _validate_using_column(col, left_container, "left")
-             _validate_using_column(col, right_container, "right")
-
-         # We cannot assume that Snowpark will have the same names for left and right columns,
-         # so we convert ["a", "b"] into (left["a"] == right["a"] & left["b"] == right["b"]),
-         # then drop right["a"] and right["b"].
-         snowpark_using_columns = [
-             (
-                 left_input[
-                     left_container.column_map.get_snowpark_column_name_from_spark_column_name(
-                         spark_name, return_first=True
-                     )
-                 ],
-                 right_input[
-                     right_container.column_map.get_snowpark_column_name_from_spark_column_name(
-                         spark_name, return_first=True
-                     )
-                 ],
+     match join_info.condition_type:
+         case ConditionType.JOIN_CONDITION:
+             result_container = _join_using_condition(
+                 left_container, right_container, join_info, rel
              )
-             for spark_name in join_columns
-         ]
-         joined_df = left_input.join(
-             right=right_input,
-             on=reduce(
-                 snowpark.Column.__and__,
-                 (left == right for left, right in snowpark_using_columns),
-             ),
-             how=join_type,
-         )
-         # For outer joins, we need to preserve join keys from both sides using COALESCE
-         if join_type == "full_outer":
-             coalesced_columns = []
-             columns_to_drop = []
-             for i, (left_col, right_col) in enumerate(snowpark_using_columns):
-                 # Use the original user-specified column name to preserve case sensitivity
-                 original_column_name = rel.join.using_columns[i]
-                 coalesced_col = snowpark_fn.coalesce(left_col, right_col).alias(
-                     original_column_name
-                 )
-                 coalesced_columns.append(coalesced_col)
-                 columns_to_drop.extend([left_col, right_col])
-
-             other_columns = [
-                 snowpark_fn.col(col_name)
-                 for col_name in joined_df.columns
-                 if col_name not in [col.getName() for col in columns_to_drop]
-             ]
-             result = joined_df.select(coalesced_columns + other_columns)
-         else:
-             result = joined_df.drop(*(right for _, right in snowpark_using_columns))
-     else:
-         if join_type != "cross" and not global_config.spark_sql_crossJoin_enabled:
-             exception = SparkException.implicit_cartesian_product("inner")
-             attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
-             raise exception
-         # For outer joins without a condition, we need to use a TRUE condition
-         # to match Spark's behavior.
-         if join_type in ["left", "right", "full_outer"]:
-             result: snowpark.DataFrame = left_input.join(
-                 right=right_input,
-                 on=snowpark_fn.lit(True),
-                 how=join_type,
-             )
-         else:
-             result: snowpark.DataFrame = left_input.join(
-                 right=right_input,
-                 how=join_type,
-             )
-
-     if join_type in ["leftanti", "leftsemi"]:
-         # Join types that only return columns from the left side:
-         # - LEFT SEMI JOIN: Returns left rows that have matches in right table (no right columns)
-         # - LEFT ANTI JOIN: Returns left rows that have NO matches in right table (no right columns)
-         # Both preserve only the columns from the left DataFrame without adding any columns from the right.
-         spark_cols_after_join: list[str] = left_container.column_map.get_spark_columns()
-         qualifiers = left_container.column_map.get_qualifiers()
-     else:
-         if not join_info.is_using_columns():
-             spark_cols_after_join: list[str] = (
-                 left_container.column_map.get_spark_columns()
-                 + right_container.column_map.get_spark_columns()
+         case ConditionType.USING_COLUMNS:
+             result_container = _join_using_columns(
+                 left_container, right_container, join_info
              )
-             qualifiers: list[set[ColumnQualifier]] = (
-                 left_container.column_map.get_qualifiers()
-                 + right_container.column_map.get_qualifiers()
+         case _:
+             result_container = _join_unconditionally(
+                 left_container, right_container, join_info
              )
-         else:
-             # get columns after join
-             joined_columns = left_container.column_map.get_columns_after_join(
-                 right_container.column_map, join_info.join_columns
-             )
-             spark_cols_after_join: list[str] = [c.spark_name for c in joined_columns]
-             qualifiers: list[set[ColumnQualifier]] = [
-                 c.qualifiers for c in joined_columns
-             ]
-
-     column_metadata = dict(left_container.column_map.column_metadata or {})
-     if right_container.column_map.column_metadata:
-         for key, value in right_container.column_map.column_metadata.items():
-             if key not in column_metadata:
-                 column_metadata[key] = value
-             else:
-                 # In case of collision, use snowpark's column's expr_id as prefix.
-                 # this is a temporary solution until SNOW-1926440 is resolved.
-                 try:
-                     snowpark_name = right_container.column_map.get_snowpark_column_name_from_spark_column_name(
-                         key
-                     )
-                     expr_id = right_input[snowpark_name]._expression.expr_id
-                     updated_key = COLUMN_METADATA_COLLISION_KEY.format(
-                         expr_id=expr_id, key=snowpark_name
-                     )
-                     column_metadata[updated_key] = value
-                 except Exception:
-                     # ignore any errors that happens while fetching the metadata
-                     pass
-
-     result_container = DataFrameContainer.create_with_column_mapping(
-         dataframe=result,
-         spark_column_names=spark_cols_after_join,
-         snowpark_column_names=result.columns,
-         column_metadata=column_metadata,
-         column_qualifiers=qualifiers,
-     )

      # Fix for USING join column references with different plan IDs
      # After a USING join, references to the right dataframe's columns should resolve
@@ -283,47 +112,266 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
      # since both columns are replaced with a coalesced column
      if (
          join_info.is_using_columns()
-         and join_type == "full_outer"
+         and join_info.join_type == "full_outer"
          and rel.join.left.HasField("common")
          and rel.join.left.common.HasField("plan_id")
      ):
          left_plan_id = rel.join.left.common.plan_id
          set_plan_id_map(left_plan_id, result_container)

-     if join_info.is_using_columns():
-         # When join 'using_columns', the 'join columns' should go first in result DF.
-         # we're only shifting left side columns, since we dropped the right-side ones
-         idxs_to_shift = left_container.column_map.get_column_indexes(
-             join_info.join_columns
+     return result_container
+
+
+ def _join_unconditionally(
+     left_container: DataFrameContainer,
+     right_container: DataFrameContainer,
+     info: JoinInfo,
+ ) -> DataFrameContainer:
+     if info.join_type != "cross" and not global_config.spark_sql_crossJoin_enabled:
+         exception = SparkException.implicit_cartesian_product("inner")
+         attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+         raise exception
+
+     left_input = left_container.dataframe
+     right_input = right_container.dataframe
+     join_type = info.join_type
+
+     # For outer joins without a condition, we need to use a TRUE condition
+     # to match Spark's behavior.
+     result: snowpark.DataFrame = left_input.join(
+         right=right_input,
+         on=snowpark_fn.lit(True)
+         if join_type in ["left", "right", "full_outer"]
+         else None,
+         how=join_type,
+     )
+
+     columns = left_container.column_map.columns + right_container.column_map.columns
+     column_metadata = _combine_metadata(left_container, right_container)
+
+     if info.just_left_columns:
+         columns = left_container.column_map.columns
+         column_metadata = left_container.column_map.column_metadata
+         result = result.select(*left_container.column_map.get_snowpark_columns())
+
+     snowpark_columns = [c.snowpark_name for c in columns]
+
+     return DataFrameContainer.create_with_column_mapping(
+         dataframe=result,
+         spark_column_names=[c.spark_name for c in columns],
+         snowpark_column_names=snowpark_columns,
+         column_metadata=column_metadata,
+         column_qualifiers=[c.qualifiers for c in columns],
+         cached_schema_getter=_build_joined_schema(
+             snowpark_columns, left_input, right_input
+         ),
+     )
+
+
+ def _join_using_columns(
+     left_container: DataFrameContainer,
+     right_container: DataFrameContainer,
+     info: JoinInfo,
+ ) -> DataFrameContainer:
+     join_columns = info.join_columns
+
+     def _validate_using_column(
+         column: str, container: DataFrameContainer, side: str
+     ) -> None:
+         if (
+             container.column_map.get_snowpark_column_name_from_spark_column_name(
+                 column, allow_non_exists=True, return_first=True
+             )
+             is None
+         ):
+             exception = AnalysisException(
+                 USING_COLUMN_NOT_FOUND_ERROR.format(
+                     column, side, container.column_map.get_spark_columns()
+                 )
+             )
+             attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+             raise exception
+
+     for col in join_columns:
+         _validate_using_column(col, left_container, "left")
+         _validate_using_column(col, right_container, "right")
+
+     left_input = left_container.dataframe
+     right_input = right_container.dataframe
+
+     # The inputs will have different snowpark names for the same spark name,
+     # so we convert ["a", "b"] into (left["a"] == right["a"] & left["b"] == right["b"]),
+     # then drop right["a"] and right["b"].
+     snowpark_using_columns = [
+         (
+             snowpark_fn.col(
+                 left_container.column_map.get_snowpark_column_name_from_spark_column_name(
+                     spark_name, return_first=True
+                 )
+             ),
+             snowpark_fn.col(
+                 right_container.column_map.get_snowpark_column_name_from_spark_column_name(
+                     spark_name, return_first=True
+                 )
+             ),
          )
+         for spark_name in join_columns
+     ]
+
+     # this is a condition join, so it will contain left + right columns
+     # we need to postprocess this later to have a correct projection
+     joined_df = left_input.join(
+         right=right_input,
+         on=reduce(
+             snowpark.Column.__and__,
+             (left == right for left, right in snowpark_using_columns),
+         ),
+         how=info.join_type,
+     )
+
+     # figure out default column ordering after the join
+     columns = left_container.column_map.get_columns_after_join(
+         right_container.column_map, join_columns, info.join_type
+     )
+
+     # For outer joins, we need to preserve join keys from both sides using COALESCE
+     if info.join_type == "full_outer":
+         coalesced_columns = []
+         coalesced_column_names = []
+         for i, (left_col, right_col) in enumerate(snowpark_using_columns):
+             # spark uses the left side spark name
+             spark_name = columns[i].spark_name
+             new_snowpark_name = make_unique_snowpark_name(spark_name)
+             coalesced_col = snowpark_fn.coalesce(left_col, right_col).alias(
+                 new_snowpark_name
+             )
+             coalesced_columns.append(coalesced_col)
+             coalesced_column_names.append((spark_name, new_snowpark_name))
+
+         # join columns need to be replaced, so we need the original names for schema lookup later
+         snowpark_names_for_schema_lookup = [c.snowpark_name for c in columns]
+
+         # we need to use the coalesced columns instead of the left-side join columns
+         columns = columns[len(join_columns) :]

-         def reorder(lst: list) -> list:
-             to_move = [lst[i] for i in idxs_to_shift]
-             remaining = [el for i, el in enumerate(lst) if i not in idxs_to_shift]
-             return to_move + remaining
+         non_join_columns = [snowpark_fn.col(c.snowpark_name) for c in columns]
+         result = joined_df.select(coalesced_columns + non_join_columns)
+
+         spark_names = [spark_name for spark_name, _ in coalesced_column_names] + [
+             c.spark_name for c in columns
+         ]
+         snowpark_names = [
+             snowpark_name for _, snowpark_name in coalesced_column_names
+         ] + [c.snowpark_name for c in columns]
+         qualifiers = ([set()] * len(join_columns)) + [c.qualifiers for c in columns]

-         # Create reordered DataFrame
-         reordered_df = result_container.dataframe.select(
-             [snowpark_fn.col(c) for c in reorder(result_container.dataframe.columns)]
+         return DataFrameContainer.create_with_column_mapping(
+             dataframe=result,
+             spark_column_names=spark_names,
+             snowpark_column_names=snowpark_names,
+             column_metadata=_combine_metadata(left_container, right_container),
+             column_qualifiers=qualifiers,
+             cached_schema_getter=_build_joined_schema(
+                 snowpark_names_for_schema_lookup,
+                 left_input,
+                 right_input,
+                 snowpark_names,
+             ),
          )

-         # Create new container with reordered metadata
-         original_df = result_container.dataframe
+     if info.just_left_columns:
+         # we just need the left columns
+         columns = columns[: len(left_container.column_map.columns)]
+         snowpark_columns = [c.snowpark_name for c in columns]
+         result = joined_df.select(*snowpark_columns)
+
          return DataFrameContainer.create_with_column_mapping(
-             dataframe=reordered_df,
-             spark_column_names=reorder(result_container.column_map.get_spark_columns()),
-             snowpark_column_names=reorder(
-                 result_container.column_map.get_snowpark_columns()
+             dataframe=result,
+             spark_column_names=[c.spark_name for c in columns],
+             snowpark_column_names=snowpark_columns,
+             column_metadata=left_container.column_map.column_metadata,
+             column_qualifiers=[c.qualifiers for c in columns],
+             cached_schema_getter=_build_joined_schema(
+                 snowpark_columns, left_input, right_input
              ),
-             column_metadata=column_metadata,
-             column_qualifiers=reorder(qualifiers),
-             table_name=result_container.table_name,
-             cached_schema_getter=lambda: snowpark.types.StructType(
-                 reorder(original_df.schema.fields)
+         )
+
+     snowpark_columns = [c.snowpark_name for c in columns]
+     result = joined_df.select(*snowpark_columns)
+     return DataFrameContainer.create_with_column_mapping(
+         dataframe=result,
+         spark_column_names=[c.spark_name for c in columns],
+         snowpark_column_names=snowpark_columns,
+         column_metadata=_combine_metadata(left_container, right_container),
+         column_qualifiers=[c.qualifiers for c in columns],
+         cached_schema_getter=_build_joined_schema(
+             snowpark_columns, left_input, right_input
+         ),
+     )
+
+
+ def _join_using_condition(
+     left_container: DataFrameContainer,
+     right_container: DataFrameContainer,
+     info: JoinInfo,
+     rel: relation_proto.Relation,
+ ) -> DataFrameContainer:
+     left_columns = left_container.column_map.get_spark_columns()
+     right_columns = right_container.column_map.get_spark_columns()
+
+     left_input = left_container.dataframe
+     right_input = right_container.dataframe
+
+     # All PySpark join types are in the format of JOIN_TYPE_XXX.
+     # We remove the first 10 characters (JOIN_TYPE_) and replace all underscores with spaces to match the exception.
+     pyspark_join_type = relation_proto.Join.JoinType.Name(rel.join.join_type)[
+         10:
+     ].replace("_", " ")
+     with push_sql_scope(), push_evaluating_join_condition(
+         pyspark_join_type, left_columns, right_columns
+     ):
+         if left_container.alias is not None:
+             set_sql_plan_name(left_container.alias, rel.join.left.common.plan_id)
+         if right_container.alias is not None:
+             set_sql_plan_name(right_container.alias, rel.join.right.common.plan_id)
+         # resolve join condition expression
+         _, join_expression = map_single_column_expression(
+             rel.join.join_condition,
+             column_mapping=JoinColumnNameMap(
+                 left_container.column_map,
+                 right_container.column_map,
              ),
+             typer=JoinExpressionTyper(left_input, right_input),
          )

-     return result_container
+     result: snowpark.DataFrame = left_input.join(
+         right=right_input,
+         on=join_expression.col,
+         how=info.join_type,
+     )
+
+     # column order is already correct, so we just take the left + right side list
+     columns = left_container.column_map.columns + right_container.column_map.columns
+     column_metadata = _combine_metadata(left_container, right_container)
+
+     if info.just_left_columns:
+         # we just need left-side columns
+         columns = left_container.column_map.columns
+         result = result.select(*[c.snowpark_name for c in columns])
+         column_metadata = left_container.column_map.column_metadata
+
+     snowpark_columns = [c.snowpark_name for c in columns]
+
+     return DataFrameContainer.create_with_column_mapping(
+         dataframe=result,
+         spark_column_names=[c.spark_name for c in columns],
+         snowpark_column_names=snowpark_columns,
+         column_metadata=column_metadata,
+         column_qualifiers=[c.qualifiers for c in columns],
+         cached_schema_getter=_build_joined_schema(
+             snowpark_columns, left_input, right_input
+         ),
+     )


  def _get_join_info(
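
As a point of reference for the full_outer branch of _join_using_columns above, a small PySpark illustration of the Spark behavior it reproduces (hypothetical DataFrames, assuming an active SparkSession named spark): the USING column appears once, first, and acts like a COALESCE of both sides.

left = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "l_val"])
right = spark.createDataFrame([(2, "x"), (3, "y")], ["id", "r_val"])
joined = left.join(right, on=["id"], how="full")
# joined.columns == ["id", "l_val", "r_val"]; "id" behaves like COALESCE(left.id, right.id)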
@@ -372,6 +420,10 @@ def _get_join_info(
      has_join_condition = rel.join.HasField("join_condition")
      is_using_columns = bool(join_columns)

+     if join_type == "cross" and has_join_condition:
+         # if the user provided any condition, it's no longer a cross join
+         join_type = "inner"
+
      if has_join_condition:
          assert not is_using_columns

@@ -381,11 +433,17 @@
      elif is_using_columns:
          condition_type = ConditionType.USING_COLUMNS

-     return JoinInfo(join_type, condition_type, join_columns)
+     # Join types that only return columns from the left side:
+     # - LEFT SEMI JOIN: Returns left rows that have matches in right table (no right columns)
+     # - LEFT ANTI JOIN: Returns left rows that have NO matches in right table (no right columns)
+     # Both preserve only the columns from the left DataFrame without adding any columns from the right.
+     just_left_columns = join_type in ["leftanti", "leftsemi"]
+
+     return JoinInfo(join_type, condition_type, join_columns, just_left_columns)


  def _disambiguate_snowpark_columns(
-     left: DataFrameContainer, right: DataFrameContainer
+     left: DataFrameContainer, right: DataFrameContainer, rel: relation_proto.Relation
  ) -> tuple[DataFrameContainer, DataFrameContainer]:
      conflicting_snowpark_columns = left.column_map.get_conflicting_snowpark_columns(
          right.column_map
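
Two of the rules added to _get_join_info above mirror observable PySpark behavior; a hedged sketch with hypothetical DataFrames left and right:

left.join(right, on=left["id"] == right["id"], how="cross")  # with a condition attached, this behaves like an inner join
left.crossJoin(right)                                          # a true Cartesian product
left.join(right, on=["id"], how="left_semi").columns           # only the left side's columns come back
left.join(right, on=["id"], how="left_anti").columns           # only the left side's columns come back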
@@ -394,14 +452,24 @@ def _disambiguate_snowpark_columns(
      if not conflicting_snowpark_columns:
          return left, right

-     # rename and create new containers
-     return _disambiguate_container(
-         left, conflicting_snowpark_columns
-     ), _disambiguate_container(right, conflicting_snowpark_columns)
+     left_plan = rel.join.left.common.plan_id
+     right_plan = rel.join.right.common.plan_id
+
+     if left_plan == right_plan:
+         # don't overwrite plan_id map for self joins
+         right_plan = None
+
+     # rename and create new right container
+     # TODO: rename both sides after SNOW-2382499
+     return left, _disambiguate_container(
+         right, conflicting_snowpark_columns, right_plan
+     )


  def _disambiguate_container(
-     container: DataFrameContainer, conflicting_snowpark_columns: set[str]
+     container: DataFrameContainer,
+     conflicting_snowpark_columns: set[str],
+     plan_id: Optional[int],
  ) -> DataFrameContainer:
      column_map = container.column_map
      disambiguated_columns = []
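
The plan-id handling above matters mostly for self joins, where both sides come from the same plan and therefore start with identical snowpark column names. A hedged PySpark example of that situation (hypothetical df, assuming an active session and from pyspark.sql import functions as F):

df = spark.createDataFrame([(1, "a")], ["id", "val"])
pairs = df.alias("l").join(df.alias("r"), on=F.col("l.id") == F.col("r.id"))
# both sides expose "id" and "val", so one side has to be renamed internally to keep them apart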
@@ -420,25 +488,87 @@ def _disambiguate_container(

      disambiguated_df = container.dataframe.select(*disambiguated_columns)

-     def _get_new_schema():
-         old_schema = container.dataframe.schema
-         if not old_schema.fields:
-             return StructType([])
-
-         new_fields = []
-         for i, name in enumerate(disambiguated_snowpark_names):
-             f = old_schema.fields[i]
-             new_fields.append(
-                 StructField(name, f.datatype, nullable=f.nullable, _is_column=True)
-             )
-         return StructType(new_fields)
+     def _schema_getter() -> StructType:
+         fields = container.dataframe.schema.fields
+         return StructType(
+             [
+                 StructField(name, fields[i].datatype, fields[i].nullable)
+                 for i, name in enumerate(disambiguated_snowpark_names)
+             ]
+         )

-     return DataFrameContainer.create_with_column_mapping(
+     disambiguated_container = DataFrameContainer.create_with_column_mapping(
          dataframe=disambiguated_df,
          spark_column_names=column_map.get_spark_columns(),
          snowpark_column_names=disambiguated_snowpark_names,
          column_metadata=column_map.column_metadata,
          column_qualifiers=column_map.get_qualifiers(),
          table_name=container.table_name,
-         cached_schema_getter=_get_new_schema,
+         cached_schema_getter=_schema_getter,
      )
+
+     # since we just renamed some snowpark columns, we need to update the dataframe container for the given plan_id
+     # TODO: is there a better way to do this?
+     if plan_id is not None:
+         set_plan_id_map(plan_id, disambiguated_container)
+
+     return disambiguated_container
+
+
+ def _combine_metadata(
+     left_container: DataFrameContainer, right_container: DataFrameContainer
+ ) -> dict:
+     column_metadata = dict(left_container.column_map.column_metadata or {})
+     if right_container.column_map.column_metadata:
+         for key, value in right_container.column_map.column_metadata.items():
+             if key not in column_metadata:
+                 column_metadata[key] = value
+             else:
+                 # In case of collision, use snowpark's column's expr_id as prefix.
+                 # this is a temporary solution until SNOW-1926440 is resolved.
+                 try:
+                     snowpark_name = right_container.column_map.get_snowpark_column_name_from_spark_column_name(
+                         key
+                     )
+                     expr_id = right_container.dataframe[
+                         snowpark_name
+                     ]._expression.expr_id
+                     updated_key = COLUMN_METADATA_COLLISION_KEY.format(
+                         expr_id=expr_id, key=snowpark_name
+                     )
+                     column_metadata[updated_key] = value
+                 except Exception:
+                     # ignore any errors that happens while fetching the metadata
+                     pass
+     return column_metadata
+
+
+ def _build_joined_schema(
+     snowpark_columns: list[str],
+     left_input: DataFrame,
+     right_input: DataFrame,
+     target_snowpark_columns: Optional[list[str]] = None,
+ ) -> Callable[[], StructType]:
+     """
+     Builds a lazy schema for the joined dataframe, based on the given snowpark_columns and input dataframes.
+     In case of full outer joins, we need a separate target_snowpark_columns, since join columns will have different
+     names in the output than in any input.
+     """
+
+     def _schema_getter() -> StructType:
+         all_fields = left_input.schema.fields + right_input.schema.fields
+         fields: dict[str, StructField] = {f.name: f for f in all_fields}
+         target_names = target_snowpark_columns or snowpark_columns
+
+         assert len(snowpark_columns) == len(target_names)
+
+         return StructType(
+             [
+                 StructField(
+                     target_names[i], fields[name].datatype, fields[name].nullable
+                 )
+                 for i, name in enumerate(snowpark_columns)
+             ]
+         )
+
+     return _schema_getter
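
_build_joined_schema returns a callable rather than a ready StructType, so the joined schema is only stitched together from the two input schemas if something actually asks for it. A hypothetical usage sketch (left_df, right_df and the column names are assumptions, not from the diff):

get_schema = _build_joined_schema(["L_ID", "R_VAL"], left_df, right_df)
schema = get_schema()  # only now are fields looked up in left_df.schema / right_df.schema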