snowpark-connect 0.32.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/column_name_handler.py +92 -27
- snowflake/snowpark_connect/column_qualifier.py +0 -4
- snowflake/snowpark_connect/expression/hybrid_column_map.py +5 -4
- snowflake/snowpark_connect/expression/map_sql_expression.py +12 -4
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +58 -21
- snowflake/snowpark_connect/expression/map_unresolved_function.py +62 -27
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +1281 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +203 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +202 -0
- snowflake/snowpark_connect/relation/map_aggregate.py +2 -4
- snowflake/snowpark_connect/relation/map_column_ops.py +5 -0
- snowflake/snowpark_connect/relation/map_join.py +218 -146
- snowflake/snowpark_connect/relation/map_row_ops.py +136 -54
- snowflake/snowpark_connect/relation/map_sql.py +102 -16
- snowflake/snowpark_connect/relation/read/map_read_json.py +87 -2
- snowflake/snowpark_connect/relation/utils.py +46 -0
- snowflake/snowpark_connect/relation/write/map_write.py +186 -275
- snowflake/snowpark_connect/resources_initializer.py +25 -13
- snowflake/snowpark_connect/server.py +9 -24
- snowflake/snowpark_connect/type_mapping.py +2 -0
- snowflake/snowpark_connect/typed_column.py +2 -2
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +8 -1
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-0.33.0.dist-info}/METADATA +3 -1
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-0.33.0.dist-info}/RECORD +35 -93
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-0.33.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-0.33.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-0.33.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-0.33.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-0.33.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-0.33.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-0.33.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-0.33.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/map_join.py

@@ -1,15 +1,21 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
-
+import dataclasses
+from enum import Enum
 from functools import reduce
+from typing import Optional

 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 from pyspark.errors import AnalysisException

 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
-from snowflake.
+from snowflake.snowpark.types import StructField, StructType
+from snowflake.snowpark_connect.column_name_handler import (
+    JoinColumnNameMap,
+    make_unique_snowpark_name,
+)
 from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.constants import COLUMN_METADATA_COLLISION_KEY
@@ -43,6 +49,25 @@ from snowflake.snowpark_connect.utils.telemetry import (
 USING_COLUMN_NOT_FOUND_ERROR = "[UNRESOLVED_USING_COLUMN_FOR_JOIN] USING column `{0}` not found on the {1} side of the join. The {1}-side columns: {2}"


+class ConditionType(Enum):
+    USING_COLUMNS = 1
+    JOIN_CONDITION = 2
+    NO_CONDITION = 3
+
+
+@dataclasses.dataclass
+class JoinInfo:
+    join_type: str
+    condition_type: ConditionType
+    join_columns: Optional[list[str]]
+
+    def has_join_condition(self) -> bool:
+        return self.condition_type == ConditionType.JOIN_CONDITION
+
+    def is_using_columns(self):
+        return self.condition_type == ConditionType.USING_COLUMNS
+
+
 def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
     left_container: DataFrameContainer = map_relation(rel.join.left)
     right_container: DataFrameContainer = map_relation(rel.join.right)
@@ -54,48 +79,11 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:

     left_input: snowpark.DataFrame = left_container.dataframe
     right_input: snowpark.DataFrame = right_container.dataframe
-    is_natural_join = rel.join.join_type >= NATURAL_JOIN_TYPE_BASE
-    using_columns = rel.join.using_columns
-    if is_natural_join:
-        rel.join.join_type -= NATURAL_JOIN_TYPE_BASE
-        left_spark_columns = left_container.column_map.get_spark_columns()
-        right_spark_columns = right_container.column_map.get_spark_columns()
-        common_spark_columns = [
-            x for x in left_spark_columns if x in right_spark_columns
-        ]
-        using_columns = common_spark_columns
-
-    match rel.join.join_type:
-        case relation_proto.Join.JOIN_TYPE_UNSPECIFIED:
-            # TODO: Understand what UNSPECIFIED Join type is
-            exception = SnowparkConnectNotImplementedError("Unspecified Join Type")
-            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
-            raise exception
-        case relation_proto.Join.JOIN_TYPE_INNER:
-            join_type = "inner"
-        case relation_proto.Join.JOIN_TYPE_FULL_OUTER:
-            join_type = "full_outer"
-        case relation_proto.Join.JOIN_TYPE_LEFT_OUTER:
-            join_type = "left"
-        case relation_proto.Join.JOIN_TYPE_RIGHT_OUTER:
-            join_type = "right"
-        case relation_proto.Join.JOIN_TYPE_LEFT_ANTI:
-            join_type = "leftanti"
-        case relation_proto.Join.JOIN_TYPE_LEFT_SEMI:
-            join_type = "leftsemi"
-        case relation_proto.Join.JOIN_TYPE_CROSS:
-            join_type = "cross"
-        case other:
-            exception = SnowparkConnectNotImplementedError(f"Other Join Type: {other}")
-            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
-            raise exception
-
-    # This handles case sensitivity for using_columns
-    case_corrected_right_columns: list[str] = []

-
-
+    join_info = _get_join_info(rel, left_container, right_container)
+    join_type = join_info.join_type

+    if join_info.has_join_condition():
         left_columns = list(left_container.column_map.spark_to_col.keys())
         right_columns = list(right_container.column_map.spark_to_col.keys())

@@ -122,72 +110,42 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
         result: snowpark.DataFrame = left_input.join(
             right=right_input,
             on=join_expression.col,
-            how=join_type,
+            how="inner" if join_info.join_type == "cross" else join_info.join_type,
             lsuffix="_left",
             rsuffix="_right",
         )
-    elif
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        ),
-        "left",
-        left_container.column_map.get_spark_columns(),
+    elif join_info.is_using_columns():
+        # TODO: disambiguate snowpark columns for all join condition types
+        # disambiguation temporarily done only for using_columns/natural joins to reduce changes
+        left_container, right_container = _disambiguate_snowpark_columns(
+            left_container, right_container
+        )
+        left_input = left_container.dataframe
+        right_input = right_container.dataframe
+
+        join_columns = join_info.join_columns
+
+        def _validate_using_column(
+            column: str, container: DataFrameContainer, side: str
+        ) -> None:
+            if (
+                container.column_map.get_snowpark_column_name_from_spark_column_name(
+                    column, allow_non_exists=True, return_first=True
                 )
-
-
-
-
-
-
-            )
-            is None
-            for c in using_columns
-        ):
-            exception = AnalysisException(
-                USING_COLUMN_NOT_FOUND_ERROR.format(
-                    next(
-                        c
-                        for c in using_columns
-                        if right_container.column_map.get_snowpark_column_name_from_spark_column_name(
-                            c, allow_non_exists=True, return_first=True
-                        )
-                        is None
-                    ),
-                    "right",
-                    right_container.column_map.get_spark_columns(),
+                is None
+            ):
+                exception = AnalysisException(
+                    USING_COLUMN_NOT_FOUND_ERROR.format(
+                        column, side, container.column_map.get_spark_columns()
+                    )
                 )
-
-
-
+                attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+                raise exception
+
+        for col in join_columns:
+            _validate_using_column(col, left_container, "left")
+            _validate_using_column(col, right_container, "right")

-        # Round trip the using columns through the column map to get the correct names
-        # in order to support case sensitivity.
-        # TODO: case_corrected_left_columns / case_corrected_right_columns may no longer be required as Snowpark dataframe preserves the column casing now.
-        case_corrected_left_columns = left_container.column_map.get_spark_column_names_from_snowpark_column_names(
-            left_container.column_map.get_snowpark_column_names_from_spark_column_names(
-                list(using_columns), return_first=True
-            )
-        )
-        case_corrected_right_columns = right_container.column_map.get_spark_column_names_from_snowpark_column_names(
-            right_container.column_map.get_snowpark_column_names_from_spark_column_names(
-                list(using_columns), return_first=True
-            )
-        )
-        using_columns = zip(case_corrected_left_columns, case_corrected_right_columns)
         # We cannot assume that Snowpark will have the same names for left and right columns,
         # so we convert ["a", "b"] into (left["a"] == right["a"] & left["b"] == right["b"]),
         # then drop right["a"] and right["b"].
@@ -195,16 +153,16 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
             (
                 left_input[
                     left_container.column_map.get_snowpark_column_name_from_spark_column_name(
-
+                        spark_name, return_first=True
                     )
                 ],
                 right_input[
                     right_container.column_map.get_snowpark_column_name_from_spark_column_name(
-
+                        spark_name, return_first=True
                    )
                ],
            )
-            for
+            for spark_name in join_columns
        ]
        joined_df = left_input.join(
            right=right_input,
@@ -240,10 +198,19 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
            exception = SparkException.implicit_cartesian_product("inner")
            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
            raise exception
-
-
-
-
+        # For outer joins without a condition, we need to use a TRUE condition
+        # to match Spark's behavior.
+        if join_type in ["left", "right", "full_outer"]:
+            result: snowpark.DataFrame = left_input.join(
+                right=right_input,
+                on=snowpark_fn.lit(True),
+                how=join_type,
+            )
+        else:
+            result: snowpark.DataFrame = left_input.join(
+                right=right_input,
+                how=join_type,
+            )

    if join_type in ["leftanti", "leftsemi"]:
        # Join types that only return columns from the left side:
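The hunk above handles joins that arrive with no join condition at all: outer joins are given an explicit always-true predicate so that Snowpark keeps Spark's "every row pairs with every row" semantics. A minimal sketch of the resulting Snowpark calls, assuming two already-mapped Snowpark DataFrames named left_input and right_input (illustrative only, not code from the package):

    import snowflake.snowpark.functions as snowpark_fn

    # Outer join with no user-supplied condition: join on a constant TRUE
    # predicate so every left row matches every right row, as Spark does.
    result = left_input.join(right_input, on=snowpark_fn.lit(True), how="left")

    # Non-outer joins with no condition are passed through without an ON clause.
    result = left_input.join(right_input, how="inner")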
@@ -253,39 +220,26 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
        spark_cols_after_join: list[str] = left_container.column_map.get_spark_columns()
        qualifiers = left_container.column_map.get_qualifiers()
    else:
-
-
-
-
-            spark_col
-            for i, spark_col in enumerate(
-                right_container.column_map.get_spark_columns()
+        if not join_info.is_using_columns():
+            spark_cols_after_join: list[str] = (
+                left_container.column_map.get_spark_columns()
+                + right_container.column_map.get_spark_columns()
            )
-
-
-
-                :i
-            ]  # this is to make sure we only remove the column once
-        ]
-
-        qualifiers: list[set[ColumnQualifier]] = list(
-            left_container.column_map.get_qualifiers()
-        ) + [
-            {right_container.column_map.get_qualifier_for_spark_column(spark_col)}
-            for i, spark_col in enumerate(
-                right_container.column_map.get_spark_columns()
+            qualifiers: list[set[ColumnQualifier]] = (
+                left_container.column_map.get_qualifiers()
+                + right_container.column_map.get_qualifiers()
            )
-
-
-
-
-
-
-
-
-
-        column_metadata.update(left_container.column_map.column_metadata)
+        else:
+            # get columns after join
+            joined_columns = left_container.column_map.get_columns_after_join(
+                right_container.column_map, join_info.join_columns
+            )
+            spark_cols_after_join: list[str] = [c.spark_name for c in joined_columns]
+            qualifiers: list[set[ColumnQualifier]] = [
+                c.qualifiers for c in joined_columns
+            ]

+    column_metadata = dict(left_container.column_map.column_metadata or {})
    if right_container.column_map.column_metadata:
        for key, value in right_container.column_map.column_metadata.items():
            if key not in column_metadata:
@@ -318,7 +272,7 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
    # After a USING join, references to the right dataframe's columns should resolve
    # to the result dataframe that contains the merged columns
    if (
-
+        join_info.is_using_columns()
        and rel.join.right.HasField("common")
        and rel.join.right.common.HasField("plan_id")
    ):
@@ -328,7 +282,7 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
    # For FULL OUTER joins, we also need to map the left dataframe's plan_id
    # since both columns are replaced with a coalesced column
    if (
-
+        join_info.is_using_columns()
        and join_type == "full_outer"
        and rel.join.left.HasField("common")
        and rel.join.left.common.HasField("plan_id")
@@ -336,12 +290,12 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
        left_plan_id = rel.join.left.common.plan_id
        set_plan_id_map(left_plan_id, result_container)

-    if
+    if join_info.is_using_columns():
        # When join 'using_columns', the 'join columns' should go first in result DF.
-
-
-
-
+        # we're only shifting left side columns, since we dropped the right-side ones
+        idxs_to_shift = left_container.column_map.get_column_indexes(
+            join_info.join_columns
+        )

        def reorder(lst: list) -> list:
            to_move = [lst[i] for i in idxs_to_shift]
@@ -370,3 +324,121 @@ def map_join(rel: relation_proto.Relation) -> DataFrameContainer:
    )

    return result_container
+
+
+def _get_join_info(
+    rel: relation_proto.Relation, left: DataFrameContainer, right: DataFrameContainer
+) -> JoinInfo:
+    """
+    Gathers basic information about the join, and performs basic assertions
+    """
+
+    is_natural_join = rel.join.join_type >= NATURAL_JOIN_TYPE_BASE
+    join_columns = rel.join.using_columns
+    if is_natural_join:
+        rel.join.join_type -= NATURAL_JOIN_TYPE_BASE
+        left_spark_columns = left.column_map.get_spark_columns()
+        right_spark_columns = right.column_map.get_spark_columns()
+        common_spark_columns = [
+            x for x in left_spark_columns if x in right_spark_columns
+        ]
+        join_columns = common_spark_columns
+
+    match rel.join.join_type:
+        case relation_proto.Join.JOIN_TYPE_UNSPECIFIED:
+            # TODO: Understand what UNSPECIFIED Join type is
+            exception = SnowparkConnectNotImplementedError("Unspecified Join Type")
+            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+            raise exception
+        case relation_proto.Join.JOIN_TYPE_INNER:
+            join_type = "inner"
+        case relation_proto.Join.JOIN_TYPE_FULL_OUTER:
+            join_type = "full_outer"
+        case relation_proto.Join.JOIN_TYPE_LEFT_OUTER:
+            join_type = "left"
+        case relation_proto.Join.JOIN_TYPE_RIGHT_OUTER:
+            join_type = "right"
+        case relation_proto.Join.JOIN_TYPE_LEFT_ANTI:
+            join_type = "leftanti"
+        case relation_proto.Join.JOIN_TYPE_LEFT_SEMI:
+            join_type = "leftsemi"
+        case relation_proto.Join.JOIN_TYPE_CROSS:
+            join_type = "cross"
+        case other:
+            exception = SnowparkConnectNotImplementedError(f"Other Join Type: {other}")
+            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+            raise exception
+
+    has_join_condition = rel.join.HasField("join_condition")
+    is_using_columns = bool(join_columns)
+
+    if has_join_condition:
+        assert not is_using_columns
+
+    condition_type = ConditionType.NO_CONDITION
+    if has_join_condition:
+        condition_type = ConditionType.JOIN_CONDITION
+    elif is_using_columns:
+        condition_type = ConditionType.USING_COLUMNS
+
+    return JoinInfo(join_type, condition_type, join_columns)
+
+
+def _disambiguate_snowpark_columns(
+    left: DataFrameContainer, right: DataFrameContainer
+) -> tuple[DataFrameContainer, DataFrameContainer]:
+    conflicting_snowpark_columns = left.column_map.get_conflicting_snowpark_columns(
+        right.column_map
+    )
+
+    if not conflicting_snowpark_columns:
+        return left, right
+
+    # rename and create new containers
+    return _disambiguate_container(
+        left, conflicting_snowpark_columns
+    ), _disambiguate_container(right, conflicting_snowpark_columns)
+
+
+def _disambiguate_container(
+    container: DataFrameContainer, conflicting_snowpark_columns: set[str]
+) -> DataFrameContainer:
+    column_map = container.column_map
+    disambiguated_columns = []
+    disambiguated_snowpark_names = []
+    for c in column_map.columns:
+        if c.snowpark_name in conflicting_snowpark_columns:
+            # alias snowpark column with a new unique name
+            new_name = make_unique_snowpark_name(c.spark_name)
+            disambiguated_snowpark_names.append(new_name)
+            disambiguated_columns.append(
+                snowpark_fn.col(c.snowpark_name).alias(new_name)
+            )
+        else:
+            disambiguated_snowpark_names.append(c.snowpark_name)
+            disambiguated_columns.append(snowpark_fn.col(c.snowpark_name))
+
+    disambiguated_df = container.dataframe.select(*disambiguated_columns)
+
+    def _get_new_schema():
+        old_schema = container.dataframe.schema
+        if not old_schema.fields:
+            return StructType([])
+
+        new_fields = []
+        for i, name in enumerate(disambiguated_snowpark_names):
+            f = old_schema.fields[i]
+            new_fields.append(
+                StructField(name, f.datatype, nullable=f.nullable, _is_column=True)
+            )
+        return StructType(new_fields)
+
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=disambiguated_df,
+        spark_column_names=column_map.get_spark_columns(),
+        snowpark_column_names=disambiguated_snowpark_names,
+        column_metadata=column_map.column_metadata,
+        column_qualifiers=column_map.get_qualifiers(),
+        table_name=container.table_name,
+        cached_schema_getter=_get_new_schema,
+    )
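For orientation, a minimal PySpark Connect client sketch of the join shapes this rework targets; the connection URL, DataFrame contents, and column names are assumptions for illustration and are not taken from the package:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    left = spark.createDataFrame([(1, "a")], ["id", "left_val"])
    right = spark.createDataFrame([(1, "b")], ["id", "right_val"])

    # USING-column join: the shared "id" column is validated on both sides,
    # appears once in the result, and is moved to the front.
    left.join(right, on="id", how="full_outer").show()

    # Natural joins in SQL go through the same USING/natural code path.
    left.createOrReplaceTempView("l")
    right.createOrReplaceTempView("r")
    spark.sql("SELECT * FROM l NATURAL JOIN r").show()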
snowflake/snowpark_connect/relation/map_row_ops.py

@@ -45,6 +45,61 @@ from snowflake.snowpark_connect.utils.telemetry import (
 )


+def cast_columns(
+    df_container: DataFrameContainer,
+    df_dtypes: list[snowpark.types.DataType],
+    target_dtypes: list[snowpark.types.DataType],
+    column_map: ColumnNameMap,
+):
+    df: snowpark.DataFrame = df_container.dataframe
+    if df_dtypes == target_dtypes:
+        return df_container
+    # Use cached schema if available to avoid triggering extra queries
+    if (
+        hasattr(df_container, "cached_schema_getter")
+        and df_container.cached_schema_getter is not None
+    ):
+        df_schema = df_container.cached_schema_getter()
+    else:
+        df_schema = df.schema  # Get current schema
+    new_columns = []
+
+    for i, field in enumerate(df_schema.fields):
+        col_name = field.name
+        current_type = field.datatype
+        target_type = target_dtypes[i]
+
+        if current_type != target_type:
+            new_columns.append(df[col_name].cast(target_type).alias(col_name))
+        else:
+            new_columns.append(df[col_name])
+
+    new_df = df.select(new_columns)
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=new_df,
+        spark_column_names=column_map.get_spark_columns(),
+        snowpark_column_names=column_map.get_snowpark_columns(),
+        snowpark_column_types=target_dtypes,
+        column_metadata=column_map.column_metadata,
+        parent_column_name_map=column_map,
+    )
+
+
+def get_schema_from_result(
+    result: DataFrameContainer,
+) -> StructType:
+    """
+    Get schema from a DataFrameContainer, using cached schema if available to avoid extra queries.
+    """
+    if (
+        hasattr(result, "cached_schema_getter")
+        and result.cached_schema_getter is not None
+    ):
+        return result.cached_schema_getter()
+    else:
+        return result.dataframe.schema
+
+
 def map_deduplicate(
     rel: relation_proto.Relation,
 ) -> DataFrameContainer:
@@ -205,21 +260,8 @@ def map_union(

    # workaround for unstructured type vs structured type
    # Use cached schema if available to avoid triggering extra queries
-
-
-        and left_result.cached_schema_getter is not None
-    ):
-        left_schema = left_result.cached_schema_getter()
-    else:
-        left_schema = left_df.schema
-
-    if (
-        hasattr(right_result, "cached_schema_getter")
-        and right_result.cached_schema_getter is not None
-    ):
-        right_schema = right_result.cached_schema_getter()
-    else:
-        right_schema = right_df.schema
+    left_schema = get_schema_from_result(left_result)
+    right_schema = get_schema_from_result(right_result)

    left_dtypes = [field.datatype for field in left_schema.fields]
    right_dtypes = [field.datatype for field in right_schema.fields]
@@ -257,6 +299,29 @@ def map_union(
                    # Union of any type with null type is of the other type
                    target_left_dtypes.append(other_t)
                    target_right_dtypes.append(other_t)
+                case (snowpark.types.DecimalType(), snowpark.types.DecimalType()):
+                    # Widen decimal types to accommodate both sides
+                    # Calculate the maximum scale and maximum integer digits
+                    left_integer_digits = left_type.precision - left_type.scale
+                    right_integer_digits = right_type.precision - right_type.scale
+
+                    # The common type needs to accommodate:
+                    # - The maximum number of digits after the decimal point (scale)
+                    # - The maximum number of digits before the decimal point (integer digits)
+                    common_scale = max(left_type.scale, right_type.scale)
+                    common_integer_digits = max(
+                        left_integer_digits, right_integer_digits
+                    )
+                    common_precision = min(38, common_scale + common_integer_digits)
+
+                    # Ensure scale doesn't exceed precision
+                    common_scale = min(common_scale, common_precision)
+
+                    common_type = snowpark.types.DecimalType(
+                        common_precision, common_scale
+                    )
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
                case (snowpark.types.BooleanType(), _) | (
                    _,
                    snowpark.types.BooleanType(),
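As a worked illustration of the decimal-widening rule added above (the numbers are made up, not taken from the package's tests): unioning DECIMAL(10, 2) with DECIMAL(12, 6) keeps the larger scale and the larger count of integer digits, then caps precision at 38.

    # Illustrative restatement of the widening rule, not the package's code.
    def widen_decimal(left_precision, left_scale, right_precision, right_scale):
        scale = max(left_scale, right_scale)
        integer_digits = max(left_precision - left_scale, right_precision - right_scale)
        precision = min(38, scale + integer_digits)
        return precision, min(scale, precision)

    assert widen_decimal(10, 2, 12, 6) == (14, 6)    # DECIMAL(14, 6)
    assert widen_decimal(38, 0, 10, 10) == (38, 10)  # precision capped at 38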
@@ -272,49 +337,24 @@ def map_union(
                    raise exception
                    target_left_dtypes.append(left_type)
                    target_right_dtypes.append(right_type)
+                case (
+                    snowpark.types.TimestampType()
+                    | snowpark.types.DateType()
+                    | snowpark.types._NumericType(),
+                    snowpark.types.StringType(),
+                ) | (
+                    snowpark.types.StringType(),
+                    snowpark.types.TimestampType()
+                    | snowpark.types.DateType()
+                    | snowpark.types._NumericType(),
+                ) if not spark_sql_ansi_enabled:
+                    common_type = snowpark.types.StringType()
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
                case _:
                    target_left_dtypes.append(left_type)
                    target_right_dtypes.append(right_type)

-    def cast_columns(
-        df_container: DataFrameContainer,
-        df_dtypes: list[snowpark.types.DataType],
-        target_dtypes: list[snowpark.types.DataType],
-        column_map: ColumnNameMap,
-    ):
-        df: snowpark.DataFrame = df_container.dataframe
-        if df_dtypes == target_dtypes:
-            return df_container
-        # Use cached schema if available to avoid triggering extra queries
-        if (
-            hasattr(df_container, "cached_schema_getter")
-            and df_container.cached_schema_getter is not None
-        ):
-            df_schema = df_container.cached_schema_getter()
-        else:
-            df_schema = df.schema  # Get current schema
-        new_columns = []
-
-        for i, field in enumerate(df_schema.fields):
-            col_name = field.name
-            current_type = field.datatype
-            target_type = target_dtypes[i]
-
-            if current_type != target_type:
-                new_columns.append(df[col_name].cast(target_type).alias(col_name))
-            else:
-                new_columns.append(df[col_name])
-
-        new_df = df.select(new_columns)
-        return DataFrameContainer.create_with_column_mapping(
-            dataframe=new_df,
-            spark_column_names=column_map.get_spark_columns(),
-            snowpark_column_names=column_map.get_snowpark_columns(),
-            snowpark_column_types=target_dtypes,
-            column_metadata=column_map.column_metadata,
-            parent_column_name_map=column_map,
-        )
-
    left_result = cast_columns(
        left_result,
        left_dtypes,
@@ -527,6 +567,48 @@ def map_except(
    left_df = left_result.dataframe
    right_df = right_result.dataframe

+    # workaround for unstructured type vs structured type
+    # Use cached schema if available to avoid triggering extra queries
+    left_schema = get_schema_from_result(left_result)
+    right_schema = get_schema_from_result(right_result)
+
+    left_dtypes = [field.datatype for field in left_schema.fields]
+    right_dtypes = [field.datatype for field in right_schema.fields]
+
+    if left_dtypes != right_dtypes and not rel.set_op.by_name:
+        if len(left_dtypes) != len(right_dtypes):
+            exception = AnalysisException("UNION: the number of columns must match")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
+        target_left_dtypes, target_right_dtypes = [], []
+        for left_type, right_type in zip(left_dtypes, right_dtypes):
+            match (left_type, right_type):
+                case (snowpark.types._NumericType(), snowpark.types.StringType()) | (
+                    snowpark.types.StringType(),
+                    snowpark.types._NumericType(),
+                ):
+                    common_type = snowpark.types.StringType()
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
+                case _:
+                    target_left_dtypes.append(left_type)
+                    target_right_dtypes.append(right_type)
+
+        left_result = cast_columns(
+            left_result,
+            left_dtypes,
+            target_left_dtypes,
+            left_result.column_map,
+        )
+        right_result = cast_columns(
+            right_result,
+            right_dtypes,
+            target_right_dtypes,
+            right_result.column_map,
+        )
+        left_df = left_result.dataframe
+        right_df = right_result.dataframe
+
    if rel.set_op.is_all:
        # Snowflake except removes all duplicated rows. In order to handle the case,
        # we add a partition row number column to the df to make duplicated rows unique to
|