snowpark-connect 0.32.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of snowpark-connect has been flagged as potentially problematic.
- snowflake/snowpark_connect/column_name_handler.py +91 -40
- snowflake/snowpark_connect/column_qualifier.py +0 -4
- snowflake/snowpark_connect/config.py +9 -0
- snowflake/snowpark_connect/expression/hybrid_column_map.py +5 -4
- snowflake/snowpark_connect/expression/literal.py +12 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +18 -4
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +150 -29
- snowflake/snowpark_connect/expression/map_unresolved_function.py +93 -55
- snowflake/snowpark_connect/relation/map_aggregate.py +156 -257
- snowflake/snowpark_connect/relation/map_column_ops.py +19 -0
- snowflake/snowpark_connect/relation/map_join.py +454 -252
- snowflake/snowpark_connect/relation/map_row_ops.py +136 -54
- snowflake/snowpark_connect/relation/map_sql.py +335 -90
- snowflake/snowpark_connect/relation/read/map_read.py +9 -1
- snowflake/snowpark_connect/relation/read/map_read_csv.py +19 -2
- snowflake/snowpark_connect/relation/read/map_read_json.py +90 -2
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +3 -0
- snowflake/snowpark_connect/relation/read/map_read_text.py +4 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
- snowflake/snowpark_connect/relation/read/utils.py +41 -0
- snowflake/snowpark_connect/relation/utils.py +50 -2
- snowflake/snowpark_connect/relation/write/map_write.py +251 -292
- snowflake/snowpark_connect/resources_initializer.py +25 -13
- snowflake/snowpark_connect/server.py +9 -24
- snowflake/snowpark_connect/type_mapping.py +2 -0
- snowflake/snowpark_connect/typed_column.py +2 -2
- snowflake/snowpark_connect/utils/context.py +0 -14
- snowflake/snowpark_connect/utils/expression_transformer.py +163 -0
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +4 -1
- snowflake/snowpark_connect/utils/udf_helper.py +1 -0
- snowflake/snowpark_connect/utils/udtf_helper.py +3 -0
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/METADATA +4 -2
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/RECORD +43 -104
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/top_level.txt +0 -0
--- a/snowflake/snowpark_connect/relation/map_row_ops.py
+++ b/snowflake/snowpark_connect/relation/map_row_ops.py
@@ -45,6 +45,61 @@ from snowflake.snowpark_connect.utils.telemetry import (
 )
 
 
+def cast_columns(
+    df_container: DataFrameContainer,
+    df_dtypes: list[snowpark.types.DataType],
+    target_dtypes: list[snowpark.types.DataType],
+    column_map: ColumnNameMap,
+):
+    df: snowpark.DataFrame = df_container.dataframe
+    if df_dtypes == target_dtypes:
+        return df_container
+    # Use cached schema if available to avoid triggering extra queries
+    if (
+        hasattr(df_container, "cached_schema_getter")
+        and df_container.cached_schema_getter is not None
+    ):
+        df_schema = df_container.cached_schema_getter()
+    else:
+        df_schema = df.schema  # Get current schema
+    new_columns = []
+
+    for i, field in enumerate(df_schema.fields):
+        col_name = field.name
+        current_type = field.datatype
+        target_type = target_dtypes[i]
+
+        if current_type != target_type:
+            new_columns.append(df[col_name].cast(target_type).alias(col_name))
+        else:
+            new_columns.append(df[col_name])
+
+    new_df = df.select(new_columns)
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=new_df,
+        spark_column_names=column_map.get_spark_columns(),
+        snowpark_column_names=column_map.get_snowpark_columns(),
+        snowpark_column_types=target_dtypes,
+        column_metadata=column_map.column_metadata,
+        parent_column_name_map=column_map,
+    )
+
+
+def get_schema_from_result(
+    result: DataFrameContainer,
+) -> StructType:
+    """
+    Get schema from a DataFrameContainer, using cached schema if available to avoid extra queries.
+    """
+    if (
+        hasattr(result, "cached_schema_getter")
+        and result.cached_schema_getter is not None
+    ):
+        return result.cached_schema_getter()
+    else:
+        return result.dataframe.schema
+
+
 def map_deduplicate(
     rel: relation_proto.Relation,
 ) -> DataFrameContainer:
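The two helpers above are hoisted to module level so that map_union and map_except can share them. The key idea in get_schema_from_result is to prefer a cached schema getter over dataframe.schema, which can cost an extra server round trip. A minimal standalone sketch of that fallback pattern, using a hypothetical FakeContainer in place of DataFrameContainer (strings stand in for real StructType schemas):

```python
# Sketch of the cached-schema fallback pattern centralized by
# get_schema_from_result(). FakeContainer is a hypothetical stand-in.
from dataclasses import dataclass
from typing import Callable, Optional

@dataclass
class FakeContainer:
    live_schema: str  # stands in for dataframe.schema (may hit the server)
    cached_schema_getter: Optional[Callable[[], str]] = None

def get_schema(result: FakeContainer) -> str:
    # Prefer the cached getter: no extra query is triggered.
    if getattr(result, "cached_schema_getter", None) is not None:
        return result.cached_schema_getter()
    return result.live_schema

print(get_schema(FakeContainer("from server")))                    # from server
print(get_schema(FakeContainer("from server", lambda: "cached")))  # cached
```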
@@ -205,21 +260,8 @@ def map_union(
 
     # workaround for unstructured type vs structured type
     # Use cached schema if available to avoid triggering extra queries
-    if (
-        hasattr(left_result, "cached_schema_getter")
-        and left_result.cached_schema_getter is not None
-    ):
-        left_schema = left_result.cached_schema_getter()
-    else:
-        left_schema = left_df.schema
-
-    if (
-        hasattr(right_result, "cached_schema_getter")
-        and right_result.cached_schema_getter is not None
-    ):
-        right_schema = right_result.cached_schema_getter()
-    else:
-        right_schema = right_df.schema
+    left_schema = get_schema_from_result(left_result)
+    right_schema = get_schema_from_result(right_result)
 
     left_dtypes = [field.datatype for field in left_schema.fields]
     right_dtypes = [field.datatype for field in right_schema.fields]
@@ -257,6 +299,29 @@ def map_union(
                 # Union of any type with null type is of the other type
                 target_left_dtypes.append(other_t)
                 target_right_dtypes.append(other_t)
+            case (snowpark.types.DecimalType(), snowpark.types.DecimalType()):
+                # Widen decimal types to accommodate both sides
+                # Calculate the maximum scale and maximum integer digits
+                left_integer_digits = left_type.precision - left_type.scale
+                right_integer_digits = right_type.precision - right_type.scale
+
+                # The common type needs to accommodate:
+                # - The maximum number of digits after the decimal point (scale)
+                # - The maximum number of digits before the decimal point (integer digits)
+                common_scale = max(left_type.scale, right_type.scale)
+                common_integer_digits = max(
+                    left_integer_digits, right_integer_digits
+                )
+                common_precision = min(38, common_scale + common_integer_digits)
+
+                # Ensure scale doesn't exceed precision
+                common_scale = min(common_scale, common_precision)
+
+                common_type = snowpark.types.DecimalType(
+                    common_precision, common_scale
+                )
+                target_left_dtypes.append(common_type)
+                target_right_dtypes.append(common_type)
             case (snowpark.types.BooleanType(), _) | (
                 _,
                 snowpark.types.BooleanType(),
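The decimal case above follows the usual widening rule for a common decimal type: keep the larger scale and the larger integer-digit count, capped at Snowflake's maximum precision of 38. A standalone check of the same arithmetic (the input precisions and scales are chosen here purely for illustration):

```python
# Standalone check of the decimal-widening rule added above.
def widen(lp, ls, rp, rs):
    common_scale = max(ls, rs)
    common_int_digits = max(lp - ls, rp - rs)
    common_precision = min(38, common_scale + common_int_digits)
    common_scale = min(common_scale, common_precision)
    return common_precision, common_scale

# DECIMAL(10, 2) union DECIMAL(8, 5): needs 8 integer digits and 5
# fractional digits -> DECIMAL(13, 5).
assert widen(10, 2, 8, 5) == (13, 5)
# Both sides near the cap: precision is clamped to Snowflake's max of 38.
assert widen(38, 0, 38, 10) == (38, 10)
```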
@@ -272,49 +337,24 @@ def map_union(
                     raise exception
                 target_left_dtypes.append(left_type)
                 target_right_dtypes.append(right_type)
+            case (
+                snowpark.types.TimestampType()
+                | snowpark.types.DateType()
+                | snowpark.types._NumericType(),
+                snowpark.types.StringType(),
+            ) | (
+                snowpark.types.StringType(),
+                snowpark.types.TimestampType()
+                | snowpark.types.DateType()
+                | snowpark.types._NumericType(),
+            ) if not spark_sql_ansi_enabled:
+                common_type = snowpark.types.StringType()
+                target_left_dtypes.append(common_type)
+                target_right_dtypes.append(common_type)
             case _:
                 target_left_dtypes.append(left_type)
                 target_right_dtypes.append(right_type)
 
-    def cast_columns(
-        df_container: DataFrameContainer,
-        df_dtypes: list[snowpark.types.DataType],
-        target_dtypes: list[snowpark.types.DataType],
-        column_map: ColumnNameMap,
-    ):
-        df: snowpark.DataFrame = df_container.dataframe
-        if df_dtypes == target_dtypes:
-            return df_container
-        # Use cached schema if available to avoid triggering extra queries
-        if (
-            hasattr(df_container, "cached_schema_getter")
-            and df_container.cached_schema_getter is not None
-        ):
-            df_schema = df_container.cached_schema_getter()
-        else:
-            df_schema = df.schema  # Get current schema
-        new_columns = []
-
-        for i, field in enumerate(df_schema.fields):
-            col_name = field.name
-            current_type = field.datatype
-            target_type = target_dtypes[i]
-
-            if current_type != target_type:
-                new_columns.append(df[col_name].cast(target_type).alias(col_name))
-            else:
-                new_columns.append(df[col_name])
-
-        new_df = df.select(new_columns)
-        return DataFrameContainer.create_with_column_mapping(
-            dataframe=new_df,
-            spark_column_names=column_map.get_spark_columns(),
-            snowpark_column_names=column_map.get_snowpark_columns(),
-            snowpark_column_types=target_dtypes,
-            column_metadata=column_map.column_metadata,
-            parent_column_name_map=column_map,
-        )
-
     left_result = cast_columns(
         left_result,
         left_dtypes,
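The new guarded case mirrors Spark's non-ANSI set-operation coercion, where a union between a string column and a timestamp, date, or numeric column resolves to string rather than failing. A sketch of the reference behavior in plain PySpark, assuming a local Spark session (spark.sql.ansi.enabled defaults to false in Spark 3.5):

```python
# Illustrative PySpark check of the behavior the new case reproduces.
from pyspark.sql import SparkSession

spark = SparkSession.builder.config("spark.sql.ansi.enabled", "false").getOrCreate()
ints = spark.createDataFrame([(1,)], "a int")
strs = spark.createDataFrame([("x",)], "a string")
# With ANSI mode off, the mismatched column resolves to StringType
# instead of raising an analysis error.
print(ints.union(strs).schema)
```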
@@ -527,6 +567,48 @@ def map_except(
     left_df = left_result.dataframe
     right_df = right_result.dataframe
 
+    # workaround for unstructured type vs structured type
+    # Use cached schema if available to avoid triggering extra queries
+    left_schema = get_schema_from_result(left_result)
+    right_schema = get_schema_from_result(right_result)
+
+    left_dtypes = [field.datatype for field in left_schema.fields]
+    right_dtypes = [field.datatype for field in right_schema.fields]
+
+    if left_dtypes != right_dtypes and not rel.set_op.by_name:
+        if len(left_dtypes) != len(right_dtypes):
+            exception = AnalysisException("UNION: the number of columns must match")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
+        target_left_dtypes, target_right_dtypes = [], []
+        for left_type, right_type in zip(left_dtypes, right_dtypes):
+            match (left_type, right_type):
+                case (snowpark.types._NumericType(), snowpark.types.StringType()) | (
+                    snowpark.types.StringType(),
+                    snowpark.types._NumericType(),
+                ):
+                    common_type = snowpark.types.StringType()
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
+                case _:
+                    target_left_dtypes.append(left_type)
+                    target_right_dtypes.append(right_type)
+
+        left_result = cast_columns(
+            left_result,
+            left_dtypes,
+            target_left_dtypes,
+            left_result.column_map,
+        )
+        right_result = cast_columns(
+            right_result,
+            right_dtypes,
+            target_right_dtypes,
+            right_result.column_map,
+        )
+        left_df = left_result.dataframe
+        right_df = right_result.dataframe
+
     if rel.set_op.is_all:
         # Snowflake except removes all duplicated rows. In order to handle the case,
         # we add a partition row number column to the df to make duplicated rows unique to
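The trailing context lines refer to the existing EXCEPT ALL workaround: Snowflake's EXCEPT has set semantics, so duplicates are first made unique with a per-group row number, the EXCEPT runs on the widened rows, and the helper column is dropped afterwards. A hedged sketch of that trick expressed in plain PySpark (the helper name "_rn" and the function are hypothetical illustrations, not the query this package generates):

```python
# Sketch of the EXCEPT ALL emulation described in the comment: tag each
# duplicate with a per-group row number so a set-based EXCEPT (subtract)
# preserves multiplicity, then drop the helper column. Assumes both
# inputs share the same column names.
from pyspark.sql import Window, functions as F

def except_all_via_row_number(left, right):
    w = Window.partitionBy(*left.columns).orderBy(F.lit(1))
    l = left.withColumn("_rn", F.row_number().over(w))
    r = right.withColumn("_rn", F.row_number().over(w))
    # subtract() is EXCEPT DISTINCT, but rows are now unique within each input.
    return l.subtract(r).drop("_rn")
```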