snowpark-connect 0.20.2__py3-none-any.whl → 0.22.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of snowpark-connect might be problematic.

Files changed (84)
  1. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +3 -2
  2. snowflake/snowpark_connect/column_name_handler.py +6 -65
  3. snowflake/snowpark_connect/config.py +47 -17
  4. snowflake/snowpark_connect/dataframe_container.py +242 -0
  5. snowflake/snowpark_connect/error/error_utils.py +25 -0
  6. snowflake/snowpark_connect/execute_plan/map_execution_command.py +13 -23
  7. snowflake/snowpark_connect/execute_plan/map_execution_root.py +9 -5
  8. snowflake/snowpark_connect/expression/map_extension.py +2 -1
  9. snowflake/snowpark_connect/expression/map_udf.py +4 -4
  10. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +8 -7
  11. snowflake/snowpark_connect/expression/map_unresolved_function.py +481 -170
  12. snowflake/snowpark_connect/expression/map_unresolved_star.py +8 -8
  13. snowflake/snowpark_connect/expression/map_update_fields.py +1 -1
  14. snowflake/snowpark_connect/expression/typer.py +6 -6
  15. snowflake/snowpark_connect/proto/control_pb2.py +17 -16
  16. snowflake/snowpark_connect/proto/control_pb2.pyi +17 -17
  17. snowflake/snowpark_connect/proto/control_pb2_grpc.py +12 -63
  18. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +15 -14
  19. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +19 -14
  20. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +4 -0
  21. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +27 -26
  22. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +74 -68
  23. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +4 -0
  24. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +5 -5
  25. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +25 -17
  26. snowflake/snowpark_connect/relation/map_aggregate.py +170 -61
  27. snowflake/snowpark_connect/relation/map_catalog.py +2 -2
  28. snowflake/snowpark_connect/relation/map_column_ops.py +227 -145
  29. snowflake/snowpark_connect/relation/map_crosstab.py +25 -6
  30. snowflake/snowpark_connect/relation/map_extension.py +81 -56
  31. snowflake/snowpark_connect/relation/map_join.py +72 -63
  32. snowflake/snowpark_connect/relation/map_local_relation.py +35 -20
  33. snowflake/snowpark_connect/relation/map_map_partitions.py +24 -17
  34. snowflake/snowpark_connect/relation/map_relation.py +22 -16
  35. snowflake/snowpark_connect/relation/map_row_ops.py +232 -146
  36. snowflake/snowpark_connect/relation/map_sample_by.py +15 -8
  37. snowflake/snowpark_connect/relation/map_show_string.py +42 -5
  38. snowflake/snowpark_connect/relation/map_sql.py +141 -237
  39. snowflake/snowpark_connect/relation/map_stats.py +88 -39
  40. snowflake/snowpark_connect/relation/map_subquery_alias.py +13 -14
  41. snowflake/snowpark_connect/relation/map_udtf.py +10 -13
  42. snowflake/snowpark_connect/relation/read/map_read.py +8 -3
  43. snowflake/snowpark_connect/relation/read/map_read_csv.py +7 -7
  44. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +7 -7
  45. snowflake/snowpark_connect/relation/read/map_read_json.py +19 -8
  46. snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -7
  47. snowflake/snowpark_connect/relation/read/map_read_socket.py +7 -3
  48. snowflake/snowpark_connect/relation/read/map_read_table.py +25 -16
  49. snowflake/snowpark_connect/relation/read/map_read_text.py +7 -7
  50. snowflake/snowpark_connect/relation/read/reader_config.py +1 -0
  51. snowflake/snowpark_connect/relation/utils.py +11 -5
  52. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +15 -12
  53. snowflake/snowpark_connect/relation/write/map_write.py +259 -56
  54. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +3 -2
  55. snowflake/snowpark_connect/server.py +43 -4
  56. snowflake/snowpark_connect/type_mapping.py +6 -23
  57. snowflake/snowpark_connect/utils/cache.py +27 -22
  58. snowflake/snowpark_connect/utils/context.py +33 -17
  59. snowflake/snowpark_connect/utils/describe_query_cache.py +2 -9
  60. snowflake/snowpark_connect/utils/{attribute_handling.py → identifiers.py} +47 -0
  61. snowflake/snowpark_connect/utils/session.py +41 -38
  62. snowflake/snowpark_connect/utils/telemetry.py +214 -63
  63. snowflake/snowpark_connect/utils/udxf_import_utils.py +14 -0
  64. snowflake/snowpark_connect/version.py +1 -1
  65. snowflake/snowpark_decoder/__init__.py +0 -0
  66. snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.py +36 -0
  67. snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.pyi +156 -0
  68. snowflake/snowpark_decoder/dp_session.py +111 -0
  69. snowflake/snowpark_decoder/spark_decoder.py +76 -0
  70. {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/METADATA +6 -4
  71. {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/RECORD +83 -69
  72. snowpark_connect-0.22.1.dist-info/licenses/LICENSE-binary +568 -0
  73. snowpark_connect-0.22.1.dist-info/licenses/NOTICE-binary +1533 -0
  74. {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/top_level.txt +1 -0
  75. spark/__init__.py +0 -0
  76. spark/connect/__init__.py +0 -0
  77. spark/connect/envelope_pb2.py +31 -0
  78. spark/connect/envelope_pb2.pyi +46 -0
  79. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  80. {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-connect +0 -0
  81. {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-session +0 -0
  82. {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-submit +0 -0
  83. {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/WHEEL +0 -0
  84. {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/licenses/LICENSE.txt +0 -0
@@ -28,6 +28,7 @@ from google.protobuf.message import Message
28
28
  from pyspark.errors.exceptions.base import (
29
29
  AnalysisException,
30
30
  ArithmeticException,
31
+ ArrayIndexOutOfBoundsException,
31
32
  DateTimeException,
32
33
  IllegalArgumentException,
33
34
  NumberFormatException,
@@ -39,6 +40,7 @@ from pyspark.sql.types import _parse_datatype_json_string
39
40
  import snowflake.snowpark.functions as snowpark_fn
40
41
  from snowflake import snowpark
41
42
  from snowflake.snowpark import Column, Session
43
+ from snowflake.snowpark._internal.analyzer.expression import Literal
42
44
  from snowflake.snowpark._internal.analyzer.unary_expression import Alias
43
45
  from snowflake.snowpark.types import (
44
46
  ArrayType,
@@ -69,7 +71,10 @@ from snowflake.snowpark_connect.column_name_handler import (
69
71
  ColumnNameMap,
70
72
  set_schema_getter,
71
73
  )
72
- from snowflake.snowpark_connect.config import global_config
74
+ from snowflake.snowpark_connect.config import (
75
+ get_boolean_session_config_param,
76
+ global_config,
77
+ )
73
78
  from snowflake.snowpark_connect.constants import (
74
79
  DUPLICATE_KEY_FOUND_ERROR_TEMPLATE,
75
80
  SPARK_TZ_ABBREVIATIONS_OVERRIDES,
@@ -100,6 +105,7 @@ from snowflake.snowpark_connect.typed_column import (
100
105
  )
101
106
  from snowflake.snowpark_connect.utils.context import (
102
107
  add_sql_aggregate_function,
108
+ get_current_grouping_columns,
103
109
  get_is_aggregate_function,
104
110
  get_is_evaluating_sql,
105
111
  get_is_in_udtf_context,
@@ -135,7 +141,7 @@ from snowflake.snowpark_connect.utils.xxhash64 import (
135
141
  MAX_UINT64 = 2**64 - 1
136
142
  MAX_INT64 = 2**63 - 1
137
143
  MIN_INT64 = -(2**63)
138
-
144
+ MAX_ARRAY_SIZE = 2_147_483_647
139
145
 
140
146
  NAN, INFINITY = float("nan"), float("inf")
141
147
 
@@ -341,6 +347,9 @@ def map_unresolved_function(
341
347
  )
342
348
  spark_col_names = []
343
349
  spark_sql_ansi_enabled = global_config.spark_sql_ansi_enabled
350
+ spark_sql_legacy_allow_hash_on_map_type = (
351
+ global_config.spark_sql_legacy_allowHashOnMapType
352
+ )
344
353
 
345
354
  function_name = exp.unresolved_function.function_name.lower()
346
355
  telemetry.report_function_usage(function_name)
@@ -631,37 +640,22 @@ def map_unresolved_function(
631
640
  [arg.typ for arg in snowpark_typed_args]
632
641
  )
633
642
  case "/":
634
- if isinstance(
635
- snowpark_typed_args[0].typ, (IntegerType, LongType, ShortType)
636
- ) and isinstance(
637
- snowpark_typed_args[1].typ, (IntegerType, LongType, ShortType)
638
- ):
639
- # Check if both arguments are integer types. Snowpark performs integer division, and precision is lost.
640
- # Cast to double and perform division
641
- result_exp = _divnull(
642
- snowpark_args[0].cast(DoubleType()),
643
- snowpark_args[1].cast(DoubleType()),
644
- )
645
- result_type = DoubleType()
646
- elif (
647
- isinstance(snowpark_typed_args[0].typ, DecimalType)
648
- and isinstance(snowpark_typed_args[1].typ, DecimalType)
649
- or isinstance(snowpark_typed_args[0].typ, DecimalType)
650
- and isinstance(snowpark_typed_args[1].typ, _IntegralType)
651
- or isinstance(snowpark_typed_args[0].typ, _IntegralType)
652
- and isinstance(snowpark_typed_args[1].typ, DecimalType)
653
- ):
654
- result_exp, (
655
- return_type_precision,
656
- return_type_scale,
657
- ) = _mul_div_precision_helper(snowpark_typed_args, snowpark_args, 1)
658
- result_type = DecimalType(return_type_precision, return_type_scale)
659
- else:
660
- # Perform division directly
661
- result_exp = _divnull(snowpark_args[0], snowpark_args[1])
662
- result_type = _find_common_type(
663
- [arg.typ for arg in snowpark_typed_args]
664
- )
643
+ match (snowpark_typed_args[0].typ, snowpark_typed_args[1].typ):
644
+ case (DecimalType(), t) | (t, DecimalType()) if isinstance(
645
+ t, DecimalType
646
+ ) or isinstance(t, _IntegralType) or isinstance(
647
+ snowpark_typed_args[1].typ, NullType
648
+ ):
649
+ result_exp, (
650
+ return_type_precision,
651
+ return_type_scale,
652
+ ) = _mul_div_precision_helper(snowpark_typed_args, snowpark_args, 1)
653
+ result_type = DecimalType(return_type_precision, return_type_scale)
654
+ case _:
655
+ result_type = DoubleType()
656
+ dividend = snowpark_args[0].cast(result_type)
657
+ divisor = snowpark_args[1].cast(result_type)
658
+ result_exp = _divnull(dividend, divisor)
665
659
  case "~":
666
660
  result_exp = TypedColumn(
667
661
  snowpark_fn.bitnot(snowpark_args[0]),
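
For reference, a minimal PySpark sketch (assuming a local SparkSession) of the division semantics the rewritten "/" branch targets: integral operands always produce a DOUBLE result, while DECIMAL operands keep a derived precision and scale.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(3, 2)], "a INT, b INT")
# Spark never does integer division with "/": 3 / 2 evaluates to 1.5 (DOUBLE).
df.select((F.col("a") / F.col("b")).alias("q")).printSchema()
# root
#  |-- q: double (nullable = true)
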
@@ -867,14 +861,30 @@ def map_unresolved_function(
867
861
  )
868
862
  case "approx_percentile" | "percentile_approx":
869
863
  # SNOW-1955784: Support accuracy parameter
864
+ # Use percentile_disc to return actual values from dataset (matches PySpark behavior)
870
865
 
871
- # Even though the Spark function accepts a Column for percentage, it will fail unless it's a literal.
872
- # Therefore, we can do error checking right here.
873
- def _check_percentage(exp: expressions_proto.Expression) -> Column:
874
- perc = unwrap_literal(exp)
875
- if not 0.0 <= perc <= 1.0:
866
+ def _pyspark_approx_percentile(
867
+ column: Column, percentage: float, original_type: DataType
868
+ ) -> Column:
869
+ """
870
+ PySpark-compatible percentile that returns actual values from dataset.
871
+ - PySpark's approx_percentile returns the "smallest value in the ordered col values
872
+ such that no more than percentage of col values is less than or equal to that value"
873
+ - This means it MUST return an actual value from the original dataset
874
+ - Snowflake's approx_percentile() may interpolate between values, breaking compatibility
875
+ - percentile_disc() returns discrete values (actual dataset values), matching PySpark
876
+ """
877
+ # Even though the Spark function accepts a Column for percentage, it will fail unless it's a literal.
878
+ # Therefore, we can do error checking right here.
879
+ if not 0.0 <= percentage <= 1.0:
876
880
  raise AnalysisException("percentage must be between [0.0, 1.0]")
877
- return snowpark_fn.lit(perc)
881
+
882
+ result = snowpark_fn.function("percentile_disc")(
883
+ snowpark_fn.lit(percentage)
884
+ ).within_group(column)
885
+ return snowpark_fn.cast(result, original_type)
886
+
887
+ column_type = snowpark_typed_args[0].typ
878
888
 
879
889
  if isinstance(snowpark_typed_args[1].typ, ArrayType):
880
890
  # Snowpark doesn't accept a list of percentile values.
@@ -882,26 +892,26 @@ def map_unresolved_function(
882
892
  array_func = exp.unresolved_function.arguments[1].unresolved_function
883
893
  assert array_func.function_name == "array", array_func
884
894
 
885
- result_exp = snowpark_fn.array_construct(
886
- *[
887
- snowpark_fn.approx_percentile(
888
- snowpark_args[0], _check_percentage(arg)
889
- )
890
- for arg in array_func.arguments
891
- ]
892
- )
895
+ percentile_results = [
896
+ _pyspark_approx_percentile(
897
+ snowpark_args[0], unwrap_literal(arg), column_type
898
+ )
899
+ for arg in array_func.arguments
900
+ ]
901
+
902
+ result_type = ArrayType(element_type=column_type, contains_null=False)
893
903
  result_exp = snowpark_fn.cast(
894
- result_exp,
895
- ArrayType(element_type=DoubleType(), contains_null=False),
904
+ snowpark_fn.array_construct(*percentile_results),
905
+ result_type,
896
906
  )
897
- result_type = ArrayType(element_type=DoubleType(), contains_null=False)
898
907
  else:
908
+ # Handle single percentile
909
+ percentage = unwrap_literal(exp.unresolved_function.arguments[1])
899
910
  result_exp = TypedColumn(
900
- snowpark_fn.approx_percentile(
901
- snowpark_args[0],
902
- _check_percentage(exp.unresolved_function.arguments[1]),
911
+ _pyspark_approx_percentile(
912
+ snowpark_args[0], percentage, column_type
903
913
  ),
904
- lambda: [DoubleType()],
914
+ lambda: [column_type],
905
915
  )
906
916
  case "array":
907
917
  if len(snowpark_args) == 0:
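
A hedged Snowpark sketch (assuming an existing Session named session) of the percentile_disc pattern that _pyspark_approx_percentile relies on: PERCENTILE_DISC ... WITHIN GROUP returns an actual value from the column rather than an interpolated one, which is what PySpark's approx_percentile promises.

import snowflake.snowpark.functions as snowpark_fn

df = session.create_dataframe([[1], [2], [3], [4]], schema=["v"])
# Discrete percentile: the result is one of the original values (2 for p=0.5 here),
# never an interpolated 2.5.
p50 = snowpark_fn.function("percentile_disc")(snowpark_fn.lit(0.5)).within_group(df["v"])
df.agg(p50.alias("p50")).show()
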
@@ -1178,35 +1188,18 @@ def map_unresolved_function(
1178
1188
  snowpark_fn.asinh(snowpark_args[0]), lambda: [DoubleType()]
1179
1189
  )
1180
1190
  case "assert_true":
1191
+ result_type = NullType()
1192
+ raise_error = _raise_error_helper(result_type)
1181
1193
 
1182
- @cached_udf(
1183
- input_types=[BooleanType()],
1184
- return_type=StringType(),
1185
- )
1186
- def _assert_true_single(expr):
1187
- if not expr:
1188
- raise ValueError("assertion failed")
1189
- return None
1190
-
1191
- @cached_udf(
1192
- input_types=[BooleanType(), StringType()],
1193
- return_type=StringType(),
1194
- )
1195
- def _assert_true_with_message(expr, message):
1196
- if not expr:
1197
- raise ValueError(message)
1198
- return None
1199
-
1200
- # Handle different argument counts using match pattern
1201
1194
  match snowpark_args:
1202
1195
  case [expr]:
1203
- result_exp = TypedColumn(
1204
- _assert_true_single(expr), lambda: [StringType()]
1205
- )
1196
+ result_exp = snowpark_fn.when(
1197
+ expr, snowpark_fn.lit(None)
1198
+ ).otherwise(raise_error(snowpark_fn.lit("assertion failed")))
1206
1199
  case [expr, message]:
1207
- result_exp = TypedColumn(
1208
- _assert_true_with_message(expr, message), lambda: [StringType()]
1209
- )
1200
+ result_exp = snowpark_fn.when(
1201
+ expr, snowpark_fn.lit(None)
1202
+ ).otherwise(raise_error(snowpark_fn.cast(message, StringType())))
1210
1203
  case _:
1211
1204
  raise AnalysisException(
1212
1205
  f"[WRONG_NUM_ARGS.WITHOUT_SUGGESTION] The `assert_true` requires 1 or 2 parameters but the actual number is {len(snowpark_args)}."
@@ -2073,14 +2066,22 @@ def map_unresolved_function(
2073
2066
  assert (
2074
2067
  len(exp.unresolved_function.arguments) == 2
2075
2068
  ), "date_format takes 2 arguments"
2076
- result_exp = snowpark_fn.date_format(
2077
- snowpark_args[0],
2078
- snowpark_fn.lit(
2079
- map_spark_timestamp_format_expression(
2080
- exp.unresolved_function.arguments[1], snowpark_typed_args[0].typ
2081
- )
2082
- ),
2083
- )
2069
+
2070
+ # Check if format parameter is NULL
2071
+ format_literal = unwrap_literal(exp.unresolved_function.arguments[1])
2072
+ if format_literal is None:
2073
+ # If format is NULL, return NULL for all rows
2074
+ result_exp = snowpark_fn.lit(None)
2075
+ else:
2076
+ result_exp = snowpark_fn.date_format(
2077
+ snowpark_args[0],
2078
+ snowpark_fn.lit(
2079
+ map_spark_timestamp_format_expression(
2080
+ exp.unresolved_function.arguments[1],
2081
+ snowpark_typed_args[0].typ,
2082
+ )
2083
+ ),
2084
+ )
2084
2085
  result_exp = TypedColumn(result_exp, lambda: [StringType()])
2085
2086
  case "date_from_unix_date":
2086
2087
  result_exp = snowpark_fn.date_add(
@@ -2260,31 +2261,32 @@ def map_unresolved_function(
2260
2261
  )
2261
2262
  case "elt":
2262
2263
  n = snowpark_args[0]
2263
-
2264
2264
  values = snowpark_fn.array_construct(*snowpark_args[1:])
2265
2265
 
2266
2266
  if spark_sql_ansi_enabled:
2267
-
2268
- @cached_udf(
2269
- input_types=[IntegerType()],
2270
- return_type=StringType(),
2267
+ raise_error = _raise_error_helper(
2268
+ StringType(), error_class=ArrayIndexOutOfBoundsException
2271
2269
  )
2272
- def _raise_out_of_bounds_error(n: int) -> str:
2273
- raise ValueError(
2274
- f"ArrayIndexOutOfBoundsException: {n} is not within the input bounds."
2275
- )
2276
-
2277
2270
  values_size = snowpark_fn.lit(len(snowpark_args) - 1)
2278
2271
 
2279
2272
  result_exp = (
2280
2273
  snowpark_fn.when(snowpark_fn.is_null(n), snowpark_fn.lit(None))
2281
2274
  .when(
2282
2275
  (snowpark_fn.lit(1) <= n) & (n <= values_size),
2283
- snowpark_fn.get(
2284
- values, snowpark_fn.nvl(n - 1, snowpark_fn.lit(0))
2276
+ snowpark_fn.cast(
2277
+ snowpark_fn.get(
2278
+ values, snowpark_fn.nvl(n - 1, snowpark_fn.lit(0))
2279
+ ),
2280
+ StringType(),
2285
2281
  ),
2286
2282
  )
2287
- .otherwise(_raise_out_of_bounds_error(n))
2283
+ .otherwise(
2284
+ raise_error(
2285
+ snowpark_fn.lit("[INVALID_ARRAY_INDEX] The index "),
2286
+ snowpark_fn.cast(n, StringType()),
2287
+ snowpark_fn.lit(" is out of bounds."),
2288
+ )
2289
+ )
2288
2290
  )
2289
2291
  else:
2290
2292
  result_exp = snowpark_fn.when(
@@ -2535,6 +2537,19 @@ def map_unresolved_function(
2535
2537
  input_types=[StringType(), StringType(), StructType()],
2536
2538
  )
2537
2539
  def _from_csv(csv_data: str, schema: str, options: Optional[dict]):
2540
+ if csv_data is None:
2541
+ return None
2542
+
2543
+ if csv_data == "":
2544
+ # Return dict with None values for empty string
2545
+ schemas = schema.split(",")
2546
+ results = {}
2547
+ for sc in schemas:
2548
+ parts = [i for i in sc.split(" ") if len(i) != 0]
2549
+ assert len(parts) == 2, f"{sc} is not a valid schema"
2550
+ results[parts[0]] = None
2551
+ return results
2552
+
2538
2553
  max_chars_per_column = -1
2539
2554
  sep = ","
2540
2555
 
@@ -2617,7 +2632,9 @@ def map_unresolved_function(
2617
2632
  case _:
2618
2633
  raise ValueError("Unrecognized from_csv parameters")
2619
2634
 
2620
- result_exp = snowpark_fn.cast(csv_result, ddl_schema)
2635
+ result_exp = snowpark_fn.when(
2636
+ snowpark_args[0].is_null(), snowpark_fn.lit(None)
2637
+ ).otherwise(snowpark_fn.cast(csv_result, ddl_schema))
2621
2638
  result_type = ddl_schema
2622
2639
  case "from_json":
2623
2640
  # TODO: support options.
@@ -2651,6 +2668,9 @@ def map_unresolved_function(
2651
2668
  # try to parse first, since spark returns null for invalid json
2652
2669
  result_exp = snowpark_fn.call_function("try_parse_json", snowpark_args[0])
2653
2670
 
2671
+ # Check if the original input is NULL - if so, return NULL for the entire result
2672
+ original_input_is_null = snowpark_args[0].is_null()
2673
+
2654
2674
  # helper function to make sure we have the expected array element type
2655
2675
  def _element_type_matches(
2656
2676
  array_exp: Column, element_type: DataType
@@ -2749,9 +2769,13 @@ def map_unresolved_function(
2749
2769
  else:
2750
2770
  return exp
2751
2771
 
2752
- result_exp = snowpark_fn.cast(
2753
- _coerce_to_type(result_exp, result_type), result_type
2754
- )
2772
+ # Apply the coercion to handle invalid JSON (creates struct with NULL fields)
2773
+ coerced_exp = _coerce_to_type(result_exp, result_type)
2774
+
2775
+ # If the original input was NULL, return NULL instead of a struct
2776
+ result_exp = snowpark_fn.when(
2777
+ original_input_is_null, snowpark_fn.lit(None)
2778
+ ).otherwise(snowpark_fn.cast(coerced_exp, result_type))
2755
2779
  case "from_unixtime":
2756
2780
 
2757
2781
  def raise_analysis_exception(
@@ -2896,10 +2920,53 @@ def map_unresolved_function(
2896
2920
  )
2897
2921
  case "grouping" | "grouping_id":
2898
2922
  # grouping_id is not an alias for grouping in PySpark, but Snowflake's implementation handles both
2899
- result_exp = snowpark_fn.grouping(*snowpark_args)
2923
+ current_grouping_cols = get_current_grouping_columns()
2924
+ if function_name == "grouping_id":
2925
+ if not snowpark_args:
2926
+ # grouping_id() with empty args means use all grouping columns
2927
+ spark_function_name = "grouping_id()"
2928
+ snowpark_args = [
2929
+ column_mapping.get_snowpark_column_name_from_spark_column_name(
2930
+ spark_col
2931
+ )
2932
+ for spark_col in current_grouping_cols
2933
+ ]
2934
+ else:
2935
+ # Verify that grouping arguments match current grouping columns
2936
+ spark_col_args = [
2937
+ column_mapping.get_spark_column_name_from_snowpark_column_name(
2938
+ sp_col.getName()
2939
+ )
2940
+ for sp_col in snowpark_args
2941
+ ]
2942
+ if current_grouping_cols != spark_col_args:
2943
+ raise AnalysisException(
2944
+ f"[GROUPING_ID_COLUMN_MISMATCH] Columns of grouping_id: {spark_col_args} doesnt match "
2945
+ f"Grouping columns: {current_grouping_cols}"
2946
+ )
2947
+ if function_name == "grouping_id":
2948
+ result_exp = snowpark_fn.grouping_id(*snowpark_args)
2949
+ else:
2950
+ result_exp = snowpark_fn.grouping(*snowpark_args)
2900
2951
  result_type = LongType()
2901
2952
  case "hash":
2902
2953
  # TODO: See the spark-compatibility-issues.md explanation, this is quite different from Spark.
2954
+ # MapType columns as input should raise an exception as they are not hashable.
2955
+ snowflake_compat = get_boolean_session_config_param(
2956
+ "enable_snowflake_extension_behavior"
2957
+ )
2958
+ # Snowflake's hash function does allow MAP types, but Spark does not. Therefore, if we have the extension behavior flag enabled
2959
+ # we want to let it pass through and hash MAP types.
2960
+ # Also allow if the legacy config spark.sql.legacy.allowHashOnMapType is set to true
2961
+ if not snowflake_compat and not spark_sql_legacy_allow_hash_on_map_type:
2962
+ for arg in snowpark_typed_args:
2963
+ if any(isinstance(t, MapType) for t in arg.types):
2964
+ raise AnalysisException(
2965
+ '[DATATYPE_MISMATCH.HASH_MAP_TYPE] Cannot resolve "hash(value)" due to data type mismatch: '
2966
+ 'Input to the function `hash` cannot contain elements of the "MAP" type. '
2967
+ 'In Spark, same maps may have different hashcode, thus hash expressions are prohibited on "MAP" elements. '
2968
+ 'To restore previous behavior set "spark.sql.legacy.allowHashOnMapType" to "true".'
2969
+ )
2903
2970
  result_exp = snowpark_fn.hash(*snowpark_args)
2904
2971
  result_type = LongType()
2905
2972
  case "hex":
@@ -2934,6 +3001,14 @@ def map_unresolved_function(
2934
3001
  result_type = StringType()
2935
3002
  case "histogram_numeric":
2936
3003
  aggregate_input_typ = snowpark_typed_args[0].typ
3004
+
3005
+ if isinstance(aggregate_input_typ, DecimalType):
3006
+ # Mimic a bug from Spark 3.5.3.
3007
+ # It is fixed in 3.5.5, where this exception should no longer be thrown.
3008
+ raise ValueError(
3009
+ "class org.apache.spark.sql.types.Decimal cannot be cast to class java.lang.Number (org.apache.spark.sql.types.Decimal is in unnamed module of loader 'app'; java.lang.Number is in module java.base of loader 'bootstrap')"
3010
+ )
3011
+
2937
3012
  histogram_return_type = ArrayType(
2938
3013
  StructType(
2939
3014
  [
@@ -3154,6 +3229,18 @@ def map_unresolved_function(
3154
3229
  )
3155
3230
  result_type = histogram_return_type
3156
3231
  case "hll_sketch_agg":
3232
+ # check if input type is correct
3233
+ if type(snowpark_typed_args[0].typ) not in [
3234
+ IntegerType,
3235
+ LongType,
3236
+ StringType,
3237
+ BinaryType,
3238
+ ]:
3239
+ type_str = snowpark_typed_args[0].typ.simpleString().upper()
3240
+ raise AnalysisException(
3241
+ f'[DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "{spark_function_name}" due to data type mismatch: Parameter 1 requires the ("INT" or "BIGINT" or "STRING" or "BINARY") type, however "{snowpark_arg_names[0]}" has the type "{type_str}".'
3242
+ )
3243
+
3157
3244
  match snowpark_args:
3158
3245
  case [sketch]:
3159
3246
  spark_function_name = (
@@ -3173,7 +3260,7 @@ def map_unresolved_function(
3173
3260
  ).cast(LongType())
3174
3261
  result_type = LongType()
3175
3262
  case "hll_union_agg":
3176
- raise_error = _raise_error_udf_helper(BinaryType())
3263
+ raise_error = _raise_error_helper(BinaryType())
3177
3264
  args = exp.unresolved_function.arguments
3178
3265
  allow_different_lgConfigK = len(args) == 2 and unwrap_literal(args[1])
3179
3266
  spark_function_name = f"{function_name}({snowpark_arg_names[0]}, {str(allow_different_lgConfigK).lower()})"
@@ -3213,7 +3300,7 @@ def map_unresolved_function(
3213
3300
  SELECT arg1 as x)
3214
3301
  """,
3215
3302
  )
3216
- raise_error = _raise_error_udf_helper(BinaryType())
3303
+ raise_error = _raise_error_helper(BinaryType())
3217
3304
  args = exp.unresolved_function.arguments
3218
3305
  allow_different_lgConfigK = len(args) == 3 and unwrap_literal(args[2])
3219
3306
  spark_function_name = f"{function_name}({snowpark_arg_names[0]}, {snowpark_arg_names[1]}, {str(allow_different_lgConfigK).lower()})"
@@ -3796,12 +3883,47 @@ def map_unresolved_function(
3796
3883
  )
3797
3884
 
3798
3885
  result_type = StringType()
3799
- case "ltrim":
3886
+ case "ltrim" | "rtrim":
3887
+ function_name_argument = (
3888
+ "TRAILING" if function_name == "rtrim" else "LEADING"
3889
+ )
3800
3890
  if len(snowpark_args) == 2:
3801
3891
  # Only possible using SQL
3802
- spark_function_name = f"TRIM(LEADING {snowpark_arg_names[1]} FROM {snowpark_arg_names[0]})"
3892
+ spark_function_name = f"TRIM({function_name_argument} {snowpark_arg_names[1]} FROM {snowpark_arg_names[0]})"
3803
3893
  result_exp = snowpark_fn.ltrim(*snowpark_args)
3804
3894
  result_type = StringType()
3895
+ if isinstance(snowpark_typed_args[0].typ, BinaryType):
3896
+ argument_name = snowpark_arg_names[0]
3897
+ if exp.unresolved_function.arguments[0].HasField("literal"):
3898
+ argument_name = f"""X'{exp.unresolved_function.arguments[0].literal.binary.hex()}'"""
3899
+ if len(snowpark_args) == 1:
3900
+ spark_function_name = f"{function_name}({argument_name})"
3901
+ trim_value = snowpark_fn.lit(b"\x20")
3902
+ if len(snowpark_args) == 2:
3903
+ # Only possible using SQL
3904
+ trim_arg = snowpark_arg_names[1]
3905
+ if isinstance(
3906
+ snowpark_typed_args[1].typ, BinaryType
3907
+ ) and exp.unresolved_function.arguments[1].HasField("literal"):
3908
+ trim_arg = f"""X'{exp.unresolved_function.arguments[1].literal.binary.hex()}'"""
3909
+ trim_value = snowpark_args[1]
3910
+ else:
3911
+ trim_value = snowpark_fn.lit(None)
3912
+ function_name_argument = (
3913
+ "TRAILING" if function_name == "rtrim" else "LEADING"
3914
+ )
3915
+ spark_function_name = f"TRIM({function_name_argument} {trim_arg} FROM {argument_name})"
3916
+ result_exp = _trim_helper(
3917
+ snowpark_args[0], trim_value, snowpark_fn.lit(function_name)
3918
+ )
3919
+ result_type = BinaryType()
3920
+ else:
3921
+ if function_name == "ltrim":
3922
+ result_exp = snowpark_fn.ltrim(*snowpark_args)
3923
+ result_type = StringType()
3924
+ elif function_name == "rtrim":
3925
+ result_exp = snowpark_fn.rtrim(*snowpark_args)
3926
+ result_type = StringType()
3805
3927
  case "make_date":
3806
3928
  y = snowpark_args[0].cast(LongType())
3807
3929
  m = snowpark_args[1].cast(LongType())
@@ -3902,7 +4024,7 @@ def map_unresolved_function(
3902
4024
  snowpark_fn.is_null(snowpark_args[i]),
3903
4025
  # udf execution on XP seems to be lazy, so this should only run when there is a null key
3904
4026
  # otherwise there should be no udf env setup or execution
3905
- _raise_error_udf_helper(VariantType())(
4027
+ _raise_error_helper(VariantType())(
3906
4028
  snowpark_fn.lit(
3907
4029
  "[NULL_MAP_KEY] Cannot use null as map key."
3908
4030
  )
@@ -3964,6 +4086,14 @@ def map_unresolved_function(
3964
4086
  )
3965
4087
  result_type = MapType(key_type, value_type)
3966
4088
  case "map_contains_key":
4089
+ if isinstance(snowpark_typed_args[0].typ, NullType):
4090
+ raise AnalysisException(
4091
+ f"""[DATATYPE_MISMATCH.MAP_FUNCTION_DIFF_TYPES] Cannot resolve "map_contains_key({snowpark_arg_names[0]}, {snowpark_arg_names[1]})" due to data type mismatch: Input to `map_contains_key` should have been "MAP" followed by a value with same key type, but it's ["VOID", "INT"]."""
4092
+ )
4093
+ if isinstance(snowpark_typed_args[1].typ, NullType):
4094
+ raise AnalysisException(
4095
+ f"""[DATATYPE_MISMATCH.NULL_TYPE] Cannot resolve "map_contains_key({snowpark_arg_names[0]}, {snowpark_arg_names[1]})" due to data type mismatch: Null typed values cannot be used as arguments of `map_contains_key`."""
4096
+ )
3967
4097
  args = (
3968
4098
  [snowpark_args[1], snowpark_args[0]]
3969
4099
  if isinstance(snowpark_typed_args[0].typ, MapType)
@@ -4093,17 +4223,37 @@ def map_unresolved_function(
4093
4223
 
4094
4224
  last_win_dedup = global_config.spark_sql_mapKeyDedupPolicy == "LAST_WIN"
4095
4225
 
4096
- result_exp = snowpark_fn.cast(
4097
- snowpark_fn.function("reduce")(
4098
- snowpark_args[0],
4099
- snowpark_fn.object_construct(),
4100
- snowpark_fn.sql_expr(
4101
- # value_field is cast to variant because object_insert doesn't allow structured types,
4102
- # and structured types are not coercible to variant
4103
- # TODO: allow structured types in object_insert?
4104
- f"(acc, e) -> object_insert(acc, e:{key_field}, e:{value_field}::variant, {last_win_dedup})"
4105
- ),
4226
+ # Check if any entry has a NULL key
4227
+ has_null_key = (
4228
+ snowpark_fn.function("array_size")(
4229
+ snowpark_fn.function("filter")(
4230
+ snowpark_args[0],
4231
+ snowpark_fn.sql_expr(f"e -> e:{key_field} IS NULL"),
4232
+ )
4233
+ )
4234
+ > 0
4235
+ )
4236
+
4237
+ # Create error expression for NULL keys (same pattern as the map function)
4238
+ null_key_error = _raise_error_helper(VariantType())(
4239
+ snowpark_fn.lit("[NULL_MAP_KEY] Cannot use null as map key.")
4240
+ )
4241
+
4242
+ # Create the reduce operation
4243
+ reduce_result = snowpark_fn.function("reduce")(
4244
+ snowpark_args[0],
4245
+ snowpark_fn.object_construct(),
4246
+ snowpark_fn.sql_expr(
4247
+ # value_field is cast to variant because object_insert doesn't allow structured types,
4248
+ # and structured types are not coercible to variant
4249
+ # TODO: allow structured types in object_insert?
4250
+ f"(acc, e) -> object_insert(acc, e:{key_field}, e:{value_field}::variant, {last_win_dedup})"
4106
4251
  ),
4252
+ )
4253
+
4254
+ # Use conditional logic: if there are NULL keys, throw error; otherwise proceed with reduce
4255
+ result_exp = snowpark_fn.cast(
4256
+ snowpark_fn.when(has_null_key, null_key_error).otherwise(reduce_result),
4107
4257
  MapType(key_type, value_type),
4108
4258
  )
4109
4259
  result_type = MapType(key_type, value_type)
@@ -4122,23 +4272,35 @@ def map_unresolved_function(
4122
4272
  # TODO: implement in Snowflake/Snowpark
4123
4273
  # technically this could be done with a lateral join, but it's probably not worth the effort
4124
4274
  arg_type = snowpark_typed_args[0].typ
4125
- if not isinstance(arg_type, MapType):
4275
+ if not isinstance(arg_type, (MapType, NullType)):
4126
4276
  raise AnalysisException(
4127
4277
  f"map_values requires a MapType argument, got {arg_type}"
4128
4278
  )
4129
4279
 
4130
4280
  def _map_values(obj: dict) -> list:
4131
- return list(obj.values()) if obj else None
4281
+ if obj is None:
4282
+ return None
4283
+ return list(obj.values())
4132
4284
 
4133
4285
  map_values = cached_udf(
4134
4286
  _map_values, return_type=ArrayType(), input_types=[StructType()]
4135
4287
  )
4136
4288
 
4137
- result_exp = snowpark_fn.cast(
4138
- map_values(snowpark_fn.cast(snowpark_args[0], StructType())),
4139
- ArrayType(arg_type.value_type),
4140
- )
4141
- result_type = ArrayType(arg_type.value_type)
4289
+ # Handle NULL input directly at expression level
4290
+ if isinstance(arg_type, NullType):
4291
+ # If input is NULL literal, return NULL
4292
+ result_exp = snowpark_fn.lit(None)
4293
+ result_type = ArrayType(NullType())
4294
+ else:
4295
+ result_exp = snowpark_fn.when(
4296
+ snowpark_args[0].is_null(), snowpark_fn.lit(None)
4297
+ ).otherwise(
4298
+ snowpark_fn.cast(
4299
+ map_values(snowpark_fn.cast(snowpark_args[0], StructType())),
4300
+ ArrayType(arg_type.value_type),
4301
+ )
4302
+ )
4303
+ result_type = ArrayType(arg_type.value_type)
4142
4304
  case "mask":
4143
4305
 
4144
4306
  number_of_args = len(snowpark_args)
@@ -4258,6 +4420,17 @@ def map_unresolved_function(
4258
4420
  lambda: snowpark_typed_args[0].types,
4259
4421
  )
4260
4422
  case "md5":
4423
+ snowflake_compat = get_boolean_session_config_param(
4424
+ "enable_snowflake_extension_behavior"
4425
+ )
4426
+
4427
+ # MD5 in Spark only accepts BinaryType or types that can be implicitly cast to it (StringType)
4428
+ if not snowflake_compat:
4429
+ if not isinstance(snowpark_typed_args[0].typ, (BinaryType, StringType)):
4430
+ raise AnalysisException(
4431
+ f'[DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "md5({snowpark_arg_names[0]})" due to data type mismatch: '
4432
+ f'Parameter 1 requires the "BINARY" type, however "{snowpark_arg_names[0]}" has the type "{snowpark_typed_args[0].typ}".'
4433
+ )
4261
4434
  result_exp = snowpark_fn.md5(snowpark_args[0])
4262
4435
  result_type = StringType(32)
4263
4436
  case "median":
@@ -5032,7 +5205,7 @@ def map_unresolved_function(
5032
5205
  result_type = DoubleType()
5033
5206
  case "raise_error":
5034
5207
  result_type = StringType()
5035
- raise_error = _raise_error_udf_helper(result_type)
5208
+ raise_error = _raise_error_helper(result_type)
5036
5209
  result_exp = raise_error(*snowpark_args)
5037
5210
  case "rand" | "random":
5038
5211
  # Snowpark random() generates a 64 bit signed integer, but pyspark is [0.0, 1.0).
@@ -5117,7 +5290,7 @@ def map_unresolved_function(
5117
5290
  snowpark_args[2],
5118
5291
  ),
5119
5292
  ),
5120
- _raise_error_udf_helper(StringType())(
5293
+ _raise_error_helper(StringType())(
5121
5294
  snowpark_fn.lit(
5122
5295
  "[INVALID_PARAMETER_VALUE.REGEX_GROUP_INDEX] The value of parameter(s) `idx` in `regexp_extract` is invalid."
5123
5296
  )
@@ -5167,7 +5340,7 @@ def map_unresolved_function(
5167
5340
  idx,
5168
5341
  )
5169
5342
  ),
5170
- _raise_error_udf_helper(ArrayType(StringType()))(
5343
+ _raise_error_helper(ArrayType(StringType()))(
5171
5344
  snowpark_fn.lit(
5172
5345
  "[INVALID_PARAMETER_VALUE.REGEX_GROUP_INDEX] The value of parameter(s) `idx` in `regexp_extract_all` is invalid."
5173
5346
  )
@@ -5466,13 +5639,28 @@ def map_unresolved_function(
5466
5639
  case "row_number":
5467
5640
  result_exp = snowpark_fn.row_number()
5468
5641
  result_exp = TypedColumn(result_exp, lambda: [LongType()])
5469
- case "rtrim":
5470
- if len(snowpark_args) == 2:
5471
- # Only possible using SQL
5472
- spark_function_name = f"TRIM(TRAILING {snowpark_arg_names[1]} FROM {snowpark_arg_names[0]})"
5473
- result_exp = snowpark_fn.rtrim(*snowpark_args)
5474
- result_type = StringType()
5475
5642
  case "schema_of_csv":
5643
+ # Validate that the input is a foldable STRING expression
5644
+ if (
5645
+ exp.unresolved_function.arguments[0].WhichOneof("expr_type")
5646
+ != "literal"
5647
+ ):
5648
+ raise AnalysisException(
5649
+ "[DATATYPE_MISMATCH.NON_FOLDABLE_INPUT] Cannot resolve "
5650
+ f'"schema_of_csv({snowpark_arg_names[0]})" due to data type mismatch: '
5651
+ 'the input csv should be a foldable "STRING" expression; however, '
5652
+ f'got "{snowpark_arg_names[0]}".'
5653
+ )
5654
+
5655
+ if isinstance(snowpark_typed_args[0].typ, StringType):
5656
+ if exp.unresolved_function.arguments[0].literal.string == "":
5657
+ raise AnalysisException(
5658
+ "[DATATYPE_MISMATCH.NON_FOLDABLE_INPUT] Cannot resolve "
5659
+ f'"schema_of_csv({snowpark_arg_names[0]})" due to data type mismatch: '
5660
+ 'the input csv should be a foldable "STRING" expression; however, '
5661
+ f'got "{snowpark_arg_names[0]}".'
5662
+ )
5663
+
5476
5664
  snowpark_args = [
5477
5665
  typed_arg.column(to_semi_structure=True)
5478
5666
  for typed_arg in snowpark_typed_args
@@ -5689,6 +5877,16 @@ def map_unresolved_function(
5689
5877
  )
5690
5878
  result_type = ArrayType(ArrayType(StringType()))
5691
5879
  case "sequence":
5880
+ if snowpark_typed_args[0].typ != snowpark_typed_args[1].typ or (
5881
+ not isinstance(snowpark_typed_args[0].typ, _IntegralType)
5882
+ or not isinstance(snowpark_typed_args[1].typ, _IntegralType)
5883
+ ):
5884
+ raise AnalysisException(
5885
+ f"""[DATATYPE_MISMATCH.SEQUENCE_WRONG_INPUT_TYPES] Cannot resolve "sequence({snowpark_arg_names[0]}, {snowpark_arg_names[1]})" due to data type mismatch: `sequence` uses the wrong parameter type. The parameter type must conform to:
5886
+ 1. The start and stop expressions must resolve to the same type.
5887
+ 2. Otherwise, if start and stop expressions resolve to the "INTEGRAL" type, then the step expression must resolve to the same type.
5888
+ """
5889
+ )
5692
5890
  result_exp = snowpark_fn.cast(
5693
5891
  snowpark_fn.sequence(*snowpark_args),
5694
5892
  ArrayType(LongType(), contains_null=False),
@@ -5856,7 +6054,7 @@ def map_unresolved_function(
5856
6054
  result_exp = snowpark_fn.skew(snowpark_fn.lit(None))
5857
6055
  result_type = DoubleType()
5858
6056
  case "slice":
5859
- raise_error = _raise_error_udf_helper(snowpark_typed_args[0].typ)
6057
+ raise_error = _raise_error_helper(snowpark_typed_args[0].typ)
5860
6058
  spark_index = snowpark_args[1]
5861
6059
  arr_size = snowpark_fn.array_size(snowpark_args[0])
5862
6060
  slice_len = snowpark_args[2]
@@ -5926,10 +6124,11 @@ def map_unresolved_function(
5926
6124
  result_exp = snowpark_fn.lit(0)
5927
6125
  result_type = LongType()
5928
6126
  case "split":
6127
+ result_type = ArrayType(StringType())
5929
6128
 
5930
6129
  @cached_udf(
5931
6130
  input_types=[StringType(), StringType(), IntegerType()],
5932
- return_type=ArrayType(StringType()),
6131
+ return_type=result_type,
5933
6132
  )
5934
6133
  def _split(
5935
6134
  input: Optional[str], pattern: Optional[str], limit: Optional[int]
@@ -5937,34 +6136,80 @@ def map_unresolved_function(
5937
6136
  if input is None or pattern is None:
5938
6137
  return None
5939
6138
 
6139
+ import re
6140
+
6141
+ try:
6142
+ re.compile(pattern)
6143
+ except re.error:
6144
+ raise ValueError(
6145
+ f"Failed to split string, provided pattern: {pattern} is invalid"
6146
+ )
6147
+
5940
6148
  if limit == 1:
5941
6149
  return [input]
5942
6150
 
5943
- import re
6151
+ if not input:
6152
+ return []
5944
6153
 
5945
6154
  # A default of -1 is passed in PySpark, but RE needs it to be 0 to provide all splits.
5946
6155
  # In PySpark, the limit also indicates the max size of the resulting array, but in RE
5947
6156
  # the remainder is returned as another element.
5948
6157
  maxsplit = limit - 1 if limit > 0 else 0
5949
6158
 
5950
- split_result = re.split(pattern, input, maxsplit)
5951
6159
  if len(pattern) == 0:
5952
- # RE.split provides a first and last empty element that is not there in PySpark.
5953
- split_result = split_result[1 : len(split_result) - 1]
6160
+ return list(input) if limit <= 0 else list(input)[:limit]
6161
+
6162
+ match pattern:
6163
+ case "|":
6164
+ split_result = re.split(pattern, input, 0)
6165
+ input_limit = limit + 1 if limit > 0 else len(split_result)
6166
+ return (
6167
+ split_result
6168
+ if input_limit == 0
6169
+ else split_result[1:input_limit]
6170
+ )
6171
+ case "$":
6172
+ return [input, ""] if maxsplit >= 0 else [input]
6173
+ case "^":
6174
+ return [input]
6175
+ case _:
6176
+ return re.split(pattern, input, maxsplit)
6177
+
6178
+ def split_string(str_: Column, pattern: Column, limit: Column):
6179
+ native_split = _split(str_, pattern, limit)
6180
+ # When pattern is a literal and doesn't contain any regex special characters
6181
+ # And when limit is less than or equal to 0
6182
+ # Native Snowflake Split function is used to optimise performance
6183
+ if isinstance(pattern._expression, Literal):
6184
+ pattern_value = pattern._expression.value
6185
+
6186
+ if pattern_value is None:
6187
+ return snowpark_fn.lit(None)
6188
+
6189
+ is_regexp = re.match(
6190
+ ".*[\\[\\.\\]\\*\\?\\+\\^\\$\\{\\}\\|\\(\\)\\\\].*",
6191
+ pattern_value,
6192
+ )
6193
+ is_empty = len(pattern_value) == 0
6194
+
6195
+ if not is_empty and not is_regexp:
6196
+ return snowpark_fn.when(
6197
+ limit <= 0,
6198
+ snowpark_fn.split(str_, pattern).cast(result_type),
6199
+ ).otherwise(native_split)
5954
6200
 
5955
- return split_result
6201
+ return native_split
5956
6202
 
5957
6203
  match snowpark_args:
5958
6204
  case [str_, pattern]:
5959
6205
  spark_function_name = (
5960
6206
  f"split({snowpark_arg_names[0]}, {snowpark_arg_names[1]}, -1)"
5961
6207
  )
5962
- result_exp = _split(str_, pattern, snowpark_fn.lit(0))
6208
+ result_exp = split_string(str_, pattern, snowpark_fn.lit(-1))
5963
6209
  case [str_, pattern, limit]: # noqa: F841
5964
- result_exp = _split(str_, pattern, limit)
6210
+ result_exp = split_string(str_, pattern, limit)
5965
6211
  case _:
5966
6212
  raise ValueError(f"Invalid number of arguments to {function_name}")
5967
- result_type = ArrayType(StringType())
5968
6213
  case "split_part":
5969
6214
  result_exp = snowpark_fn.call_function("split_part", *snowpark_args)
5970
6215
  result_type = StringType()
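
A hedged Snowpark sketch (assuming a Session named session) of the fast path taken by split_string above: when the delimiter is a plain literal with no regex metacharacters and limit <= 0, Snowflake's native SPLIT can be used instead of the regex-based Python UDF.

import snowflake.snowpark.functions as snowpark_fn
from snowflake.snowpark.types import ArrayType, StringType

df = session.create_dataframe([["a,b,c"]], schema=["s"])
# Native SPLIT returns an ARRAY; the cast matches the ArrayType(StringType()) result type.
parts = snowpark_fn.split(df["s"], snowpark_fn.lit(",")).cast(ArrayType(StringType()))
df.select(parts.alias("parts")).show()
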
@@ -6274,6 +6519,10 @@ def map_unresolved_function(
6274
6519
  )
6275
6520
  result_type = TimestampType(snowpark.types.TimestampTimeZone.NTZ)
6276
6521
  case "timestamp_millis":
6522
+ if not isinstance(snowpark_typed_args[0].typ, _IntegralType):
6523
+ raise AnalysisException(
6524
+ f'[DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "timestamp_millis({snowpark_arg_names[0]}" due to data type mismatch: Parameter 1 requires the "INTEGRAL" type, however "{snowpark_arg_names[0]}" has the type "{snowpark_typed_args[0].typ}".'
6525
+ )
6277
6526
  result_exp = snowpark_fn.cast(
6278
6527
  snowpark_fn.to_timestamp(snowpark_args[0] * 1_000, 6),
6279
6528
  TimestampType(snowpark.types.TimestampTimeZone.NTZ),
@@ -6283,6 +6532,10 @@ def map_unresolved_function(
6283
6532
  # Spark allows seconds to be fractional. Snowflake does not allow that
6284
6533
  # even though the documentation explicitly says that it does.
6285
6534
  # As a workaround, use integer milliseconds instead of fractional seconds.
6535
+ if not isinstance(snowpark_typed_args[0].typ, _NumericType):
6536
+ raise AnalysisException(
6537
+ f"""AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "{function_name}({snowpark_arg_names[0]})" due to data type mismatch: Parameter 1 requires the "NUMERIC" type, however "{snowpark_arg_names[0]}" has the type "{snowpark_typed_args[0].typ}".;"""
6538
+ )
6286
6539
  result_exp = snowpark_fn.cast(
6287
6540
  snowpark_fn.to_timestamp(
6288
6541
  snowpark_fn.cast(snowpark_args[0] * 1_000_000, LongType()), 6
@@ -6725,7 +6978,20 @@ def map_unresolved_function(
6725
6978
  result_type = StringType()
6726
6979
  case "trunc":
6727
6980
  part = unwrap_literal(exp.unresolved_function.arguments[1])
6728
- if part is None:
6981
+ part = None if part is None else part.lower()
6982
+
6983
+ allowed_parts = {
6984
+ "year",
6985
+ "yyyy",
6986
+ "yy",
6987
+ "month",
6988
+ "mon",
6989
+ "mm",
6990
+ "week",
6991
+ "quarter",
6992
+ }
6993
+
6994
+ if part not in allowed_parts:
6729
6995
  result_exp = snowpark_fn.lit(None)
6730
6996
  else:
6731
6997
  result_exp = _try_to_cast(
@@ -7116,6 +7382,12 @@ def map_unresolved_function(
7116
7382
  )
7117
7383
  )
7118
7384
  )
7385
+ raise_fn = _raise_error_helper(BinaryType(), IllegalArgumentException)
7386
+ result_exp = (
7387
+ snowpark_fn.when(unbase_arg.is_null(), snowpark_fn.lit(None))
7388
+ .when(result_exp.is_null(), raise_fn(snowpark_fn.lit("Invalid input")))
7389
+ .otherwise(result_exp)
7390
+ )
7119
7391
  result_type = BinaryType()
7120
7392
  case "unhex":
7121
7393
  # Non string columns, convert them to string type. This mimics pyspark behavior.
@@ -7316,6 +7588,15 @@ def map_unresolved_function(
7316
7588
  )
7317
7589
  result_type = LongType()
7318
7590
  case "when" | "if":
7591
+ # Validate that the condition is a boolean expression
7592
+ if len(snowpark_typed_args) > 0:
7593
+ condition_type = snowpark_typed_args[0].typ
7594
+ if not isinstance(condition_type, BooleanType):
7595
+ raise AnalysisException(
7596
+ f"[DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve CASE WHEN condition due to data type mismatch: "
7597
+ f"Parameter 1 requires the 'BOOLEAN' type, however got '{condition_type}'"
7598
+ )
7599
+
7319
7600
  name_components = ["CASE"]
7320
7601
  name_components.append("WHEN")
7321
7602
  name_components.append(snowpark_arg_names[0])
@@ -7334,6 +7615,13 @@ def map_unresolved_function(
7334
7615
  name_components.append(snowpark_arg_names[i])
7335
7616
  name_components.append("THEN")
7336
7617
  name_components.append(snowpark_arg_names[i + 1])
7618
+ # Validate each WHEN condition
7619
+ condition_type = snowpark_typed_args[i].typ
7620
+ if not isinstance(condition_type, BooleanType):
7621
+ raise AnalysisException(
7622
+ f"[DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve CASE WHEN condition due to data type mismatch: "
7623
+ f"Parameter {i + 1} requires the 'BOOLEAN' type, however got '{condition_type}'"
7624
+ )
7337
7625
  result_exp = result_exp.when(snowpark_args[i], snowpark_args[i + 1])
7338
7626
  result_type_indexes.append(i + 1)
7339
7627
  name_components.append("END")
@@ -7710,16 +7998,8 @@ def _handle_current_timestamp():
7710
7998
 
7711
7999
 
7712
8000
  def _equivalent_decimal(type):
7713
- match (type):
7714
- case ByteType():
7715
- return DecimalType(3, 0)
7716
- case ShortType():
7717
- return DecimalType(5, 0)
7718
- case IntegerType():
7719
- return DecimalType(10, 0)
7720
- case LongType():
7721
- return DecimalType(20, 0)
7722
- return DecimalType(38, 0)
8001
+ (precision, scale) = _get_type_precision(type)
8002
+ return DecimalType(precision, scale)
7723
8003
 
7724
8004
 
7725
8005
  def _resolve_decimal_and_numeric(type1: DecimalType, type2: _NumericType) -> DataType:
@@ -8778,7 +9058,9 @@ def _get_type_precision(typ: DataType) -> tuple[int, int]:
8778
9058
  case IntegerType():
8779
9059
  return 10, 0 # -2147483648 to 2147483647
8780
9060
  case LongType():
8781
- return 19, 0 # -9223372036854775808 to 9223372036854775807
9061
+ return 20, 0 # -9223372036854775808 to 9223372036854775807
9062
+ case NullType():
9063
+ return 6, 2 # NULL
8782
9064
  case _:
8783
9065
  return 38, 0 # Default to maximum precision for other types
8784
9066
 
@@ -8993,16 +9275,12 @@ def _try_arithmetic_helper(
8993
9275
  typed_args[1].typ, DecimalType
8994
9276
  ):
8995
9277
  new_scale = s2
8996
- new_precision = (
8997
- p1 + s2 + 1
8998
- ) # Integral precision + decimal scale + 1 for carry
9278
+ new_precision = max(p2, p1 + s2)
8999
9279
  elif isinstance(typed_args[0].typ, DecimalType) and isinstance(
9000
9280
  typed_args[1].typ, _IntegralType
9001
9281
  ):
9002
9282
  new_scale = s1
9003
- new_precision = (
9004
- p2 + s1 + 1
9005
- ) # Integral precision + decimal scale + 1 for carry
9283
+ new_precision = max(p1, p2 + s1)
9006
9284
  else:
9007
9285
  # Both decimal types
9008
9286
  if operation_type == 1 and s1 == s2: # subtraction with matching scales
@@ -9081,13 +9359,13 @@ def _add_sub_precision_helper(
9081
9359
  typed_args[1].typ, DecimalType
9082
9360
  ):
9083
9361
  new_scale = s2
9084
- new_precision = p1 + s2 + 1 # Integral precision + decimal scale + 1 for carry
9362
+ new_precision = max(p2, p1 + s2)
9085
9363
  return_type_precision, return_type_scale = new_precision, new_scale
9086
9364
  elif isinstance(typed_args[0].typ, DecimalType) and isinstance(
9087
9365
  typed_args[1].typ, _IntegralType
9088
9366
  ):
9089
9367
  new_scale = s1
9090
- new_precision = p2 + s1 + 1 # Integral precision + decimal scale + 1 for carry
9368
+ new_precision = max(p1, p2 + s1)
9091
9369
  return_type_precision, return_type_scale = new_precision, new_scale
9092
9370
  else:
9093
9371
  (
@@ -9169,11 +9447,25 @@ def _mul_div_precision_helper(
9169
9447
  )
9170
9448
 
9171
9449
 
9172
- def _raise_error_udf_helper(return_type: DataType):
9173
- def _raise_error(message=None):
9174
- raise ValueError(message)
9450
+ def _raise_error_helper(return_type: DataType, error_class=None):
9451
+ error_class = (
9452
+ f":{error_class.__name__}"
9453
+ if error_class and hasattr(error_class, "__name__")
9454
+ else ""
9455
+ )
9456
+
9457
+ def _raise_fn(*msgs: Column) -> Column:
9458
+ return snowpark_fn.cast(
9459
+ snowpark_fn.abs(
9460
+ snowpark_fn.concat(
9461
+ snowpark_fn.lit(f"[snowpark-connect-exception{error_class}]"),
9462
+ *(msg.try_cast(StringType()) for msg in msgs),
9463
+ )
9464
+ ).cast(StringType()),
9465
+ return_type,
9466
+ )
9175
9467
 
9176
- return cached_udf(_raise_error, return_type=return_type, input_types=[StringType()])
9468
+ return _raise_fn
9177
9469
 
9178
9470
 
9179
9471
  def _divnull(dividend: Column, divisor: Column) -> Column:
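
A hedged standalone sketch of the column-level raise trick that _raise_error_helper uses in place of the old UDF: ABS over a non-numeric string only fails when the expression is actually evaluated, so the tagged message surfaces lazily for rows that reach the error branch of a CASE/WHEN, with no UDF setup cost.

import snowflake.snowpark.functions as snowpark_fn
from snowflake.snowpark.types import DataType, StringType

def raise_error_column(message: str, return_type: DataType = StringType()):
    # Concatenate a recognizable tag with the message, force a runtime failure via ABS,
    # and cast back so the expression stays type-compatible with sibling WHEN branches.
    tagged = snowpark_fn.concat(
        snowpark_fn.lit("[snowpark-connect-exception]"), snowpark_fn.lit(message)
    )
    return snowpark_fn.cast(snowpark_fn.abs(tagged).cast(StringType()), return_type)
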
@@ -9448,3 +9740,22 @@ def _validate_number_format_string(format_str: str) -> None:
9448
9740
  raise AnalysisException(
9449
9741
  f"[INVALID_FORMAT.WRONG_NUM_DIGIT] The format is invalid: '{format_str}'. The format string requires at least one number digit."
9450
9742
  )
9743
+
9744
+
9745
+ def _trim_helper(value: Column, trim_value: Column, trim_type: Column) -> Column:
9746
+ @cached_udf(
9747
+ return_type=BinaryType(),
9748
+ input_types=[BinaryType(), BinaryType(), StringType()],
9749
+ )
9750
+ def _binary_trim_udf(value: bytes, trim_value: bytes, trim_type: str) -> bytes:
9751
+ if value is None or trim_value is None:
9752
+ return value
9753
+ if trim_type in ("rtrim", "btrim", "trim"):
9754
+ while value.endswith(trim_value):
9755
+ value = value[: -len(trim_value)]
9756
+ if trim_type in ("ltrim", "btrim", "trim"):
9757
+ while value.startswith(trim_value):
9758
+ value = value[len(trim_value) :]
9759
+ return value
9760
+
9761
+ return _binary_trim_udf(value, trim_value, trim_type)