snowpark-connect 0.21.0__py3-none-any.whl → 0.22.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/config.py +19 -3
- snowflake/snowpark_connect/error/error_utils.py +25 -0
- snowflake/snowpark_connect/expression/map_udf.py +4 -4
- snowflake/snowpark_connect/expression/map_unresolved_function.py +203 -128
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +4 -0
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +4 -0
- snowflake/snowpark_connect/relation/map_aggregate.py +102 -18
- snowflake/snowpark_connect/relation/map_column_ops.py +21 -2
- snowflake/snowpark_connect/relation/map_map_partitions.py +3 -1
- snowflake/snowpark_connect/relation/map_sql.py +18 -191
- snowflake/snowpark_connect/relation/map_udtf.py +4 -4
- snowflake/snowpark_connect/relation/read/map_read_json.py +12 -1
- snowflake/snowpark_connect/relation/read/reader_config.py +1 -0
- snowflake/snowpark_connect/relation/write/map_write.py +68 -24
- snowflake/snowpark_connect/server.py +9 -0
- snowflake/snowpark_connect/type_mapping.py +4 -0
- snowflake/snowpark_connect/utils/describe_query_cache.py +2 -9
- snowflake/snowpark_connect/utils/session.py +0 -4
- snowflake/snowpark_connect/utils/telemetry.py +213 -61
- snowflake/snowpark_connect/utils/udxf_import_utils.py +14 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/__init__.py +0 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.py +36 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.pyi +156 -0
- snowflake/snowpark_decoder/dp_session.py +111 -0
- snowflake/snowpark_decoder/spark_decoder.py +76 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.22.1.dist-info}/METADATA +2 -2
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.22.1.dist-info}/RECORD +40 -29
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.22.1.dist-info}/top_level.txt +1 -0
- spark/__init__.py +0 -0
- spark/connect/__init__.py +0 -0
- spark/connect/envelope_pb2.py +31 -0
- spark/connect/envelope_pb2.pyi +46 -0
- snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
- {snowpark_connect-0.21.0.data → snowpark_connect-0.22.1.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.21.0.data → snowpark_connect-0.22.1.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.21.0.data → snowpark_connect-0.22.1.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.22.1.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.22.1.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.22.1.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.22.1.dist-info}/licenses/NOTICE-binary +0 -0
snowflake/snowpark_connect/expression/map_unresolved_function.py

@@ -28,6 +28,7 @@ from google.protobuf.message import Message
 from pyspark.errors.exceptions.base import (
     AnalysisException,
     ArithmeticException,
+    ArrayIndexOutOfBoundsException,
     DateTimeException,
     IllegalArgumentException,
     NumberFormatException,
@@ -39,6 +40,7 @@ from pyspark.sql.types import _parse_datatype_json_string
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
 from snowflake.snowpark import Column, Session
+from snowflake.snowpark._internal.analyzer.expression import Literal
 from snowflake.snowpark._internal.analyzer.unary_expression import Alias
 from snowflake.snowpark.types import (
     ArrayType,
@@ -139,7 +141,7 @@ from snowflake.snowpark_connect.utils.xxhash64 import (
 MAX_UINT64 = 2**64 - 1
 MAX_INT64 = 2**63 - 1
 MIN_INT64 = -(2**63)
-
+MAX_ARRAY_SIZE = 2_147_483_647
 
 NAN, INFINITY = float("nan"), float("inf")
 
@@ -638,37 +640,22 @@ def map_unresolved_function(
                 [arg.typ for arg in snowpark_typed_args]
             )
         case "/":
-
-
-
-
-
-
-
-
-
-                snowpark_args
-
-
-
-
-
-
-                and isinstance(snowpark_typed_args[1].typ, _IntegralType)
-                or isinstance(snowpark_typed_args[0].typ, _IntegralType)
-                and isinstance(snowpark_typed_args[1].typ, DecimalType)
-            ):
-                result_exp, (
-                    return_type_precision,
-                    return_type_scale,
-                ) = _mul_div_precision_helper(snowpark_typed_args, snowpark_args, 1)
-                result_type = DecimalType(return_type_precision, return_type_scale)
-            else:
-                # Perform division directly
-                result_exp = _divnull(snowpark_args[0], snowpark_args[1])
-                result_type = _find_common_type(
-                    [arg.typ for arg in snowpark_typed_args]
-                )
+            match (snowpark_typed_args[0].typ, snowpark_typed_args[1].typ):
+                case (DecimalType(), t) | (t, DecimalType()) if isinstance(
+                    t, DecimalType
+                ) or isinstance(t, _IntegralType) or isinstance(
+                    snowpark_typed_args[1].typ, NullType
+                ):
+                    result_exp, (
+                        return_type_precision,
+                        return_type_scale,
+                    ) = _mul_div_precision_helper(snowpark_typed_args, snowpark_args, 1)
+                    result_type = DecimalType(return_type_precision, return_type_scale)
+                case _:
+                    result_type = DoubleType()
+                    dividend = snowpark_args[0].cast(result_type)
+                    divisor = snowpark_args[1].cast(result_type)
+                    result_exp = _divnull(dividend, divisor)
         case "~":
             result_exp = TypedColumn(
                 snowpark_fn.bitnot(snowpark_args[0]),
@@ -1201,35 +1188,18 @@ def map_unresolved_function(
                 snowpark_fn.asinh(snowpark_args[0]), lambda: [DoubleType()]
             )
         case "assert_true":
+            result_type = NullType()
+            raise_error = _raise_error_helper(result_type)
 
-            @cached_udf(
-                input_types=[BooleanType()],
-                return_type=StringType(),
-            )
-            def _assert_true_single(expr):
-                if not expr:
-                    raise ValueError("assertion failed")
-                return None
-
-            @cached_udf(
-                input_types=[BooleanType(), StringType()],
-                return_type=StringType(),
-            )
-            def _assert_true_with_message(expr, message):
-                if not expr:
-                    raise ValueError(message)
-                return None
-
-            # Handle different argument counts using match pattern
             match snowpark_args:
                 case [expr]:
-                    result_exp =
-
-                    )
+                    result_exp = snowpark_fn.when(
+                        expr, snowpark_fn.lit(None)
+                    ).otherwise(raise_error(snowpark_fn.lit("assertion failed")))
                 case [expr, message]:
-                    result_exp =
-
-                    )
+                    result_exp = snowpark_fn.when(
+                        expr, snowpark_fn.lit(None)
+                    ).otherwise(raise_error(snowpark_fn.cast(message, StringType())))
                 case _:
                     raise AnalysisException(
                         f"[WRONG_NUM_ARGS.WITHOUT_SUGGESTION] The `assert_true` requires 1 or 2 parameters but the actual number is {len(snowpark_args)}."
@@ -2291,31 +2261,32 @@ def map_unresolved_function(
             )
         case "elt":
             n = snowpark_args[0]
-
             values = snowpark_fn.array_construct(*snowpark_args[1:])
 
             if spark_sql_ansi_enabled:
-
-
-                    input_types=[IntegerType()],
-                    return_type=StringType(),
+                raise_error = _raise_error_helper(
+                    StringType(), error_class=ArrayIndexOutOfBoundsException
                 )
-                def _raise_out_of_bounds_error(n: int) -> str:
-                    raise ValueError(
-                        f"ArrayIndexOutOfBoundsException: {n} is not within the input bounds."
-                    )
-
                 values_size = snowpark_fn.lit(len(snowpark_args) - 1)
 
                 result_exp = (
                     snowpark_fn.when(snowpark_fn.is_null(n), snowpark_fn.lit(None))
                     .when(
                         (snowpark_fn.lit(1) <= n) & (n <= values_size),
-                        snowpark_fn.
-
+                        snowpark_fn.cast(
+                            snowpark_fn.get(
+                                values, snowpark_fn.nvl(n - 1, snowpark_fn.lit(0))
+                            ),
+                            StringType(),
                         ),
                     )
-                    .otherwise(
+                    .otherwise(
+                        raise_error(
+                            snowpark_fn.lit("[INVALID_ARRAY_INDEX] The index "),
+                            snowpark_fn.cast(n, StringType()),
+                            snowpark_fn.lit(" is out of bounds."),
+                        )
+                    )
                 )
             else:
                 result_exp = snowpark_fn.when(
@@ -3289,7 +3260,7 @@ def map_unresolved_function(
             ).cast(LongType())
             result_type = LongType()
         case "hll_union_agg":
-            raise_error =
+            raise_error = _raise_error_helper(BinaryType())
             args = exp.unresolved_function.arguments
             allow_different_lgConfigK = len(args) == 2 and unwrap_literal(args[1])
             spark_function_name = f"{function_name}({snowpark_arg_names[0]}, {str(allow_different_lgConfigK).lower()})"
@@ -3329,7 +3300,7 @@ def map_unresolved_function(
                 SELECT arg1 as x)
                 """,
             )
-            raise_error =
+            raise_error = _raise_error_helper(BinaryType())
             args = exp.unresolved_function.arguments
             allow_different_lgConfigK = len(args) == 3 and unwrap_literal(args[2])
             spark_function_name = f"{function_name}({snowpark_arg_names[0]}, {snowpark_arg_names[1]}, {str(allow_different_lgConfigK).lower()})"
@@ -4053,7 +4024,7 @@ def map_unresolved_function(
                     snowpark_fn.is_null(snowpark_args[i]),
                     # udf execution on XP seems to be lazy, so this should only run when there is a null key
                     # otherwise there should be no udf env setup or execution
-
+                    _raise_error_helper(VariantType())(
                         snowpark_fn.lit(
                             "[NULL_MAP_KEY] Cannot use null as map key."
                         )
@@ -4115,6 +4086,14 @@ def map_unresolved_function(
             )
             result_type = MapType(key_type, value_type)
         case "map_contains_key":
+            if isinstance(snowpark_typed_args[0].typ, NullType):
+                raise AnalysisException(
+                    f"""[DATATYPE_MISMATCH.MAP_FUNCTION_DIFF_TYPES] Cannot resolve "map_contains_key({snowpark_arg_names[0]}, {snowpark_arg_names[1]})" due to data type mismatch: Input to `map_contains_key` should have been "MAP" followed by a value with same key type, but it's ["VOID", "INT"]."""
+                )
+            if isinstance(snowpark_typed_args[1].typ, NullType):
+                raise AnalysisException(
+                    f"""[DATATYPE_MISMATCH.NULL_TYPE] Cannot resolve "map_contains_key({snowpark_arg_names[0]}, {snowpark_arg_names[1]})" due to data type mismatch: Null typed values cannot be used as arguments of `map_contains_key`."""
+                )
             args = (
                 [snowpark_args[1], snowpark_args[0]]
                 if isinstance(snowpark_typed_args[0].typ, MapType)
@@ -4244,17 +4223,37 @@ def map_unresolved_function(
 
             last_win_dedup = global_config.spark_sql_mapKeyDedupPolicy == "LAST_WIN"
 
-
-
-
-            snowpark_fn.
-
-
-
-
-
-
+            # Check if any entry has a NULL key
+            has_null_key = (
+                snowpark_fn.function("array_size")(
+                    snowpark_fn.function("filter")(
+                        snowpark_args[0],
+                        snowpark_fn.sql_expr(f"e -> e:{key_field} IS NULL"),
+                    )
+                )
+                > 0
+            )
+
+            # Create error UDF for NULL keys (same pattern as map function)
+            null_key_error = _raise_error_helper(VariantType())(
+                snowpark_fn.lit("[NULL_MAP_KEY] Cannot use null as map key.")
+            )
+
+            # Create the reduce operation
+            reduce_result = snowpark_fn.function("reduce")(
+                snowpark_args[0],
+                snowpark_fn.object_construct(),
+                snowpark_fn.sql_expr(
+                    # value_field is cast to variant because object_insert doesn't allow structured types,
+                    # and structured types are not coercible to variant
+                    # TODO: allow structured types in object_insert?
+                    f"(acc, e) -> object_insert(acc, e:{key_field}, e:{value_field}::variant, {last_win_dedup})"
                 ),
+            )
+
+            # Use conditional logic: if there are NULL keys, throw error; otherwise proceed with reduce
+            result_exp = snowpark_fn.cast(
+                snowpark_fn.when(has_null_key, null_key_error).otherwise(reduce_result),
                 MapType(key_type, value_type),
             )
             result_type = MapType(key_type, value_type)
@@ -4273,23 +4272,35 @@ def map_unresolved_function(
             # TODO: implement in Snowflake/Snowpark
             # technically this could be done with a lateral join, but it's probably not worth the effort
             arg_type = snowpark_typed_args[0].typ
-            if not isinstance(arg_type, MapType):
+            if not isinstance(arg_type, (MapType, NullType)):
                 raise AnalysisException(
                     f"map_values requires a MapType argument, got {arg_type}"
                 )
 
             def _map_values(obj: dict) -> list:
-
+                if obj is None:
+                    return None
+                return list(obj.values())
 
             map_values = cached_udf(
                 _map_values, return_type=ArrayType(), input_types=[StructType()]
             )
 
-
-
-
-
-
+            # Handle NULL input directly at expression level
+            if isinstance(arg_type, NullType):
+                # If input is NULL literal, return NULL
+                result_exp = snowpark_fn.lit(None)
+                result_type = ArrayType(NullType())
+            else:
+                result_exp = snowpark_fn.when(
+                    snowpark_args[0].is_null(), snowpark_fn.lit(None)
+                ).otherwise(
+                    snowpark_fn.cast(
+                        map_values(snowpark_fn.cast(snowpark_args[0], StructType())),
+                        ArrayType(arg_type.value_type),
+                    )
+                )
+                result_type = ArrayType(arg_type.value_type)
         case "mask":
 
             number_of_args = len(snowpark_args)
@@ -5194,7 +5205,7 @@ def map_unresolved_function(
             result_type = DoubleType()
         case "raise_error":
             result_type = StringType()
-            raise_error =
+            raise_error = _raise_error_helper(result_type)
             result_exp = raise_error(*snowpark_args)
         case "rand" | "random":
             # Snowpark random() generates a 64 bit signed integer, but pyspark is [0.0, 1.0).
@@ -5279,7 +5290,7 @@ def map_unresolved_function(
                     snowpark_args[2],
                 ),
             ),
-
+            _raise_error_helper(StringType())(
                 snowpark_fn.lit(
                     "[INVALID_PARAMETER_VALUE.REGEX_GROUP_INDEX] The value of parameter(s) `idx` in `regexp_extract` is invalid."
                 )
@@ -5329,7 +5340,7 @@ def map_unresolved_function(
                         idx,
                     )
                 ),
-
+                _raise_error_helper(ArrayType(StringType()))(
                     snowpark_fn.lit(
                         "[INVALID_PARAMETER_VALUE.REGEX_GROUP_INDEX] The value of parameter(s) `idx` in `regexp_extract_all` is invalid."
                     )
@@ -6043,7 +6054,7 @@ def map_unresolved_function(
             result_exp = snowpark_fn.skew(snowpark_fn.lit(None))
             result_type = DoubleType()
         case "slice":
-            raise_error =
+            raise_error = _raise_error_helper(snowpark_typed_args[0].typ)
             spark_index = snowpark_args[1]
             arr_size = snowpark_fn.array_size(snowpark_args[0])
             slice_len = snowpark_args[2]
@@ -6113,10 +6124,11 @@ def map_unresolved_function(
             result_exp = snowpark_fn.lit(0)
             result_type = LongType()
         case "split":
+            result_type = ArrayType(StringType())
 
             @cached_udf(
                 input_types=[StringType(), StringType(), IntegerType()],
-                return_type=
+                return_type=result_type,
             )
             def _split(
                 input: Optional[str], pattern: Optional[str], limit: Optional[int]
@@ -6124,34 +6136,80 @@ def map_unresolved_function(
                 if input is None or pattern is None:
                     return None
 
+                import re
+
+                try:
+                    re.compile(pattern)
+                except re.error:
+                    raise ValueError(
+                        f"Failed to split string, provided pattern: {pattern} is invalid"
+                    )
+
                 if limit == 1:
                     return [input]
 
-
+                if not input:
+                    return []
 
                 # A default of -1 is passed in PySpark, but RE needs it to be 0 to provide all splits.
                 # In PySpark, the limit also indicates the max size of the resulting array, but in RE
                 # the remainder is returned as another element.
                 maxsplit = limit - 1 if limit > 0 else 0
 
-                split_result = re.split(pattern, input, maxsplit)
                 if len(pattern) == 0:
-
-                    split_result = split_result[1 : len(split_result) - 1]
+                    return list(input) if limit <= 0 else list(input)[:limit]
 
-
+                match pattern:
+                    case "|":
+                        split_result = re.split(pattern, input, 0)
+                        input_limit = limit + 1 if limit > 0 else len(split_result)
+                        return (
+                            split_result
+                            if input_limit == 0
+                            else split_result[1:input_limit]
+                        )
+                    case "$":
+                        return [input, ""] if maxsplit >= 0 else [input]
+                    case "^":
+                        return [input]
+                    case _:
+                        return re.split(pattern, input, maxsplit)
+
+            def split_string(str_: Column, pattern: Column, limit: Column):
+                native_split = _split(str_, pattern, limit)
+                # When pattern is a literal and doesn't contain any regex special characters
+                # And when limit is less than or equal to 0
+                # Native Snowflake Split function is used to optimise performance
+                if isinstance(pattern._expression, Literal):
+                    pattern_value = pattern._expression.value
+
+                    if pattern_value is None:
+                        return snowpark_fn.lit(None)
+
+                    is_regexp = re.match(
+                        ".*[\\[\\.\\]\\*\\?\\+\\^\\$\\{\\}\\|\\(\\)\\\\].*",
+                        pattern_value,
+                    )
+                    is_empty = len(pattern_value) == 0
+
+                    if not is_empty and not is_regexp:
+                        return snowpark_fn.when(
+                            limit <= 0,
+                            snowpark_fn.split(str_, pattern).cast(result_type),
+                        ).otherwise(native_split)
+
+                return native_split
 
             match snowpark_args:
                 case [str_, pattern]:
                     spark_function_name = (
                         f"split({snowpark_arg_names[0]}, {snowpark_arg_names[1]}, -1)"
                     )
-                    result_exp =
+                    result_exp = split_string(str_, pattern, snowpark_fn.lit(-1))
                 case [str_, pattern, limit]:  # noqa: F841
-                    result_exp =
+                    result_exp = split_string(str_, pattern, limit)
                 case _:
                     raise ValueError(f"Invalid number of arguments to {function_name}")
-            result_type = ArrayType(StringType())
         case "split_part":
             result_exp = snowpark_fn.call_function("split_part", *snowpark_args)
             result_type = StringType()
@@ -6920,7 +6978,20 @@ def map_unresolved_function(
             result_type = StringType()
         case "trunc":
             part = unwrap_literal(exp.unresolved_function.arguments[1])
-            if part is None
+            part = None if part is None else part.lower()
+
+            allowed_parts = {
+                "year",
+                "yyyy",
+                "yy",
+                "month",
+                "mon",
+                "mm",
+                "week",
+                "quarter",
+            }
+
+            if part not in allowed_parts:
                 result_exp = snowpark_fn.lit(None)
             else:
                 result_exp = _try_to_cast(
@@ -7311,7 +7382,7 @@ def map_unresolved_function(
                     )
                 )
             )
-            raise_fn =
+            raise_fn = _raise_error_helper(BinaryType(), IllegalArgumentException)
            result_exp = (
                 snowpark_fn.when(unbase_arg.is_null(), snowpark_fn.lit(None))
                 .when(result_exp.is_null(), raise_fn(snowpark_fn.lit("Invalid input")))
@@ -7927,16 +7998,8 @@ def _handle_current_timestamp():
 
 
 def _equivalent_decimal(type):
-
-
-            return DecimalType(3, 0)
-        case ShortType():
-            return DecimalType(5, 0)
-        case IntegerType():
-            return DecimalType(10, 0)
-        case LongType():
-            return DecimalType(20, 0)
-    return DecimalType(38, 0)
+    (precision, scale) = _get_type_precision(type)
+    return DecimalType(precision, scale)
 
 
 def _resolve_decimal_and_numeric(type1: DecimalType, type2: _NumericType) -> DataType:
@@ -8995,7 +9058,9 @@ def _get_type_precision(typ: DataType) -> tuple[int, int]:
         case IntegerType():
             return 10, 0  # -2147483648 to 2147483647
         case LongType():
-            return
+            return 20, 0  # -9223372036854775808 to 9223372036854775807
+        case NullType():
+            return 6, 2  # NULL
         case _:
             return 38, 0  # Default to maximum precision for other types
 
@@ -9210,16 +9275,12 @@ def _try_arithmetic_helper(
             typed_args[1].typ, DecimalType
         ):
             new_scale = s2
-            new_precision = (
-                p1 + s2 + 1
-            )  # Integral precision + decimal scale + 1 for carry
+            new_precision = max(p2, p1 + s2)
         elif isinstance(typed_args[0].typ, DecimalType) and isinstance(
             typed_args[1].typ, _IntegralType
         ):
             new_scale = s1
-            new_precision = (
-                p2 + s1 + 1
-            )  # Integral precision + decimal scale + 1 for carry
+            new_precision = max(p1, p2 + s1)
         else:
             # Both decimal types
             if operation_type == 1 and s1 == s2:  # subtraction with matching scales
@@ -9298,13 +9359,13 @@ def _add_sub_precision_helper(
             typed_args[1].typ, DecimalType
         ):
             new_scale = s2
-            new_precision = p1 + s2
+            new_precision = max(p2, p1 + s2)
             return_type_precision, return_type_scale = new_precision, new_scale
         elif isinstance(typed_args[0].typ, DecimalType) and isinstance(
             typed_args[1].typ, _IntegralType
         ):
             new_scale = s1
-            new_precision = p2 + s1
+            new_precision = max(p1, p2 + s1)
             return_type_precision, return_type_scale = new_precision, new_scale
         else:
             (
@@ -9386,11 +9447,25 @@ def _mul_div_precision_helper(
     )
 
 
-def
-
-
+def _raise_error_helper(return_type: DataType, error_class=None):
+    error_class = (
+        f":{error_class.__name__}"
+        if error_class and hasattr(error_class, "__name__")
+        else ""
+    )
+
+    def _raise_fn(*msgs: Column) -> Column:
+        return snowpark_fn.cast(
+            snowpark_fn.abs(
+                snowpark_fn.concat(
+                    snowpark_fn.lit(f"[snowpark-connect-exception{error_class}]"),
+                    *(msg.try_cast(StringType()) for msg in msgs),
+                )
+            ).cast(StringType()),
+            return_type,
+        )
 
-    return
+    return _raise_fn
 
 
 def _divnull(dividend: Column, divisor: Column) -> Column: