PyPI - snowpark-connect - Versions diffs - 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl - Mend

snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (192) hide show

snowflake/snowpark_connect/utils/env_utils.py CHANGED Viewed

@@ -8,6 +8,8 @@ Environment variable utilities for Snowpark Connect.
 import os
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
@@ -37,9 +39,11 @@ def get_int_from_env(env_var: str, default: int) -> int:
     """
     # Validate that default is actually an integer
     if not isinstance(default, int):
-        raise TypeError(
+        exception = TypeError(
             f"Default value must be an integer, got {type(default).__name__}: {default}"
         )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
     value = os.getenv(env_var)
     if value is None:

snowflake/snowpark_connect/utils/expression_transformer.py ADDED Viewed

@@ -0,0 +1,172 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+from snowflake.snowpark import Column, functions as snowpark_fn
+from snowflake.snowpark._internal.analyzer.expression import (
+    CaseWhen,
+    Expression,
+    FunctionExpression,
+    SnowflakeUDF,
+)
+# Lower-case names of Snowflake aggregate functions. Stored as a frozenset because this
+# module only ever tests a lower-cased function name for membership in it.
+_SF_AGGREGATE_FUNCTIONS = frozenset(
+    {
+        "any_value",
+        "avg",
+        "corr",
+        "count",
+        "count_if",
+        "covar_pop",
+        "covar_samp",
+        "listagg",
+        "max",
+        "max_by",
+        "median",
+        "min",
+        "min_by",
+        "mode",
+        "percentile_cont",
+        "percentile_disc",
+        "stddev",
+        "stddev_samp",
+        "stddev_pop",
+        "sum",
+        "var_pop",
+        "var_samp",
+        "variance_pop",
+        "variance",
+        "variance_samp",
+        "bitand_agg",
+        "bitor_agg",
+        "bitxor_agg",
+        "booland_agg",
+        "boolor_agg",
+        "boolxor_agg",
+        "hash_agg",
+        "array_agg",
+        "object_agg",
+        "regr_avgx",
+        "regr_avgy",
+        "regr_count",
+        "regr_intercept",
+        "regr_r2",
+        "regr_slope",
+        "regr_sxx",
+        "regr_sxy",
+        "regr_syy",
+        "kurtosis",
+        "skew",
+        "array_union_agg",
+        "array_unique_agg",
+        "bitmap_bit_position",
+        "bitmap_bucket_number",
+        "bitmap_count",
+        "bitmap_construct_agg",
+        "bitmap_or_agg",
+        "approx_count_distinct",
+        "datasketches_hll",
+        "datasketches_hll_accumulate",
+        "datasketches_hll_combine",
+        "datasketches_hll_estimate",
+        "hll",
+        "hll_accumulate",
+        "hll_combine",
+        "hll_estimate",
+        "hll_export",
+        "hll_import",
+        "approximate_jaccard_index",
+        "approximate_similarity",
+        "minhash",
+        "minhash_combine",
+        "approx_top_k",
+        "approx_top_k_accumulate",
+        "approx_top_k_combine",
+        "approx_top_k_estimate",
+        "approx_percentile",
+        "approx_percentile_accumulate",
+        "approx_percentile_combine",
+        "approx_percentile_estimate",
+        "grouping",
+        "grouping_id",
+        "ai_agg",
+        "ai_summarize_agg",
+    }
+)
+def _is_agg_function_expression(expression: Expression) -> bool:
+    """Return True when ``expression`` itself is an aggregate function call.
+    Two shapes count: a ``FunctionExpression`` whose lower-cased ``pretty_name`` is listed in
+    ``_SF_AGGREGATE_FUNCTIONS``, and a ``SnowflakeUDF`` flagged as an aggregate function.
+    """
+    if isinstance(expression, FunctionExpression):
+        if expression.pretty_name.lower() in _SF_AGGREGATE_FUNCTIONS:
+            return True
+    # For PySpark aggregate functions that were mapped using a UDAF, e.g. try_sum
+    is_udaf = isinstance(expression, SnowflakeUDF) and expression.is_aggregate_function
+    return bool(is_udaf)
+def _get_child_expressions(expression: Expression) -> list[Expression]:
+    """Return the direct sub-expressions of ``expression``.
+    ``CaseWhen`` nodes are read through ``_child_expressions``; every other node through
+    ``children``, with a falsy value (e.g. ``None``) normalised to an empty list.
+    """
+    if not isinstance(expression, CaseWhen):
+        return expression.children or []
+    return expression._child_expressions
+def inject_condition_to_all_agg_functions(
+    expression: Expression, condition: Column
+) -> None:
+    """
+    Wrap the arguments of every aggregate function in an expression tree with a CASE WHEN condition.
+    The tree is traversed recursively and modified in place.
+    Args:
+        expression: The Snowpark expression tree to traverse and modify.
+        condition: The Column condition to inject into aggregate function arguments.
+    Raises:
+        ValueError: If no aggregate function is found anywhere in the tree.
+    """
+    if _inject_condition_to_all_agg_functions(expression, condition):
+        return
+    raise ValueError(f"No aggregate functions found in: {expression.sql}")
+def _inject_condition_to_all_agg_functions(
+    expression: Expression, condition: Column
+) -> bool:
+    """
+    Recursive worker: rewrite aggregate function arguments in place.
+    When ``expression`` is an aggregate function, each of its child expressions is replaced by
+    a CASE WHEN built from ``condition`` and the original child, and recursion stops there.
+    Otherwise the children are searched.
+    Returns:
+        True if at least one aggregate function was found in this subtree.
+    """
+    if not _is_agg_function_expression(expression):
+        # Build a list (not a generator) so that every child is visited and rewritten;
+        # any() on a generator would stop at the first subtree containing an aggregate.
+        found_in_children = [
+            _inject_condition_to_all_agg_functions(sub_expression, condition)
+            for sub_expression in _get_child_expressions(expression)
+        ]
+        return any(found_in_children)
+    wrapped_arguments = [
+        snowpark_fn.when(condition, Column(argument))._expr1
+        for argument in _get_child_expressions(expression)
+    ]
+    # Swap children
+    expression.children = wrapped_arguments
+    if wrapped_arguments:
+        expression.child = wrapped_arguments[0]
+    return True
+def is_child_agg_function_expression(exp: Expression) -> bool:
+    """Return True if ``exp`` or any expression nested below it is an aggregate function."""
+    if _is_agg_function_expression(exp):
+        return True
+    for sub_expression in _get_child_expressions(exp):
+        if is_child_agg_function_expression(sub_expression):
+            return True
+    return False

snowflake/snowpark_connect/utils/identifiers.py CHANGED Viewed

@@ -2,6 +2,7 @@
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
 import re
+from typing import Any, TypeVar
 from pyspark.errors import AnalysisException
@@ -12,6 +13,8 @@ from snowflake.snowpark_connect.config import (
     auto_uppercase_column_identifiers,
     auto_uppercase_non_column_identifiers,
 )
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 QUOTED_SPARK_IDENTIFIER = re.compile(r"^`[^`]*(?:``[^`]*)*`$")
 UNQUOTED_SPARK_IDENTIFIER = re.compile(r"^\w+$")
@@ -24,15 +27,23 @@ def unquote_spark_identifier_if_quoted(spark_name: str) -> str:
     if QUOTED_SPARK_IDENTIFIER.match(spark_name):
         return spark_name[1:-1].replace("``", "`")
-    raise AnalysisException(f"Invalid name: {spark_name}")
+    exception = AnalysisException(f"Invalid name: {spark_name}")
+    attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+    raise exception
-def spark_to_sf_single_id_with_unquoting(name: str) -> str:
+def spark_to_sf_single_id_with_unquoting(
+    name: str, use_auto_upper_case: bool = False
+) -> str:
     """
     Transforms a spark name to a valid snowflake name by quoting and potentially uppercasing it.
     Unquotes the spark name if necessary. Will raise an AnalysisException if given name is not valid.
     """
-    return spark_to_sf_single_id(unquote_spark_identifier_if_quoted(name))
+    return (
+        spark_to_sf_single_id(unquote_spark_identifier_if_quoted(name))
+        if use_auto_upper_case
+        else quote_name_without_upper_casing(unquote_spark_identifier_if_quoted(name))
+    )
 def spark_to_sf_single_id(name: str, is_column: bool = False) -> str:
@@ -117,3 +128,126 @@ def split_fully_qualified_spark_name(qualified_name: str | None) -> list[str]:
         parts.append("".join(token_chars))
     return parts
+# See https://docs.snowflake.com/en/sql-reference/identifiers-syntax for identifier syntax
+# Unquoted form: a letter or underscore followed by up to 254 letters, digits, underscores
+# or dollar signs.
+UNQUOTED_IDENTIFIER_REGEX = r"([a-zA-Z_])([a-zA-Z0-9_$]{0,254})"
+# Quoted form: double quotes around up to 255 items, each either an escaped quote ("") or
+# any character other than a double quote.
+QUOTED_IDENTIFIER_REGEX = r'"((""|[^"]){0,255})"'
+# Either form, wrapped in a non-capturing group so it can be embedded in larger patterns.
+VALID_IDENTIFIER_REGEX = f"(?:{UNQUOTED_IDENTIFIER_REGEX}|{QUOTED_IDENTIFIER_REGEX})"
+# Type variable bound to FQN, used as the return annotation of FQN methods.
+Self = TypeVar("Self", bound="FQN")
+class FQN:
+    """Represents an object identifier, supporting fully qualified names.
+    The instance supports a builder pattern: the ``set_*`` methods update the identifier in
+    place and return the same instance, so calls can be chained.
+    Examples
+    ________
+    >>> fqn = FQN.from_string("my_schema.object")
+    >>> fqn = FQN.from_string("my_name").set_database("db").set_schema("foo")
+    """
+    def __init__(
+        self,
+        database: str | None,
+        schema: str | None,
+        name: str,
+        signature: str | None = None,
+    ) -> None:
+        """Store the identifier parts as given; no validation is performed here."""
+        self._database = database
+        self._schema = schema
+        self._name = name
+        self.signature = signature
+    @property
+    def database(self) -> str | None:
+        """Database part of the identifier, or None when not set."""
+        return self._database
+    @property
+    def schema(self) -> str | None:
+        """Schema part of the identifier, or None when not set."""
+        return self._schema
+    @property
+    def name(self) -> str:
+        """Unqualified object name."""
+        return self._name
+    @property
+    def prefix(self) -> str:
+        """Qualifier in front of the name: ``database.schema``, ``schema`` or an empty string.
+        When a database is set without a schema, ``PUBLIC`` is used as the schema.
+        """
+        if self.database:
+            return f"{self.database}.{self.schema if self.schema else 'PUBLIC'}"
+        if self.schema:
+            return f"{self.schema}"
+        return ""
+    @property
+    def identifier(self) -> str:
+        """The name, preceded by ``prefix`` and a dot when the prefix is non-empty."""
+        if self.prefix:
+            return f"{self.prefix}.{self.name}"
+        return self.name
+    def __str__(self) -> str:
+        return self.identifier
+    def __repr__(self) -> str:
+        """Unambiguous representation showing every stored part, for logs and debugging."""
+        return (
+            f"{type(self).__name__}(database={self._database!r}, schema={self._schema!r}, "
+            f"name={self._name!r}, signature={self.signature!r})"
+        )
+    def __eq__(self, other: Any) -> bool:
+        """Compare by ``identifier``; the signature is not part of the comparison.
+        Raises:
+            AnalysisException: If ``other`` is not an FQN.
+        """
+        # NOTE: comparing with a non-FQN raises rather than returning NotImplemented.
+        # Defining __eq__ without __hash__ also leaves instances unhashable.
+        if not isinstance(other, FQN):
+            exception = AnalysisException(f"{other} is not a valid FQN")
+            attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+            raise exception
+        return self.identifier == other.identifier
+    @classmethod
+    def from_string(cls, identifier: str) -> Self:
+        """Take in an object name in the form [[database.]schema.]name and return a new :class:`FQN` instance.
+        An optional trailing parenthesised signature, e.g. ``name(int, string)``, is stored
+        separately in ``signature``.
+        Raises:
+            AnalysisException: If the object identifier does not meet identifier requirements.
+        """
+        qualifier_pattern = (
+            rf"(?:(?P<first_qualifier>{VALID_IDENTIFIER_REGEX})\.)?"
+            rf"(?:(?P<second_qualifier>{VALID_IDENTIFIER_REGEX})\.)?"
+            rf"(?P<name>{VALID_IDENTIFIER_REGEX})(?P<signature>\(.*\))?"
+        )
+        result = re.fullmatch(qualifier_pattern, identifier)
+        if result is None:
+            exception = AnalysisException(f"{identifier} is not a valid identifier")
+            attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+            raise exception
+        unqualified_name = result.group("name")
+        if result.group("second_qualifier") is not None:
+            # Two qualifiers: database.schema.name
+            database = result.group("first_qualifier")
+            schema = result.group("second_qualifier")
+        else:
+            # At most one qualifier, which is then the schema (None when absent).
+            database = None
+            schema = result.group("first_qualifier")
+        signature = result.group("signature") or None
+        return cls(
+            name=unqualified_name, schema=schema, database=database, signature=signature
+        )
+    def set_database(self, database: str | None) -> Self:
+        """Set the database when ``database`` is truthy; otherwise keep the current one."""
+        if database:
+            self._database = database
+        return self
+    def set_schema(self, schema: str | None) -> Self:
+        """Set the schema when ``schema`` is truthy; otherwise keep the current one."""
+        if schema:
+            self._schema = schema
+        return self
+    def set_name(self, name: str) -> Self:
+        """Replace the unqualified name unconditionally."""
+        self._name = name
+        return self
+    def to_dict(self) -> dict[str, str | None]:
+        """Return the dictionary representation of the instance."""
+        return {"name": self.name, "schema": self.schema, "database": self.database}

snowflake/snowpark_connect/utils/io_utils.py CHANGED Viewed

@@ -1,10 +1,47 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
+import contextlib
 import functools
+import re
 from snowflake.snowpark import Session
+from snowflake.snowpark._internal.analyzer.analyzer_utils import (
+    create_file_format_statement,
+)
+from snowflake.snowpark_connect.utils.identifiers import FQN
+_MINUS_AT_THE_BEGINNING_REGEX = re.compile(r"^-")
+def cached_file_format(
+    session: Session, file_format: str, format_type_options: dict[str, str]
+) -> str:
+    """
+    Return the name of a temporary file format for the given options, creating it on first use.
+    The name embeds a hash of the option items. Names already present in
+    ``session._file_formats`` are returned without issuing any SQL.
+    """
+    # hash() can be negative; a leading "-" is swapped for "1" before it goes into the name.
+    options_digest = _MINUS_AT_THE_BEGINNING_REGEX.sub(
+        "1", str(hash(frozenset(format_type_options.items())))
+    )
+    file_format_name = f"__SNOWPARK_CONNECT_FILE_FORMAT__{file_format}_{options_digest}"
+    if file_format_name not in session._file_formats:
+        create_statement = create_file_format_statement(
+            file_format_name,
+            file_format,
+            format_type_options,
+            temp=True,
+            if_not_exist=True,
+            use_scoped_temp_objects=False,
+            is_generated=True,
+        )
+        session.sql(create_statement).collect()
+        session._file_formats.add(file_format_name)
+    return file_format_name
 @functools.cache
@@ -33,3 +70,22 @@ def file_format(
     ).collect()
     return file_format_name
+def get_table_type(
+    snowpark_table_name: str,
+    snowpark_session: Session,
+) -> str:
+    """
+    Look up the catalog table type of ``snowpark_table_name``.
+    The name is parsed as ``[[database.]schema.]name`` and only the qualifiers that are present
+    are passed to the catalog. Any exception raised by the lookup yields the fallback "TABLE".
+    """
+    fqn = FQN.from_string(snowpark_table_name)
+    try:
+        lookup_args = {"table_name": fqn.name}
+        if fqn.database is not None:
+            lookup_args["schema"] = fqn.schema
+            lookup_args["database"] = fqn.database
+        elif fqn.schema is not None:
+            lookup_args["schema"] = fqn.schema
+        return snowpark_session.catalog.getTable(**lookup_args).table_type
+    except Exception:
+        # Best-effort lookup: a failed catalog call falls back to a plain table.
+        return "TABLE"

snowflake/snowpark_connect/utils/java_stored_procedure.py ADDED Viewed

@@ -0,0 +1,125 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+from pyspark.errors import AnalysisException
+import snowflake.snowpark.types as snowpark_type
+from snowflake.snowpark import Session
+from snowflake.snowpark._internal.type_utils import type_string_to_type_object
+from snowflake.snowpark_connect.resources_initializer import (
+    RESOURCE_PATH,
+    SPARK_COMMON_UTILS_JAR,
+    SPARK_CONNECT_CLIENT_JAR,
+    SPARK_SQL_JAR,
+)
+from snowflake.snowpark_connect.utils.upload_java_jar import upload_java_udf_jar
+# Prefix prepended to the requested function name when a Java UDF is created.
+CREATE_JAVA_UDF_PREFIX = "__SC_JAVA_UDF_"
+# Name of the temporary stored procedure that is called to create a Java UDF.
+PROCEDURE_NAME = "__SC_JAVA_SP_CREATE_JAVA_UDF"
+# DDL for that procedure. The "__snowflake_udf_imports__" placeholder line is replaced with
+# an IMPORTS clause before the statement is executed. The procedure name is repeated
+# literally in the template, so it has to be kept in sync with PROCEDURE_NAME by hand.
+SP_TEMPLATE = """
+CREATE OR REPLACE TEMPORARY PROCEDURE __SC_JAVA_SP_CREATE_JAVA_UDF(udf_name VARCHAR, udf_class VARCHAR, imports ARRAY(VARCHAR))
+RETURNS VARCHAR
+LANGUAGE JAVA
+RUNTIME_VERSION = 17
+PACKAGES = ('com.snowflake:snowpark:latest')
+__snowflake_udf_imports__
+HANDLER = 'com.snowflake.snowpark_connect.procedures.JavaUDFCreator.process'
+EXECUTE AS CALLER
+;
+"""
+# Module-level flag recording whether the Java UDF creator procedure has been set up.
+_is_initialized = False
+def is_initialized() -> bool:
+    """Return the current value of the module-level initialization flag."""
+    return _is_initialized
+def set_java_udf_creator_initialized_state(value: bool) -> None:
+    """Overwrite the module-level initialization flag with ``value``."""
+    global _is_initialized
+    _is_initialized = value
+class JavaUdf:
+    """
+    Lightweight handle for a Java UDF that has been created in Snowflake.
+    It offers properties similar to a Python UserDefinedFunction and stores only the metadata
+    needed for function calls.
+    """
+    def __init__(
+        self,
+        name: str,
+        input_types: list[snowpark_type.DataType],
+        return_type: snowpark_type.DataType,
+    ) -> None:
+        """
+        Store the metadata of a Java UDF.
+        Args:
+            name: The name of the UDF in Snowflake
+            input_types: List of input parameter types
+            return_type: The return type of the UDF
+        """
+        self._return_type = return_type
+        self._input_types = input_types
+        self.name = name
+def get_quoted_imports(session: Session) -> str:
+    """
+    Build the comma-separated, single-quoted list of jar paths to import.
+    The list is the union of ``session._artifact_jars`` and four jars located under the
+    session stage's resource path. Being a set union, the order of the entries is not fixed.
+    """
+    stage_resource_path = session.get_session_stage() + RESOURCE_PATH
+    bundled_jar_names = (
+        SPARK_CONNECT_CLIENT_JAR,
+        SPARK_COMMON_UTILS_JAR,
+        SPARK_SQL_JAR,
+        "java_udfs-1.0-SNAPSHOT.jar",
+    )
+    spark_imports = {f"{stage_resource_path}/{jar_name}" for jar_name in bundled_jar_names}
+    all_imports = session._artifact_jars | spark_imports
+    # Wrap each path in single quotes for SQL.
+    return ", ".join("'" + path + "'" for path in all_imports)
+def create_snowflake_imports(session: Session) -> str:
+    """Return the ``IMPORTS = (...)`` clause listing the quoted jar paths for ``session``."""
+    from snowflake.snowpark_connect.resources_initializer import (
+        ensure_scala_udf_jars_uploaded,
+    )
+    # The jars are uploaded by the resource initializer thread; make sure it has completed
+    # before creating Java UDFs that depend on them.
+    ensure_scala_udf_jars_uploaded()
+    quoted_imports = get_quoted_imports(session)
+    return "IMPORTS = (" + quoted_imports + ")"
+def _quote_sql_string_literal(value: str) -> str:
+    """Render ``value`` as a single-quoted SQL string constant, escaping backslashes and quotes."""
+    escaped = value.replace("\\", "\\\\").replace("'", "''")
+    return f"'{escaped}'"
+def create_java_udf(session: Session, function_name: str, java_class: str) -> JavaUdf:
+    """
+    Create a Java UDF for ``java_class`` and return a reference to it.
+    On the first call the Java UDF jar is uploaded and the creator stored procedure is
+    (re)created; the module-level initialization flag is then set so later calls skip this.
+    The procedure is called with the prefixed UDF name, the class name and the import list.
+    Its result is read as a ``;``-separated list of type strings: all but the last are the
+    input types, the last is the return type.
+    Args:
+        session: Snowpark session used to run the SQL statements.
+        function_name: Requested function name; ``CREATE_JAVA_UDF_PREFIX`` is prepended to it.
+        java_class: Name of the Java class implementing the UDF.
+    Returns:
+        A ``JavaUdf`` carrying the prefixed name and the parsed input and return types.
+    Raises:
+        AnalysisException: If the procedure returns no value for the class.
+    """
+    if not is_initialized():
+        upload_java_udf_jar(session)
+        session.sql(
+            SP_TEMPLATE.replace(
+                "__snowflake_udf_imports__", create_snowflake_imports(session)
+            )
+        ).collect()
+        set_java_udf_creator_initialized_state(True)
+    name = CREATE_JAVA_UDF_PREFIX + function_name
+    # The name and class are embedded as SQL string constants, so quotes and backslashes
+    # are escaped; a value containing "'" would otherwise terminate the literal early.
+    call_statement = (
+        f"CALL {PROCEDURE_NAME}({_quote_sql_string_literal(name)}, "
+        f"{_quote_sql_string_literal(java_class)}, "
+        f"ARRAY_CONSTRUCT({get_quoted_imports(session)})::ARRAY(VARCHAR))"
+    )
+    result = session.sql(call_statement).collect()
+    # Treat an empty result set like a falsy value instead of failing with IndexError.
+    result_value = result[0][0] if result else None
+    if not result_value:
+        raise AnalysisException(f"Can not load class {java_class}")
+    *input_type_names, output_type_name = result_value.split(";")
+    input_types = [type_string_to_type_object(t) for t in input_type_names]
+    return JavaUdf(
+        name,
+        input_types,
+        type_string_to_type_object(output_type_name),
+    )

snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl