snowpark-connect 0.26.0__py3-none-any.whl → 0.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of snowpark-connect might be problematic.

Files changed (42)
  1. snowflake/snowpark_connect/column_name_handler.py +3 -93
  2. snowflake/snowpark_connect/config.py +99 -4
  3. snowflake/snowpark_connect/dataframe_container.py +0 -6
  4. snowflake/snowpark_connect/expression/map_expression.py +31 -1
  5. snowflake/snowpark_connect/expression/map_sql_expression.py +22 -18
  6. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +22 -26
  7. snowflake/snowpark_connect/expression/map_unresolved_function.py +28 -10
  8. snowflake/snowpark_connect/expression/map_unresolved_star.py +2 -3
  9. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  10. snowflake/snowpark_connect/relation/map_extension.py +7 -1
  11. snowflake/snowpark_connect/relation/map_join.py +62 -258
  12. snowflake/snowpark_connect/relation/map_map_partitions.py +36 -77
  13. snowflake/snowpark_connect/relation/map_relation.py +8 -2
  14. snowflake/snowpark_connect/relation/map_show_string.py +2 -0
  15. snowflake/snowpark_connect/relation/map_sql.py +413 -15
  16. snowflake/snowpark_connect/relation/write/map_write.py +195 -114
  17. snowflake/snowpark_connect/resources_initializer.py +20 -5
  18. snowflake/snowpark_connect/server.py +20 -18
  19. snowflake/snowpark_connect/utils/artifacts.py +4 -5
  20. snowflake/snowpark_connect/utils/concurrent.py +4 -0
  21. snowflake/snowpark_connect/utils/context.py +41 -1
  22. snowflake/snowpark_connect/utils/describe_query_cache.py +57 -51
  23. snowflake/snowpark_connect/utils/identifiers.py +120 -0
  24. snowflake/snowpark_connect/utils/io_utils.py +21 -1
  25. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +86 -2
  26. snowflake/snowpark_connect/utils/scala_udf_utils.py +34 -43
  27. snowflake/snowpark_connect/utils/session.py +16 -26
  28. snowflake/snowpark_connect/utils/telemetry.py +53 -0
  29. snowflake/snowpark_connect/utils/udf_utils.py +66 -103
  30. snowflake/snowpark_connect/utils/udtf_helper.py +17 -7
  31. snowflake/snowpark_connect/version.py +2 -3
  32. {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/METADATA +2 -2
  33. {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/RECORD +41 -42
  34. snowflake/snowpark_connect/hidden_column.py +0 -39
  35. {snowpark_connect-0.26.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-connect +0 -0
  36. {snowpark_connect-0.26.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-session +0 -0
  37. {snowpark_connect-0.26.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-submit +0 -0
  38. {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/WHEEL +0 -0
  39. {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE-binary +0 -0
  40. {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE.txt +0 -0
  41. {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/NOTICE-binary +0 -0
  42. {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/top_level.txt +0 -0

snowflake/snowpark_connect/utils/context.py

@@ -30,6 +30,9 @@ _sql_aggregate_function_count = ContextVar[int](
     "_contains_aggregate_function", default=0
 )
 
+# Context for parsing map_partitions
+_map_partitions_stack = ContextVar[int]("_map_partitions_stack", default=0)
+
 # We have to generate our own plan IDs that are different from Spark's.
 # Spark plan IDs start at 0, so pick a "big enough" number to avoid overlaps.
 _STARTING_SQL_PLAN_ID = 0x80000000
@@ -49,6 +52,7 @@ _spark_client_type_regex = re.compile(r"spark/(?P<spark_version>\d+\.\d+\.\d+)")
 _current_operation = ContextVar[str]("_current_operation", default="default")
 _resolving_fun_args = ContextVar[bool]("_resolving_fun_args", default=False)
 _resolving_lambda_fun = ContextVar[bool]("_resolving_lambdas", default=False)
+_current_lambda_params = ContextVar[list[str]]("_current_lambda_params", default=[])
 
 _is_window_enabled = ContextVar[bool]("_is_window_enabled", default=False)
 _is_in_pivot = ContextVar[bool]("_is_in_pivot", default=False)
@@ -206,6 +210,16 @@ def push_evaluating_join_condition(join_type, left_keys, right_keys):
         _is_evaluating_join_condition.set(prev)
 
 
+@contextmanager
+def push_map_partitions():
+    _map_partitions_stack.set(_map_partitions_stack.get() + 1)
+    yield
+
+
+def map_partitions_depth() -> int:
+    return _map_partitions_stack.get()
+
+
 @contextmanager
 def push_sql_scope():
     """
@@ -238,16 +252,21 @@ def push_operation_scope(operation: str):
 
 
 @contextmanager
-def resolving_lambda_function():
+def resolving_lambda_function(param_names: list[str] = None):
     """
     Context manager that sets a flag indicating lambda function is being resolved.
+    Also tracks the lambda parameter names for validation.
     """
     prev = _resolving_lambda_fun.get()
+    prev_params = _current_lambda_params.get()
     try:
         _resolving_lambda_fun.set(True)
+        if param_names is not None:
+            _current_lambda_params.set(param_names)
         yield
     finally:
         _resolving_lambda_fun.set(prev)
+        _current_lambda_params.set(prev_params)
 
 
 def is_lambda_being_resolved() -> bool:
@@ -257,6 +276,13 @@ def is_lambda_being_resolved() -> bool:
     return _resolving_lambda_fun.get()
 
 
+def get_current_lambda_params() -> list[str]:
+    """
+    Returns the current lambda parameter names.
+    """
+    return _current_lambda_params.get()
+
+
 @contextmanager
 def resolving_fun_args():
     """
@@ -270,6 +296,19 @@ def resolving_fun_args():
         _resolving_fun_args.set(prev)
 
 
+@contextmanager
+def not_resolving_fun_args():
+    """
+    Context manager that sets a flag indicating function arguments are *not* being resolved.
+    """
+    prev = _resolving_fun_args.get()
+    try:
+        _resolving_fun_args.set(False)
+        yield
+    finally:
+        _resolving_fun_args.set(prev)
+
+
 def is_function_argument_being_resolved() -> bool:
     """
     Returns True if function arguments are being resolved.
@@ -350,6 +389,7 @@ def clear_context_data() -> None:
 
     _next_sql_plan_id.set(_STARTING_SQL_PLAN_ID)
     _sql_plan_name_map.set({})
+    _map_partitions_stack.set(0)
    _sql_aggregate_function_count.set(0)
    _sql_named_args.set({})
    _sql_pos_args.set({})
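
The helpers added above are thin ContextVar wrappers, so their intended use can be read straight off the diff. A minimal usage sketch (the import path comes from the file list above; calling these helpers directly like this is purely illustrative):

    from snowflake.snowpark_connect.utils.context import (
        get_current_lambda_params,
        map_partitions_depth,
        push_map_partitions,
        resolving_lambda_function,
    )

    # Track that we are inside a map_partitions relation while mapping nested plans.
    with push_map_partitions():
        assert map_partitions_depth() >= 1

    # Lambda parameter names are now recorded so expression mapping can validate references.
    with resolving_lambda_function(param_names=["x", "i"]):
        assert get_current_lambda_params() == ["x", "i"]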

snowflake/snowpark_connect/utils/describe_query_cache.py

@@ -6,20 +6,24 @@ import hashlib
 import inspect
 import random
 import re
-import threading
 import time
 from typing import Any
 
 from snowflake import snowpark
 from snowflake.connector.cursor import ResultMetadataV2
 from snowflake.snowpark._internal.server_connection import ServerConnection
+from snowflake.snowpark_connect.utils.concurrent import SynchronizedDict
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import telemetry
 
 DESCRIBE_CACHE_TTL_SECONDS = 15
 USE_DESCRIBE_QUERY_CACHE = True
 
-DDL_DETECTION_PATTERN = re.compile(r"^\s*(CREATE|ALTER|DROP|RENAME)\b", re.IGNORECASE)
+DDL_DETECTION_PATTERN = re.compile(r"\s*(CREATE|ALTER|DROP)\b", re.IGNORECASE)
+PLAIN_CREATE_PATTERN = re.compile(
+    r"\s*CREATE\s+((LOCAL|GLOBAL)\s+)?(TRANSIENT\s+)?TABLE\b", re.IGNORECASE
+)
+
 # Pattern for simple constant queries like: SELECT 3 :: INT AS "3-80000030-0" FROM ( SELECT $1 AS "__DUMMY" FROM VALUES (NULL :: STRING))
 # Using exact spacing pattern from generated SQL for deterministic matching
 # Column ID format: {original_name}-{8_digit_hex_plan_id}-{column_index}
@@ -32,8 +36,7 @@ SIMPLE_CONSTANT_PATTERN = re.compile(
 
 class DescribeQueryCache:
     def __init__(self) -> None:
-        self._cache = {}
-        self._lock = threading.Lock()
+        self._cache = SynchronizedDict()
 
     @staticmethod
     def _hash_query(sql_query: str) -> str:
@@ -48,49 +51,49 @@ class DescribeQueryCache:
         return sql_query
 
     def get(self, sql_query: str) -> list[ResultMetadataV2] | None:
+        telemetry.report_describe_query_cache_lookup()
+
         cache_key = self._get_cache_key(sql_query)
         key = self._hash_query(cache_key)
         current_time = time.monotonic()
 
-        # TODO: maybe too much locking, we could use read-write lock also. Or a thread safe dictionary.
-        with self._lock:
-            if key in self._cache:
-                result, timestamp = self._cache[key]
-                if current_time < timestamp + DESCRIBE_CACHE_TTL_SECONDS:
-                    logger.debug(
-                        f"Returning query result from cache for query: {sql_query[:20]}"
-                    )
-
-                    # If this is a constant query, we need to transform the result metadata
-                    # to match the actual query's column name
-                    if (
-                        cache_key != sql_query
-                    ):  # Only transform if we normalized the key
-                        match = SIMPLE_CONSTANT_PATTERN.match(sql_query)
-                        if match:
-                            number, column_id = match.groups()
-                            expected_column_name = column_id
-
-                            # Transform the cached result to match this query's column name
-                            # There should only be one column in these constant queries
-                            metadata = result[0]
-                            new_metadata = ResultMetadataV2(
-                                name=expected_column_name,
-                                type_code=metadata.type_code,
-                                display_size=metadata.display_size,
-                                internal_size=metadata.internal_size,
-                                precision=metadata.precision,
-                                scale=metadata.scale,
-                                is_nullable=metadata.is_nullable,
-                            )
-                            return [new_metadata]
-
-                    return result
-                else:
-                    logger.debug(
-                        f"Had a cached entry, but it expired for query: {sql_query[:20]}"
-                    )
-                    del self._cache[key]
+        if key in self._cache:
+            result, timestamp = self._cache[key]
+            if current_time < timestamp + DESCRIBE_CACHE_TTL_SECONDS:
+                logger.debug(
+                    f"Returning query result from cache for query: {sql_query[:20]}"
+                )
+                self._cache[key] = (result, current_time)
+
+                # If this is a constant query, we need to transform the result metadata
+                # to match the actual query's column name
+                if cache_key != sql_query:  # Only transform if we normalized the key
+                    match = SIMPLE_CONSTANT_PATTERN.match(sql_query)
+                    if match:
+                        number, column_id = match.groups()
+                        expected_column_name = column_id
+
+                        # Transform the cached result to match this query's column name
+                        # There should only be one column in these constant queries
+                        metadata = result[0]
+                        new_metadata = ResultMetadataV2(
+                            name=expected_column_name,
+                            type_code=metadata.type_code,
+                            display_size=metadata.display_size,
+                            internal_size=metadata.internal_size,
+                            precision=metadata.precision,
+                            scale=metadata.scale,
+                            is_nullable=metadata.is_nullable,
+                        )
+
+                        telemetry.report_describe_query_cache_hit()
+                        return [new_metadata]
+
+                telemetry.report_describe_query_cache_hit()
+                return result
+            else:
+                telemetry.report_describe_query_cache_expired()
+                del self._cache[key]
        return None
 
    def put(self, sql_query: str, result: list[ResultMetadataV2] | None) -> None:
@@ -102,12 +105,18 @@
 
         logger.debug(f"Putting query into cache: {sql_query[:50]}...")
 
-        with self._lock:
-            self._cache[key] = (result, time.monotonic())
+        self._cache[key] = (result, time.monotonic())
 
     def clear(self) -> None:
-        with self._lock:
-            self._cache.clear()
+        self._cache.clear()
+
+    def update_cache_for_query(self, query: str) -> None:
+        # Clear cache for DDL operations that modify existing objects (exclude CREATE TABLE)
+        if DDL_DETECTION_PATTERN.search(query) and not PLAIN_CREATE_PATTERN.search(
+            query
+        ):
+            self.clear()
+            telemetry.report_describe_query_cache_clear(query[:100])
 
 
 def instrument_session_for_describe_cache(session: snowpark.Session):
@@ -126,10 +135,7 @@ def instrument_session_for_describe_cache(session: snowpark.Session):
     if isinstance(cache_instance, DescribeQueryCache):
         cache = cache_instance
 
-        # TODO: This is very broad right now. We should be able to reduce the scope of clearing.
-        if DDL_DETECTION_PATTERN.search(query):
-            logger.debug(f"DDL detected, clearing describe query cache: '{query}'")
-            cache.clear()
+        cache.update_cache_for_query(query)
 
    def wrap_execute(wrapped_fn):
        def fn(query: str, **kwargs):
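
Net effect of the changes above: the plain dict plus explicit lock is replaced by SynchronizedDict, cache hits refresh the entry timestamp and are reported to telemetry, and only DDL that modifies existing objects invalidates the cache, so a plain CREATE TABLE no longer wipes it. A small standalone sketch of the new invalidation rule, reusing the two patterns from the diff (the helper function name is ours, for illustration only):

    import re

    # Copied from the diff above: DDL clears the cache unless it is a plain CREATE TABLE.
    DDL_DETECTION_PATTERN = re.compile(r"\s*(CREATE|ALTER|DROP)\b", re.IGNORECASE)
    PLAIN_CREATE_PATTERN = re.compile(
        r"\s*CREATE\s+((LOCAL|GLOBAL)\s+)?(TRANSIENT\s+)?TABLE\b", re.IGNORECASE
    )

    def clears_describe_cache(query: str) -> bool:
        # Mirrors DescribeQueryCache.update_cache_for_query from the diff.
        return bool(
            DDL_DETECTION_PATTERN.search(query) and not PLAIN_CREATE_PATTERN.search(query)
        )

    assert clears_describe_cache("ALTER TABLE t ADD COLUMN c INT")       # DDL -> clear
    assert clears_describe_cache("CREATE OR REPLACE TABLE t (c INT)")    # not a plain CREATE -> clear
    assert not clears_describe_cache("CREATE TABLE t (c INT)")           # plain CREATE -> keep cache
    assert not clears_describe_cache("SELECT * FROM t")                  # DML -> keep cache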

snowflake/snowpark_connect/utils/identifiers.py

@@ -2,6 +2,7 @@
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
 import re
+from typing import Any, TypeVar
 
 from pyspark.errors import AnalysisException
 
@@ -117,3 +118,122 @@ def split_fully_qualified_spark_name(qualified_name: str | None) -> list[str]:
         parts.append("".join(token_chars))
 
     return parts
+
+
+# See https://docs.snowflake.com/en/sql-reference/identifiers-syntax for identifier syntax
+UNQUOTED_IDENTIFIER_REGEX = r"([a-zA-Z_])([a-zA-Z0-9_$]{0,254})"
+QUOTED_IDENTIFIER_REGEX = r'"((""|[^"]){0,255})"'
+VALID_IDENTIFIER_REGEX = f"(?:{UNQUOTED_IDENTIFIER_REGEX}|{QUOTED_IDENTIFIER_REGEX})"
+
+
+Self = TypeVar("Self", bound="FQN")
+
+
+class FQN:
+    """Represents an object identifier, supporting fully qualified names.
+
+    The instance supports builder pattern that allows updating the identifier with database and
+    schema from different sources.
+
+    Examples
+    ________
+    >>> fqn = FQN.from_string("my_schema.object").using_connection(conn)
+
+    >>> fqn = FQN.from_string("my_name").set_database("db").set_schema("foo")
+    """
+
+    def __init__(
+        self,
+        database: str | None,
+        schema: str | None,
+        name: str,
+        signature: str | None = None,
+    ) -> None:
+        self._database = database
+        self._schema = schema
+        self._name = name
+        self.signature = signature
+
+    @property
+    def database(self) -> str | None:
+        return self._database
+
+    @property
+    def schema(self) -> str | None:
+        return self._schema
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    @property
+    def prefix(self) -> str:
+        if self.database:
+            return f"{self.database}.{self.schema if self.schema else 'PUBLIC'}"
+        if self.schema:
+            return f"{self.schema}"
+        return ""
+
+    @property
+    def identifier(self) -> str:
+        if self.prefix:
+            return f"{self.prefix}.{self.name}"
+        return self.name
+
+    def __str__(self) -> str:
+        return self.identifier
+
+    def __eq__(self, other: Any) -> bool:
+        if not isinstance(other, FQN):
+            raise AnalysisException(f"{other} is not a valid FQN")
+        return self.identifier == other.identifier
+
+    @classmethod
+    def from_string(cls, identifier: str) -> Self:
+        """Take in an object name in the form [[database.]schema.]name and return a new :class:`FQN` instance.
+
+        Raises:
+            InvalidIdentifierError: If the object identifier does not meet identifier requirements.
+        """
+        qualifier_pattern = (
+            rf"(?:(?P<first_qualifier>{VALID_IDENTIFIER_REGEX})\.)?"
+            rf"(?:(?P<second_qualifier>{VALID_IDENTIFIER_REGEX})\.)?"
+            rf"(?P<name>{VALID_IDENTIFIER_REGEX})(?P<signature>\(.*\))?"
+        )
+        result = re.fullmatch(qualifier_pattern, identifier)
+
+        if result is None:
+            raise AnalysisException(f"{identifier} is not a valid identifier")
+
+        unqualified_name = result.group("name")
+        if result.group("second_qualifier") is not None:
+            database = result.group("first_qualifier")
+            schema = result.group("second_qualifier")
+        else:
+            database = None
+            schema = result.group("first_qualifier")
+
+        signature = None
+        if result.group("signature"):
+            signature = result.group("signature")
+        return cls(
+            name=unqualified_name, schema=schema, database=database, signature=signature
+        )
+
+    def set_database(self, database: str | None) -> Self:
+        if database:
+            self._database = database
+        return self
+
+    def set_schema(self, schema: str | None) -> Self:
+        if schema:
+            self._schema = schema
+        return self
+
+    def set_name(self, name: str) -> Self:
+        self._name = name
+        return self
+
+    def to_dict(self) -> dict[str, str | None]:
+        """Return the dictionary representation of the instance."""
+        return {"name": self.name, "schema": self.schema, "database": self.database}

snowflake/snowpark_connect/utils/io_utils.py

@@ -1,10 +1,11 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
-
+import contextlib
 import functools
 
 from snowflake.snowpark import Session
+from snowflake.snowpark_connect.utils.identifiers import FQN
 
 
 @functools.cache
@@ -33,3 +34,22 @@ def file_format(
     ).collect()
 
     return file_format_name
+
+
+def get_table_type(
+    snowpark_table_name: str,
+    snowpark_session: Session,
+) -> str:
+    fqn = FQN.from_string(snowpark_table_name)
+    with contextlib.suppress(Exception):
+        if fqn.database is not None:
+            return snowpark_session.catalog.getTable(
+                table_name=fqn.name, schema=fqn.schema, database=fqn.database
+            ).table_type
+        elif fqn.schema is not None:
+            return snowpark_session.catalog.getTable(
+                table_name=fqn.name, schema=fqn.schema
+            ).table_type
+        else:
+            return snowpark_session.catalog.getTable(table_name=fqn.name).table_type
+    return "TABLE"

snowflake/snowpark_connect/utils/pandas_udtf_utils.py

@@ -87,9 +87,93 @@ def get_map_in_arrow_udtf(
 def create_pandas_udtf(
     udtf_proto: CommonInlineUserDefinedFunction,
     spark_column_names: list[str],
-    input_schema: StructType | None = None,
-    return_schema: StructType | None = None,
+    input_schema: StructType,
+    return_schema: StructType,
+):
+    user_function, _ = cloudpickle.loads(udtf_proto.python_udf.command)
+    output_column_names = [field.name for field in return_schema.fields]
+    output_column_original_names = [
+        field.original_column_identifier for field in return_schema.fields
+    ]
+
+    class MapPandasUDTF:
+        def __init__(self) -> None:
+            self.user_function = user_function
+            self.output_column_names = output_column_names
+            self.spark_column_names = spark_column_names
+            self.output_column_original_names = output_column_original_names
+
+        def end_partition(self, df: pd.DataFrame):
+            if df.empty:
+                empty_df = pd.DataFrame(columns=self.output_column_names)
+                yield empty_df
+                return
+
+            df_without_dummy = df.drop(
+                columns=["_DUMMY_PARTITION_KEY"], errors="ignore"
+            )
+            df_without_dummy.columns = self.spark_column_names
+            result_iterator = self.user_function(
+                [pd.DataFrame([row]) for _, row in df_without_dummy.iterrows()]
+            )
+
+            if not isinstance(result_iterator, Iterator) and not hasattr(
+                result_iterator, "__iter__"
+            ):
+                raise RuntimeError(
+                    f"snowpark_connect::UDF_RETURN_TYPE Return type of the user-defined function should be "
+                    f"iterator of pandas.DataFrame, but is {type(result_iterator).__name__}"
+                )
+
+            output_df = pd.concat(result_iterator)
+            generated_output_column_names = list(output_df.columns)
+
+            missing_columns = []
+            for original_column in self.output_column_original_names:
+                if original_column not in generated_output_column_names:
+                    missing_columns.append(original_column)
+
+            if missing_columns:
+                unexpected_columns = [
+                    column
+                    for column in generated_output_column_names
+                    if column not in self.output_column_original_names
+                ]
+                raise RuntimeError(
+                    f"[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF] Column names of the returned pandas.DataFrame do not match specified schema. Missing: {', '.join(sorted(missing_columns))}. Unexpected: {', '.join(sorted(unexpected_columns))}"
+                    "."
+                )
+            reordered_df = output_df[self.output_column_original_names]
+            reordered_df.columns = self.output_column_names
+            yield reordered_df
+
+    return snowpark_fn.pandas_udtf(
+        MapPandasUDTF,
+        output_schema=PandasDataFrameType(
+            [field.datatype for field in return_schema.fields],
+            [field.name for field in return_schema.fields],
+        ),
+        input_types=[
+            PandasDataFrameType(
+                [field.datatype for field in input_schema.fields] + [IntegerType()]
+            )
+        ],
+        input_names=[field.name for field in input_schema.fields]
+        + ["_DUMMY_PARTITION_KEY"],
+        name="map_pandas_udtf",
+        replace=True,
+        packages=["pandas"],
+        is_permanent=False,
+    )
+
+
+def create_pandas_udtf_with_arrow(
+    udtf_proto: CommonInlineUserDefinedFunction,
+    spark_column_names: list[str],
+    input_schema: StructType,
+    return_schema: StructType,
 ) -> str | snowpark.udtf.UserDefinedTableFunction:
+
     user_function, _ = cloudpickle.loads(udtf_proto.python_udf.command)
     output_column_names = [field.name for field in return_schema.fields]
 

snowflake/snowpark_connect/utils/scala_udf_utils.py

@@ -171,12 +171,19 @@ class ScalaUDFDef:
         is_map_return = udf_func_return_type.startswith("Map")
         wrapper_return_type = "String" if is_map_return else udf_func_return_type
 
+        # For handling Seq type correctly, ensure that the wrapper function always uses Array as its input and
+        # return types (when required) and the wrapped function uses Seq.
+        udf_func_return_type = udf_func_return_type.replace("Array", "Seq")
+        is_seq_return = udf_func_return_type.startswith("Seq")
+
         # Need to call the map to JSON string converter when a map is returned by the user's function.
-        invoke_udf_func = (
-            f"write(func({invocation_args}))"
-            if is_map_return
-            else f"func({invocation_args})"
-        )
+        if is_map_return:
+            invoke_udf_func = f"write(func({invocation_args}))"
+        elif is_seq_return:
+            # TODO: SNOW-2339385 Handle Array[T] return types correctly. Currently, only Seq[T] is supported.
+            invoke_udf_func = f"func({invocation_args}).toArray"
+        else:
+            invoke_udf_func = f"func({invocation_args})"
 
         # The lines of code below are required only when a Map is returned by the UDF. This is needed to serialize the
         # map output to a JSON string.
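
To make the Seq handling concrete, here is the string manipulation in isolation for a UDF declared as returning Array[String]; the variable values are hypothetical stand-ins for what ScalaUDFDef computes from the UDF metadata:

    udf_func_return_type = "Array[String]"  # hypothetical return type from the UDF packet
    invocation_args = "arg0"                 # hypothetical wrapper argument list

    # The wrapped Spark closure is typed with Seq, while the wrapper hands an Array back to Snowflake.
    udf_func_return_type = udf_func_return_type.replace("Array", "Seq")
    is_seq_return = udf_func_return_type.startswith("Seq")
    invoke_udf_func = (
        f"func({invocation_args}).toArray" if is_seq_return else f"func({invocation_args})"
    )

    print(udf_func_return_type)  # Seq[String]
    print(invoke_udf_func)       # func(arg0).toArray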
@@ -184,9 +191,9 @@
             ""
             if not is_map_return
             else """
-import org.json4s._
-import org.json4s.native.Serialization._
-import org.json4s.native.Serialization
+import shaded_json4s._
+import shaded_json4s.native.Serialization._
+import shaded_json4s.native.Serialization
 """
         )
         map_return_formatter = (
@@ -199,22 +206,12 @@ import org.json4s.native.Serialization
 
         return f"""import org.apache.spark.sql.connect.common.UdfPacket
 {map_return_imports}
-import java.io.{{ByteArrayInputStream, ObjectInputStream}}
-import java.nio.file.{{Files, Paths}}
+import com.snowflake.sas.scala.Utils
 
 object __RecreatedSparkUdf {{
 {map_return_formatter}
-  private lazy val func: ({udf_func_input_types}) => {udf_func_return_type} = {{
-    val importDirectory = System.getProperty("com.snowflake.import_directory")
-    val fPath = importDirectory + "{self.name}.bin"
-    val bytes = Files.readAllBytes(Paths.get(fPath))
-    val ois = new ObjectInputStream(new ByteArrayInputStream(bytes))
-    try {{
-      ois.readObject().asInstanceOf[UdfPacket].function.asInstanceOf[({udf_func_input_types}) => {udf_func_return_type}]
-    }} finally {{
-      ois.close()
-    }}
-  }}
+  private lazy val func: ({udf_func_input_types}) => {udf_func_return_type} =
+    Utils.deserializeFunc("{self.name}.bin").asInstanceOf[({udf_func_input_types}) => {udf_func_return_type}]
 
   def __wrapperFunc({wrapper_arg_and_input_types_str}): {wrapper_return_type} = {{
     {invoke_udf_func}
@@ -299,29 +296,15 @@ def build_scala_udf_imports(session, payload, udf_name, is_map_return) -> List[s
         # Remove the stage path since it is not properly formatted.
         user_jars.append(row[0][row[0].find("/") :])
 
-    # Jars used when the return type is a Map.
-    map_jars = (
-        []
-        if not is_map_return
-        else [
-            f"{stage_resource_path}/json4s-core_2.12-3.7.0-M11.jar",
-            f"{stage_resource_path}/json4s-native_2.12-3.7.0-M11.jar",
-            f"{stage_resource_path}/paranamer-2.8.3.jar",
-        ]
-    )
-
     # Format the user jars to be used in the IMPORTS clause of the stored procedure.
-    return (
-        [
-            closure_binary_file,
-            f"{stage_resource_path}/spark-connect-client-jvm_2.12-3.5.6.jar",
-            f"{stage_resource_path}/spark-common-utils_2.12-3.5.6.jar",
-            f"{stage_resource_path}/spark-sql_2.12-3.5.6.jar",
-            f"{stage_resource_path}/json4s-ast_2.12-3.7.0-M11.jar",
-        ]
-        + map_jars
-        + [f"{stage + jar}" for jar in user_jars]
-    )
+    return [
+        closure_binary_file,
+        f"{stage_resource_path}/spark-connect-client-jvm_2.12-3.5.6.jar",
+        f"{stage_resource_path}/spark-common-utils_2.12-3.5.6.jar",
+        f"{stage_resource_path}/spark-sql_2.12-3.5.6.jar",
+        f"{stage_resource_path}/json4s-ast_2.12-3.7.0-M11.jar",
+        f"{stage_resource_path}/sas-scala-udf_2.12-0.1.0.jar",
+    ] + [f"{stage + jar}" for jar in user_jars]
 
 
 def create_scala_udf(pciudf: ProcessCommonInlineUserDefinedFunction) -> ScalaUdf:
@@ -343,6 +326,14 @@ def create_scala_udf(pciudf: ProcessCommonInlineUserDefinedFunction) -> ScalaUdf
     Returns:
         A ScalaUdf object representing the created or cached Scala UDF.
     """
+    from snowflake.snowpark_connect.resources_initializer import (
+        wait_for_resource_initialization,
+    )
+
+    # Make sure that the resource initializer thread is completed before creating Scala UDFs since we depend on the jars
+    # uploaded by it.
+    wait_for_resource_initialization()
+
     from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
 
     function_name = pciudf._function_name
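
resources_initializer.py also changes in this release (+20 -5) but its body is not shown in this diff, so the exact shape of wait_for_resource_initialization is unknown. As a rough, assumed sketch only, the call above implies a gate along these lines, with an Event set by the background thread that stages the jars:

    import threading

    # Hypothetical sketch; the real resources_initializer module is not part of this diff view.
    _resources_ready = threading.Event()

    def start_resource_initialization(upload_jars) -> None:
        def _run() -> None:
            try:
                upload_jars()           # e.g. stage sas-scala-udf_2.12-0.1.0.jar and friends
            finally:
                _resources_ready.set()  # unblock waiters even if staging fails
        threading.Thread(target=_run, daemon=True).start()

    def wait_for_resource_initialization(timeout: float | None = None) -> bool:
        # create_scala_udf blocks here until the jars are staged.
        return _resources_ready.wait(timeout)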