snowpark-connect 0.20.2__py3-none-any.whl → 0.22.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +3 -2
- snowflake/snowpark_connect/column_name_handler.py +6 -65
- snowflake/snowpark_connect/config.py +47 -17
- snowflake/snowpark_connect/dataframe_container.py +242 -0
- snowflake/snowpark_connect/error/error_utils.py +25 -0
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +13 -23
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +9 -5
- snowflake/snowpark_connect/expression/map_extension.py +2 -1
- snowflake/snowpark_connect/expression/map_udf.py +4 -4
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +8 -7
- snowflake/snowpark_connect/expression/map_unresolved_function.py +481 -170
- snowflake/snowpark_connect/expression/map_unresolved_star.py +8 -8
- snowflake/snowpark_connect/expression/map_update_fields.py +1 -1
- snowflake/snowpark_connect/expression/typer.py +6 -6
- snowflake/snowpark_connect/proto/control_pb2.py +17 -16
- snowflake/snowpark_connect/proto/control_pb2.pyi +17 -17
- snowflake/snowpark_connect/proto/control_pb2_grpc.py +12 -63
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +15 -14
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +19 -14
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +4 -0
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +27 -26
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +74 -68
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +4 -0
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +5 -5
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +25 -17
- snowflake/snowpark_connect/relation/map_aggregate.py +170 -61
- snowflake/snowpark_connect/relation/map_catalog.py +2 -2
- snowflake/snowpark_connect/relation/map_column_ops.py +227 -145
- snowflake/snowpark_connect/relation/map_crosstab.py +25 -6
- snowflake/snowpark_connect/relation/map_extension.py +81 -56
- snowflake/snowpark_connect/relation/map_join.py +72 -63
- snowflake/snowpark_connect/relation/map_local_relation.py +35 -20
- snowflake/snowpark_connect/relation/map_map_partitions.py +24 -17
- snowflake/snowpark_connect/relation/map_relation.py +22 -16
- snowflake/snowpark_connect/relation/map_row_ops.py +232 -146
- snowflake/snowpark_connect/relation/map_sample_by.py +15 -8
- snowflake/snowpark_connect/relation/map_show_string.py +42 -5
- snowflake/snowpark_connect/relation/map_sql.py +141 -237
- snowflake/snowpark_connect/relation/map_stats.py +88 -39
- snowflake/snowpark_connect/relation/map_subquery_alias.py +13 -14
- snowflake/snowpark_connect/relation/map_udtf.py +10 -13
- snowflake/snowpark_connect/relation/read/map_read.py +8 -3
- snowflake/snowpark_connect/relation/read/map_read_csv.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_json.py +19 -8
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_socket.py +7 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +25 -16
- snowflake/snowpark_connect/relation/read/map_read_text.py +7 -7
- snowflake/snowpark_connect/relation/read/reader_config.py +1 -0
- snowflake/snowpark_connect/relation/utils.py +11 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +15 -12
- snowflake/snowpark_connect/relation/write/map_write.py +259 -56
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +3 -2
- snowflake/snowpark_connect/server.py +43 -4
- snowflake/snowpark_connect/type_mapping.py +6 -23
- snowflake/snowpark_connect/utils/cache.py +27 -22
- snowflake/snowpark_connect/utils/context.py +33 -17
- snowflake/snowpark_connect/utils/describe_query_cache.py +2 -9
- snowflake/snowpark_connect/utils/{attribute_handling.py → identifiers.py} +47 -0
- snowflake/snowpark_connect/utils/session.py +41 -38
- snowflake/snowpark_connect/utils/telemetry.py +214 -63
- snowflake/snowpark_connect/utils/udxf_import_utils.py +14 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/__init__.py +0 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.py +36 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.pyi +156 -0
- snowflake/snowpark_decoder/dp_session.py +111 -0
- snowflake/snowpark_decoder/spark_decoder.py +76 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/METADATA +6 -4
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/RECORD +83 -69
- snowpark_connect-0.22.1.dist-info/licenses/LICENSE-binary +568 -0
- snowpark_connect-0.22.1.dist-info/licenses/NOTICE-binary +1533 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/top_level.txt +1 -0
- spark/__init__.py +0 -0
- spark/connect/__init__.py +0 -0
- spark/connect/envelope_pb2.py +31 -0
- spark/connect/envelope_pb2.pyi +46 -0
- snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/licenses/LICENSE.txt +0 -0
snowflake/snowpark_connect/utils/cache.py:

@@ -8,53 +8,58 @@ from typing import Dict, Tuple
 
 import pandas
 
-from snowflake import
-from snowflake.snowpark_connect.column_name_handler import set_schema_getter
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 
-# global cache mapping (sessionID, planID) -> cached snowpark
-df_cache_map: Dict[Tuple[str, any],
+# global cache mapping (sessionID, planID) -> cached snowpark dataframe container.
+df_cache_map: Dict[Tuple[str, any], DataFrameContainer] = {}
 
 # reentrant lock for thread safety
 _cache_map_lock = threading.RLock()
 
 
-def df_cache_map_get(key: Tuple[str, any]) ->
+def df_cache_map_get(key: Tuple[str, any]) -> DataFrameContainer | None:
     with _cache_map_lock:
         return df_cache_map.get(key)
 
 
 def df_cache_map_put_if_absent(
     key: Tuple[str, any],
-    compute_fn: Callable[[],
+    compute_fn: Callable[[], DataFrameContainer | pandas.DataFrame],
     materialize: bool,
-) ->
+) -> DataFrameContainer | pandas.DataFrame:
     """
-    Put a DataFrame into the cache map if the key is absent. Optionally, as side effect, materialize
+    Put a DataFrame container into the cache map if the key is absent. Optionally, as side effect, materialize
     the DataFrame content in a temporary table.
 
     Args:
         key (Tuple[str, int]): The key to insert into the cache map (session_id, plan_id).
-        compute_fn (Callable[[], DataFrame]): A function to compute the DataFrame if the key is absent.
+        compute_fn (Callable[[], DataFrameContainer | pandas.DataFrame]): A function to compute the DataFrame container if the key is absent.
         materialize (bool): Whether to materialize the DataFrame.
 
     Returns:
-
+        DataFrameContainer | pandas.DataFrame: The cached or newly computed DataFrame container.
     """
 
-    def _object_to_cache(
+    def _object_to_cache(
+        container: DataFrameContainer,
+    ) -> DataFrameContainer:
+
         if materialize:
+            df = container.dataframe
             cached_result = df.cache_result()
-
-
-
-
-
-
+            return DataFrameContainer(
+                dataframe=cached_result,
+                column_map=container.column_map,
+                table_name=container.table_name,
+                alias=container.alias,
+                cached_schema_getter=lambda: df.schema,
+            )
+        return container
 
     with _cache_map_lock:
         if key not in df_cache_map:
-
+            result = compute_fn()
 
             # check cache again, since recursive call in compute_fn could've already cached the result.
             # we want return it, instead of saving it again. This is important if materialize = True
@@ -62,19 +67,19 @@ def df_cache_map_put_if_absent(
             if key in df_cache_map:
                 return df_cache_map[key]
 
-            # only cache
+            # only cache DataFrameContainer, but not pandas result.
             # Pandas result is only returned when df.show() is called, where we convert
             # a dataframe to a string representation.
             # We don't expect map_relation would return pandas df here because that would
             # be equivalent to calling df.show().cache(), which is not allowed.
-            if isinstance(
-                df_cache_map[key] = _object_to_cache(
+            if isinstance(result, DataFrameContainer):
+                df_cache_map[key] = _object_to_cache(result)
             else:
                 # This is not expected, but we will just log a warning
                 logger.warning(
                     "Unexpected pandas dataframe returned for caching. Ignoring the cache call."
                 )
-                return
+                return result
 
         return df_cache_map[key]
 
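The hunks above switch the per-plan cache from bare Snowpark DataFrames to DataFrameContainer objects, which also carry the column map, table name, and alias. A minimal sketch of how a caller might use the new API, assuming a container can be built from just a dataframe and its column map (the `cache_plan` helper and its arguments are hypothetical, not part of the package):

```python
# Hypothetical illustration of the container-based cache API shown in the diff.
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
from snowflake.snowpark_connect.utils.cache import df_cache_map_put_if_absent


def cache_plan(session_id: str, plan_id: int, snowpark_df, column_map):
    def compute() -> DataFrameContainer:
        # Wrap the Snowpark DataFrame together with its Spark->Snowflake column mapping.
        # (Constructor arguments beyond these two are assumptions based on the diff.)
        return DataFrameContainer(dataframe=snowpark_df, column_map=column_map)

    # The first call for this (session_id, plan_id) computes and, with
    # materialize=True, snapshots the result via cache_result(); later calls
    # return the cached container.
    return df_cache_map_put_if_absent(
        key=(session_id, plan_id),
        compute_fn=compute,
        materialize=True,
    )
```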
snowflake/snowpark_connect/utils/context.py:

@@ -9,14 +9,14 @@ from typing import Mapping, Optional
 
 import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto
 
-from snowflake import
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.typed_column import TypedColumn
 
 # TODO: remove session id from context when we host SAS in Snowflake server
 
 _session_id = ContextVar[str]("_session_id")
-_plan_id_map = ContextVar[Mapping[int,
-_alias_map = ContextVar[Mapping[str,
+_plan_id_map = ContextVar[Mapping[int, DataFrameContainer]]("_plan_id_map")
+_alias_map = ContextVar[Mapping[str, DataFrameContainer | None]]("_alias_map")
 _spark_version = ContextVar[str]("_spark_version")
 _is_aggregate_function = ContextVar(
     "_is_aggregate_function", default=("default", False)
@@ -40,10 +40,10 @@ _sql_named_args = ContextVar[dict[str, expressions_proto.Expression]]("_sql_name
 _sql_pos_args = ContextVar[dict[int, expressions_proto.Expression]]("_sql_pos_args")
 
 # Used to store the df before the last projection operation
-_df_before_projection = ContextVar[
+_df_before_projection = ContextVar[DataFrameContainer | None](
     "_df_before_projection", default=None
 )
-_outer_dataframes = ContextVar[list[
+_outer_dataframes = ContextVar[list[DataFrameContainer]]("_parent_dataframes")
 
 _spark_client_type_regex = re.compile(r"spark/(?P<spark_version>\d+\.\d+\.\d+)")
 _current_operation = ContextVar[str]("_current_operation", default="default")
@@ -66,6 +66,12 @@ _lca_alias_map: ContextVar[dict[str, TypedColumn]] = ContextVar(
     default={},
 )
 
+# Context variable to track current grouping columns for grouping_id() function
+_current_grouping_columns: ContextVar[list[str]] = ContextVar(
+    "_current_grouping_columns",
+    default=[],
+)
+
 
 def clear_lca_alias_map() -> None:
     _lca_alias_map.set({})
@@ -87,6 +93,16 @@ def resolve_lca_alias(name: str) -> Optional[TypedColumn]:
     return _lca_alias_map.get().get(_normalize(name))
 
 
+def set_current_grouping_columns(columns: list[str]) -> None:
+    """Set the current grouping columns for grouping_id() function."""
+    _current_grouping_columns.set(columns)
+
+
+def get_current_grouping_columns() -> list[str]:
+    """Get the current grouping columns for grouping_id() function."""
+    return _current_grouping_columns.get()
+
+
 def set_session_id(value: str) -> None:
     """Set the session ID for the current context"""
     _session_id.set(value)
@@ -97,13 +113,13 @@ def get_session_id() -> str:
     return _session_id.get(None)
 
 
-def set_plan_id_map(plan_id: int,
+def set_plan_id_map(plan_id: int, container: DataFrameContainer) -> None:
     """Set the plan id map for the current context."""
-    _plan_id_map.get()[plan_id] =
+    _plan_id_map.get()[plan_id] = container
 
 
-def get_plan_id_map(plan_id: int) ->
-    """
+def get_plan_id_map(plan_id: int) -> DataFrameContainer | None:
+    """Get the plan id map for the current context."""
     return _plan_id_map.get().get(plan_id)
 
 
@@ -295,30 +311,30 @@ def get_sql_pos_arg(pos: int) -> expressions_proto.Expression:
     return _sql_pos_args.get()[pos]
 
 
-def set_df_before_projection(df:
+def set_df_before_projection(df: DataFrameContainer | None) -> None:
     """
-    Sets the current DataFrame in the context.
-    This is used to track the DataFrame in the current context.
+    Sets the current DataFrame container in the context.
+    This is used to track the DataFrame container in the current context.
     """
     _df_before_projection.set(df)
 
 
-def get_df_before_projection() ->
+def get_df_before_projection() -> DataFrameContainer | None:
     """
-    Returns the current DataFrame if set, otherwise None.
-    This is used to track the DataFrame in the current context.
+    Returns the current DataFrame container if set, otherwise None.
+    This is used to track the DataFrame container in the current context.
     """
     return _df_before_projection.get()
 
 
 @contextmanager
-def push_outer_dataframe(df:
+def push_outer_dataframe(df: DataFrameContainer):
     _outer_dataframes.get().append(df)
     yield
     _outer_dataframes.get().pop()
 
 
-def get_outer_dataframes() -> list[
+def get_outer_dataframes() -> list[DataFrameContainer]:
     return _outer_dataframes.get()
 
 
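Besides migrating the context variables to DataFrameContainer, these hunks add a context variable that records the grouping columns in scope so that a later grouping_id() expression can be resolved. A small sketch of that flow; the mapper function below is a stand-in for illustration, not the package's actual code:

```python
# Illustrative only: how the new grouping-column context variables could be used
# around an aggregate translation.
from snowflake.snowpark_connect.utils.context import (
    get_current_grouping_columns,
    set_current_grouping_columns,
)


def translate_aggregate_example(grouping_cols: list[str]) -> None:
    # The relation mapper records the GROUP BY columns for the current plan ...
    set_current_grouping_columns(grouping_cols)
    try:
        # ... so an expression mapper handling grouping_id() can read them back
        # without threading them through every call.
        print("grouping columns in scope:", get_current_grouping_columns())
    finally:
        # Reset so later plans in the same context start clean.
        set_current_grouping_columns([])
```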
snowflake/snowpark_connect/utils/describe_query_cache.py:

@@ -131,21 +131,14 @@ def instrument_session_for_describe_cache(session: snowpark.Session):
             logger.debug(f"DDL detected, clearing describe query cache: '{query}'")
             cache.clear()
 
-    def report_query(qid: str, is_internal: bool) -> None:
-        if is_internal:
-            telemetry.report_internal_query()
-        elif qid:
-            telemetry.report_query_id(qid)
-
     def wrap_execute(wrapped_fn):
         def fn(query: str, **kwargs):
             update_cache_for_query(query)
-            is_internal = kwargs.get("_is_internal", False)
             try:
                 result = wrapped_fn(query, **kwargs)
-                report_query(result
+                telemetry.report_query(result, **kwargs)
             except Exception as e:
-                report_query(e
+                telemetry.report_query(e, **kwargs)
                 raise e
             return result
 
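The local report_query helper is removed; the wrapper now hands the raw result or exception, together with the execute kwargs, straight to telemetry.report_query. A generic sketch of the same wrapping pattern with stand-in dependencies (the real code patches the Snowpark connection's execute functions):

```python
# Sketch of the instrumentation pattern above; only the shape of wrap_execute
# mirrors the diff, the injected dependencies are stand-ins.
def wrap_execute(wrapped_fn, telemetry, update_cache_for_query):
    def fn(query: str, **kwargs):
        update_cache_for_query(query)  # may clear the describe-query cache on DDL
        try:
            result = wrapped_fn(query, **kwargs)
            telemetry.report_query(result, **kwargs)  # success: report the result
        except Exception as e:
            telemetry.report_query(e, **kwargs)  # failure: report the exception
            raise e
        return result

    return fn
```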
snowflake/snowpark_connect/utils/identifiers.py (renamed from attribute_handling.py):

@@ -1,6 +1,53 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
+import re
+
+from pyspark.errors import AnalysisException
+
+from snowflake.snowpark._internal.analyzer.analyzer_utils import (
+    quote_name_without_upper_casing,
+)
+from snowflake.snowpark_connect.config import (
+    auto_uppercase_column_identifiers,
+    auto_uppercase_non_column_identifiers,
+)
+
+QUOTED_SPARK_IDENTIFIER = re.compile(r"^`[^`]*(?:``[^`]*)*`$")
+UNQUOTED_SPARK_IDENTIFIER = re.compile(r"^\w+$")
+
+
+def unquote_spark_identifier_if_quoted(spark_name: str) -> str:
+    if UNQUOTED_SPARK_IDENTIFIER.match(spark_name):
+        return spark_name
+
+    if QUOTED_SPARK_IDENTIFIER.match(spark_name):
+        return spark_name[1:-1].replace("``", "`")
+
+    raise AnalysisException(f"Invalid name: {spark_name}")
+
+
+def spark_to_sf_single_id_with_unquoting(name: str) -> str:
+    """
+    Transforms a spark name to a valid snowflake name by quoting and potentially uppercasing it.
+    Unquotes the spark name if necessary. Will raise an AnalysisException if given name is not valid.
+    """
+    return spark_to_sf_single_id(unquote_spark_identifier_if_quoted(name))
+
+
+def spark_to_sf_single_id(name: str, is_column: bool = False) -> str:
+    """
+    Transforms a spark name to a valid snowflake name by quoting and potentially uppercasing it.
+    Assumes that the given spark name doesn't contain quotes,
+    meaning it's either already unquoted, or didn't need quoting.
+    """
+    name = quote_name_without_upper_casing(name)
+    should_uppercase = (
+        auto_uppercase_column_identifiers()
+        if is_column
+        else auto_uppercase_non_column_identifiers()
+    )
+    return name.upper() if should_uppercase else name
 
 
 def split_fully_qualified_spark_name(qualified_name: str | None) -> list[str]:
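The renamed module centralizes Spark-to-Snowflake identifier handling: backtick-quoted Spark names are unquoted (a doubled backtick collapses to a single backtick), then re-quoted for Snowflake and optionally uppercased according to the auto-uppercase configuration. A quick illustration of the expected behavior; since the uppercasing outcome depends on the configuration flags, the final name is only printed here rather than asserted:

```python
# Expected behaviour of the new identifier helpers from the hunk above.
from snowflake.snowpark_connect.utils.identifiers import (
    spark_to_sf_single_id_with_unquoting,
    unquote_spark_identifier_if_quoted,
)

# Backtick-quoted Spark identifiers are unquoted; `` inside becomes a single `.
assert unquote_spark_identifier_if_quoted("`my``col`") == "my`col"

# Plain \w+ identifiers pass through untouched.
assert unquote_spark_identifier_if_quoted("order_id") == "order_id"

# The Snowflake-side name is double-quoted and, depending on the config flags,
# uppercased (e.g. '"ORDER_ID"' or '"order_id"').
print(spark_to_sf_single_id_with_unquoting("order_id"))
```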
snowflake/snowpark_connect/utils/session.py:

@@ -5,11 +5,10 @@
 import logging
 import os
 from collections.abc import Sequence
-from contextlib import contextmanager
 from typing import Any
 
 from snowflake import snowpark
-from snowflake.snowpark.exceptions import SnowparkClientException
+from snowflake.snowpark.exceptions import SnowparkClientException, SnowparkSQLException
 from snowflake.snowpark.session import _get_active_session
 from snowflake.snowpark_connect.constants import DEFAULT_CONNECTION_NAME
 from snowflake.snowpark_connect.utils.describe_query_cache import (
@@ -20,6 +19,18 @@ from snowflake.snowpark_connect.utils.telemetry import telemetry
 from snowflake.snowpark_connect.utils.udf_cache import init_builtin_udf_cache
 
 
+# Suppress experimental warnings from snowflake.snowpark logger
+def _filter_experimental_warnings(record):
+    """Filter function to suppress experimental warnings."""
+    message = record.getMessage()
+    return not (
+        "is experimental since" in message and "Do not use it in production" in message
+    )
+
+
+logging.getLogger("snowflake.snowpark").addFilter(_filter_experimental_warnings)
+
+
 def _get_current_snowpark_session() -> snowpark.Session | None:
     # TODO: this is a temporary solution to get the current session, it would be better to add a function in snowpark
     try:
@@ -34,33 +45,6 @@ def _get_current_snowpark_session() -> snowpark.Session | None:
         raise
 
 
-@contextmanager
-def suppress_experimental_warnings():
-    """
-    Suppress experimental parameter warnings from snowpark logging.
-
-    This context manager filters out logging messages containing
-    "is experimental since" and "Do not use it in production"
-    from the snowpark logger, while preserving other important warnings.
-    """
-    snowpark_logger = logging.getLogger("snowflake.snowpark")
-
-    def filter_experimental_warnings(record):
-        """Filter function to suppress experimental parameter warnings."""
-        message = record.getMessage()
-        return not (
-            "is experimental since" in message
-            and "Do not use it in production" in message
-        )
-
-    snowpark_logger.addFilter(filter_experimental_warnings)
-
-    try:
-        yield
-    finally:
-        snowpark_logger.removeFilter(filter_experimental_warnings)
-
-
 def configure_snowpark_session(session: snowpark.Session):
     """Configure a snowpark session with required parameters and settings."""
     from snowflake.snowpark_connect.config import global_config
@@ -80,11 +64,10 @@ def configure_snowpark_session(session: snowpark.Session):
     # built-in udf cache
     init_builtin_udf_cache(session)
 
-    # Set experimental parameters
-
-
-
-    session.reduce_describe_query_enabled = True
+    # Set experimental parameters (warnings globally suppressed)
+    session.ast_enabled = False
+    session.eliminate_numeric_sql_value_cast_enabled = False
+    session.reduce_describe_query_enabled = True
 
     session._join_alias_fix = True
     session.connection.arrow_number_to_decimal_setter = True
@@ -101,6 +84,30 @@ def configure_snowpark_session(session: snowpark.Session):
     session.sql(
         f"ALTER SESSION SET {', '.join([f'{k} = {v}' for k, v in session_params.items()])}"
     ).collect()
+
+    # Rolling ahead in preparation of GS release 9.22 (ETA 8/5/2025). Once 9.22 is past rollback risk, merge this
+    # parameter with other in the session_params dictionary above
+    try:
+        session.sql(
+            "ALTER SESSION SET ENABLE_STRUCTURED_TYPES_IN_SNOWPARK_CONNECT_RESPONSE=true"
+        ).collect()
+    except SnowparkSQLException:
+        logger.debug(
+            "ENABLE_STRUCTURED_TYPES_IN_SNOWPARK_CONNECT_RESPONSE is not defined"
+        )
+    try:
+        session.sql(
+            "ALTER SESSION SET ENABLE_STRUCTURED_TYPES_NATIVE_ARROW_FORMAT=true"
+        ).collect()
+    except SnowparkSQLException:
+        logger.debug("ENABLE_STRUCTURED_TYPES_NATIVE_ARROW_FORMAT is not defined")
+    try:
+        session.sql(
+            "ALTER SESSION SET ENABLE_STRUCTURED_TYPES_IN_CLIENT_RESPONSE=true"
+        ).collect()
+    except SnowparkSQLException:
+        logger.debug("ENABLE_STRUCTURED_TYPES_IN_CLIENT_RESPONSE is not defined")
+
     # Instrument the snowpark session to use a cache for describe queries.
     instrument_session_for_describe_cache(session)
 
@@ -174,7 +181,3 @@ def set_query_tags(spark_tags: Sequence[str]) -> None:
 
     if spark_tags_str != snowpark_session.query_tag:
         snowpark_session.query_tag = spark_tags_str
-
-
-def get_python_udxf_import_files(session: snowpark.Session) -> str:
-    return ",".join([file for file in [*session._python_files, *session._import_files]])