snowpark-connect 0.28.1__py3-none-any.whl → 0.29.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/config.py +11 -2
- snowflake/snowpark_connect/expression/map_unresolved_function.py +172 -210
- snowflake/snowpark_connect/relation/io_utils.py +21 -1
- snowflake/snowpark_connect/relation/map_extension.py +21 -4
- snowflake/snowpark_connect/relation/map_map_partitions.py +7 -8
- snowflake/snowpark_connect/relation/map_relation.py +1 -3
- snowflake/snowpark_connect/relation/read/map_read.py +22 -3
- snowflake/snowpark_connect/relation/read/map_read_csv.py +105 -26
- snowflake/snowpark_connect/relation/read/map_read_json.py +45 -34
- snowflake/snowpark_connect/relation/read/map_read_text.py +6 -1
- snowflake/snowpark_connect/relation/stage_locator.py +85 -53
- snowflake/snowpark_connect/relation/write/map_write.py +38 -4
- snowflake/snowpark_connect/server.py +18 -13
- snowflake/snowpark_connect/utils/context.py +0 -14
- snowflake/snowpark_connect/utils/io_utils.py +36 -0
- snowflake/snowpark_connect/utils/session.py +3 -0
- snowflake/snowpark_connect/utils/udf_cache.py +37 -7
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.28.1.dist-info → snowpark_connect-0.29.0.dist-info}/METADATA +3 -2
- {snowpark_connect-0.28.1.dist-info → snowpark_connect-0.29.0.dist-info}/RECORD +28 -28
- {snowpark_connect-0.28.1.data → snowpark_connect-0.29.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.28.1.data → snowpark_connect-0.29.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.28.1.data → snowpark_connect-0.29.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.28.1.dist-info → snowpark_connect-0.29.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.28.1.dist-info → snowpark_connect-0.29.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.28.1.dist-info → snowpark_connect-0.29.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.28.1.dist-info → snowpark_connect-0.29.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.28.1.dist-info → snowpark_connect-0.29.0.dist-info}/top_level.txt +0 -0
--- snowflake/snowpark_connect/relation/stage_locator.py (0.28.1)
+++ snowflake/snowpark_connect/relation/stage_locator.py (0.29.0)
@@ -5,6 +5,7 @@
 import os

 from fsspec.core import url_to_fs
+from pyspark.errors.exceptions.base import AnalysisException
 from s3fs.core import S3FileSystem

 from snowflake import snowpark
@@ -33,37 +34,42 @@ def get_paths_from_stage(

     # TODO : What if GCP?
     # TODO: What if already stage path?
-
-
-
-            _, bucket_name, path = parse_azure_url(p)
-            rewrite_paths.append(f"{stage_name}/{path}")
-        paths = rewrite_paths
-    else:
-        filesystem, parsed_path = url_to_fs(paths[0])
-        if isinstance(filesystem, S3FileSystem):  # aws
-            # Remove bucket name from the path since the stage name will replace
-            # the bucket name in the path.
-            paths = [
-                f"{stage_name}/{'/'.join(url_to_fs(p)[1].split('/')[1:])}"
-                for p in paths
-            ]
-        else:  # local
-            # For local files, we need to preserve directory structure for partitioned data
-            # Instead of just using basename, we'll use the last few path components
-            new_paths = []
+    match get_cloud_from_url(paths[0]):
+        case "azure":
+            rewrite_paths = []
             for p in paths:
-
-
-
-
-
-
-
-
-
-
-
+                _, bucket_name, path = parse_azure_url(p)
+                rewrite_paths.append(f"{stage_name}/{path}")
+            paths = rewrite_paths
+        case "gcp":
+            raise AnalysisException(
+                "You must configure an integration for Google Cloud Storage to perform I/O operations rather than accessing the URL directly. Reference: https://docs.snowflake.com/en/user-guide/data-load-gcs-config"
+            )
+        case _:
+            filesystem, parsed_path = url_to_fs(paths[0])
+            if isinstance(filesystem, S3FileSystem):  # aws
+                # Remove bucket name from the path since the stage name will replace
+                # the bucket name in the path.
+                paths = [
+                    f"{stage_name}/{'/'.join(url_to_fs(p)[1].split('/')[1:])}"
+                    for p in paths
+                ]
+            else:  # local
+                # For local files, we need to preserve directory structure for partitioned data
+                # Instead of just using basename, we'll use the last few path components
+                new_paths = []
+                for p in paths:
+                    # Split the path and take the last 2-3 components to preserve structure
+                    # but avoid very long paths
+                    path_parts = p.split(os.sep)
+                    if len(path_parts) >= 2:
+                        # Take last 2 components (e.g., "base_case/x=abc")
+                        relative_path = "/".join(path_parts[-2:])
+                    else:
+                        # Single component, use basename
+                        relative_path = os.path.basename(p)
+                    new_paths.append(f"{stage_name}/{relative_path}")
+                paths = new_paths

     return paths

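The rewritten get_paths_from_stage dispatches on the cloud provider up front and rejects Google Cloud Storage URLs with an AnalysisException instead of letting them fall through to the S3/local branch. Below is a standalone sketch of just the "gcp" and local arms of that dispatch, using a hypothetical cloud_of helper and a plain ValueError in place of the package's get_cloud_from_url and AnalysisException:

from urllib.parse import urlparse


def cloud_of(url: str) -> str:
    # Hypothetical stand-in for the package's get_cloud_from_url helper:
    # classify purely by URL scheme.
    scheme = urlparse(url).scheme
    if scheme in ("wasb", "wasbs", "abfs", "abfss"):
        return "azure"
    if scheme in ("gs", "gcs"):
        return "gcp"
    return "other"  # s3/s3a URLs and plain local paths fall through


def rewrite_onto_stage(paths: list[str], stage_name: str) -> list[str]:
    match cloud_of(paths[0]):
        case "gcp":
            # Mirrors the new AnalysisException: GCS requires a storage integration.
            raise ValueError(
                "Configure a Snowflake storage integration for GCS instead of "
                "reading the gs:// URL directly."
            )
        case _:
            # Local branch from the diff: keep the last two path components so
            # partition directories such as x=abc survive the move to the stage.
            out = []
            for p in paths:
                parts = p.split("/")
                rel = "/".join(parts[-2:]) if len(parts) >= 2 else parts[-1]
                out.append(f"{stage_name}/{rel}")
            return out


print(rewrite_onto_stage(["/tmp/base_case/x=abc/part-0.parquet"], "@my_stage"))
# -> ['@my_stage/x=abc/part-0.parquet']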
@@ -102,15 +108,21 @@ class StageLocator:
         sql_query = f"CREATE OR REPLACE TEMP STAGE {stage_name[1:]} URL='azure://{account}.blob.core.windows.net/{bucket_name}'"

         credential_session_key = (
-            f"fs.azure.sas.
+            f"fs.azure.sas.fixed.token.{account}.dfs.core.windows.net",
+            f"fs.azure.sas.{bucket_name}.{account}.blob.core.windows.net",
         )
         credential = sessions_config.get(spark_session_id, None)
-
-
-
-
-
-
+        sas_token = None
+        for session_key in credential_session_key:
+            if (
+                credential is not None
+                and credential.get(session_key) is not None
+                and credential.get(session_key).strip() != ""
+            ):
+                sas_token = credential.get(session_key)
+                break
+        if sas_token is not None:
+            sql_query += f" CREDENTIALS = (AZURE_SAS_TOKEN = '{sas_token}')"

         logger.info(self.session.sql(sql_query).collect())
         self.stages_for_azure[bucket_name] = stage_name
@@ -128,24 +140,44 @@ class StageLocator:
         # but the rest of the time it's used, it does. We just drop it here.
         sql_query = f"CREATE OR REPLACE TEMP STAGE {stage_name[1:]} URL='s3://{parsed_path.split('/')[0]}'"
         credential = sessions_config.get(spark_session_id, None)
-        if
-
-
-
-
-
-
-
-
-
-
-
-
+        if credential is not None:
+            if (  # USE AWS KEYS to connect
+                credential.get("spark.hadoop.fs.s3a.access.key") is not None
+                and credential.get("spark.hadoop.fs.s3a.secret.key")
+                is not None
+                and credential.get("spark.hadoop.fs.s3a.access.key").strip()
+                != ""
+                and credential.get("spark.hadoop.fs.s3a.secret.key").strip()
+                != ""
+            ):
+                aws_keys = f" AWS_KEY_ID = '{credential.get('spark.hadoop.fs.s3a.access.key')}'"
+                aws_keys += f" AWS_SECRET_KEY = '{credential.get('spark.hadoop.fs.s3a.secret.key')}'"
+                if (
+                    credential.get("spark.hadoop.fs.s3a.session.token")
+                    is not None
+                ):
+                    aws_keys += f" AWS_TOKEN = '{credential.get('spark.hadoop.fs.s3a.session.token')}'"
+                sql_query += f" CREDENTIALS = ({aws_keys})"
+                sql_query += " ENCRYPTION = ( TYPE = 'AWS_SSE_S3' )"
+            elif (  # USE AWS ROLE and KMS KEY to connect
+                credential.get(
+                    "spark.hadoop.fs.s3a.server-side-encryption.key"
+                )
+                is not None
+                and credential.get(
+                    "spark.hadoop.fs.s3a.server-side-encryption.key"
+                ).strip()
+                != ""
+                and credential.get("spark.hadoop.fs.s3a.assumed.role.arn")
                 is not None
+                and credential.get(
+                    "spark.hadoop.fs.s3a.assumed.role.arn"
+                ).strip()
+                != ""
             ):
-
-
-
+                aws_role = f" AWS_ROLE = '{credential.get('spark.hadoop.fs.s3a.assumed.role.arn')}'"
+                sql_query += f" CREDENTIALS = ({aws_role})"
+                sql_query += f" ENCRYPTION = ( TYPE='AWS_SSE_KMS' KMS_KEY_ID = '{credential.get('spark.hadoop.fs.s3a.server-side-encryption.key')}' )"

         logger.info(self.session.sql(sql_query).collect())
         self.stages_for_aws[bucket_name] = stage_name
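On the S3 side, the temp-stage DDL now picks up credentials from the Spark session configuration: static fs.s3a keys (plus an optional session token) take precedence, otherwise an assumed role with a KMS key is used. A rough standalone sketch of that precedence, with a hypothetical s3_stage_clauses helper rather than the StageLocator method itself:

def s3_stage_clauses(conf: dict[str, str]) -> str:
    """Illustrative helper (not part of the package): build the CREDENTIALS /
    ENCRYPTION suffix for CREATE TEMP STAGE from Spark-style fs.s3a options,
    mirroring the precedence shown in the diff."""

    def has(key: str) -> bool:
        value = conf.get(key)
        return value is not None and value.strip() != ""

    if has("spark.hadoop.fs.s3a.access.key") and has("spark.hadoop.fs.s3a.secret.key"):
        # Static keys win; a session token is appended when present.
        creds = (
            f" AWS_KEY_ID = '{conf['spark.hadoop.fs.s3a.access.key']}'"
            f" AWS_SECRET_KEY = '{conf['spark.hadoop.fs.s3a.secret.key']}'"
        )
        if conf.get("spark.hadoop.fs.s3a.session.token") is not None:
            creds += f" AWS_TOKEN = '{conf['spark.hadoop.fs.s3a.session.token']}'"
        return f" CREDENTIALS = ({creds}) ENCRYPTION = ( TYPE = 'AWS_SSE_S3' )"

    if has("spark.hadoop.fs.s3a.server-side-encryption.key") and has(
        "spark.hadoop.fs.s3a.assumed.role.arn"
    ):
        # Otherwise fall back to an assumed role plus a KMS key.
        role = conf["spark.hadoop.fs.s3a.assumed.role.arn"]
        kms = conf["spark.hadoop.fs.s3a.server-side-encryption.key"]
        return (
            f" CREDENTIALS = ( AWS_ROLE = '{role}' )"
            f" ENCRYPTION = ( TYPE='AWS_SSE_KMS' KMS_KEY_ID = '{kms}' )"
        )

    return ""  # nothing usable in the Spark config; create the stage without credentials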
--- snowflake/snowpark_connect/relation/write/map_write.py (0.28.1)
+++ snowflake/snowpark_connect/relation/write/map_write.py (0.29.0)
@@ -36,6 +36,8 @@ from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.io_utils import (
     convert_file_prefix_path,
     is_cloud_path,
+    is_supported_compression,
+    supported_compressions_for_format,
 )
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 from snowflake.snowpark_connect.relation.read.reader_config import CsvWriterConfig
@@ -179,7 +181,7 @@ def map_write(request: proto_base.ExecutePlanRequest):
                     f"Skipping REMOVE for root path {write_path} - too broad scope"
                 )
             else:
-                remove_command = f"REMOVE {write_path}/"
+                remove_command = f"REMOVE '{write_path}/'"
                 session.sql(remove_command).collect()
                 logger.info(f"Successfully cleared directory: {write_path}")
         except Exception as e:
@@ -208,6 +210,20 @@ def map_write(request: proto_base.ExecutePlanRequest):
         compression = write_op.options.get(
             "compression", default_compression
         ).upper()
+
+        if not is_supported_compression(write_op.source, compression):
+            supported_compressions = supported_compressions_for_format(
+                write_op.source
+            )
+            raise AnalysisException(
+                f"Compression {compression} is not supported for {write_op.source} format. "
+                + (
+                    f"Supported compressions: {sorted(supported_compressions)}"
+                    if supported_compressions
+                    else "No compression supported for this format."
+                )
+            )
+
         parameters = {
             "location": temp_file_prefix_on_stage,
             "file_format_type": write_op.source
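The new pre-flight check turns an unsupported codec choice into an AnalysisException before any data is copied. The helpers come from relation/io_utils.py, which this diff does not show; the sketch below only illustrates their shape, with an invented format-to-codec table that should not be read as the package's actual mapping:

# Illustrative only: the real tables live in
# snowflake/snowpark_connect/relation/io_utils.py and may differ.
_SUPPORTED_COMPRESSIONS = {
    "csv": {"NONE", "GZIP", "BZIP2", "BROTLI", "ZSTD", "DEFLATE", "RAW_DEFLATE"},
    "json": {"NONE", "GZIP", "BZIP2", "BROTLI", "ZSTD", "DEFLATE", "RAW_DEFLATE"},
    "parquet": {"NONE", "SNAPPY", "LZO"},
}


def supported_compressions_for_format(fmt: str) -> set[str]:
    return _SUPPORTED_COMPRESSIONS.get(fmt.lower(), set())


def is_supported_compression(fmt: str, compression: str) -> bool:
    return compression.upper() in supported_compressions_for_format(fmt)


# With the illustrative table above, a parquet write with gzip would be rejected.
assert is_supported_compression("parquet", "snappy")
assert not is_supported_compression("parquet", "gzip")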
@@ -417,9 +433,27 @@ def map_write(request: proto_base.ExecutePlanRequest):
                     )
                 case _:
                     snowpark_table_name = _spark_to_snowflake(write_op.table.table_name)
+                    save_method = write_op.table.save_method
+
+                    if (
+                        write_op.source == "snowflake"
+                        and write_op.table.save_method
+                        == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_UNSPECIFIED
+                    ):
+                        save_method = (
+                            commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE
+                        )
+                        if len(write_op.table.table_name) == 0:
+                            dbtable_name = write_op.options.get("dbtable", "")
+                            if len(dbtable_name) == 0:
+                                raise SnowparkConnectNotImplementedError(
+                                    "Save command is not supported without a table name"
+                                )
+                            else:
+                                snowpark_table_name = _spark_to_snowflake(dbtable_name)

                     if (
-
+                        save_method
                         == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE
                     ):
                         match write_mode:
@@ -481,7 +515,7 @@ def map_write(request: proto_base.ExecutePlanRequest):
                             column_order=_column_order_for_write,
                         )
                     elif (
-
+                        save_method
                         == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO
                     ):
                         _validate_schema_and_get_writer(
@@ -493,7 +527,7 @@ def map_write(request: proto_base.ExecutePlanRequest):
                         )
                     else:
                         raise SnowparkConnectNotImplementedError(
-                            f"Save command not supported: {
+                            f"Save command not supported: {save_method}"
                         )


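For format("snowflake") writes that arrive without an explicit save method, the new branch promotes the unspecified method to save-as-table and falls back to the dbtable option for the target table name. A condensed sketch of that decision, with plain strings standing in for the Spark Connect proto enum values:

def resolve_snowflake_target(
    source: str, save_method: str, table_name: str, options: dict[str, str]
) -> tuple[str, str]:
    """Sketch of the new defaulting logic; strings replace the proto enum."""
    if source == "snowflake" and save_method == "UNSPECIFIED":
        save_method = "SAVE_AS_TABLE"
        if not table_name:
            # No table name on the proto: try the Snowflake connector-style option.
            table_name = options.get("dbtable", "")
            if not table_name:
                raise NotImplementedError(
                    "Save command is not supported without a table name"
                )
    return save_method, table_name


print(resolve_snowflake_target("snowflake", "UNSPECIFIED", "", {"dbtable": "MY_DB.MY_SCHEMA.T"}))
# -> ('SAVE_AS_TABLE', 'MY_DB.MY_SCHEMA.T')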
--- snowflake/snowpark_connect/server.py (0.28.1)
+++ snowflake/snowpark_connect/server.py (0.29.0)
@@ -1161,23 +1161,28 @@ def get_session(url: Optional[str] = None, conf: SparkConf = None) -> SparkSession:


 def init_spark_session(conf: SparkConf = None) -> SparkSession:
-
-
-
-
-
-
-
-
-
-
-
-
+    if os.environ.get("JAVA_HOME") is None:
+        try:
+            # For Notebooks on SPCS
+            from jdk4py import JAVA_HOME
+
+            os.environ["JAVA_HOME"] = str(JAVA_HOME)
+        except ModuleNotFoundError:
+            # For notebooks on Warehouse
+            conda_prefix = os.environ.get("CONDA_PREFIX")
+            if conda_prefix is not None:
+                os.environ["JAVA_HOME"] = conda_prefix
+                os.environ["JAVA_LD_LIBRARY_PATH"] = os.path.join(
+                    conda_prefix, "lib", "server"
+                )
+    logger.info("JAVA_HOME=%s", os.environ.get("JAVA_HOME", "Not defined"))

     os.environ["SPARK_LOCAL_HOSTNAME"] = "127.0.0.1"
     os.environ["SPARK_CONNECT_MODE_ENABLED"] = "1"

-
+    from snowflake.snowpark_connect.utils.session import _get_current_snowpark_session
+
+    snowpark_session = _get_current_snowpark_session()
     start_session(snowpark_session=snowpark_session)
     return get_session(conf=conf)

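init_spark_session now bootstraps JAVA_HOME itself rather than assuming the environment provides it: an existing value is respected, then the jdk4py wheel is tried (SPCS notebooks), then CONDA_PREFIX (warehouse notebooks). A condensed sketch of that resolution order:

import os


def ensure_java_home() -> str | None:
    """Sketch of the fallback order added to init_spark_session."""
    if os.environ.get("JAVA_HOME") is not None:
        return os.environ["JAVA_HOME"]
    try:
        from jdk4py import JAVA_HOME  # bundled JRE, if the wheel is installed

        os.environ["JAVA_HOME"] = str(JAVA_HOME)
    except ModuleNotFoundError:
        conda_prefix = os.environ.get("CONDA_PREFIX")
        if conda_prefix is not None:
            os.environ["JAVA_HOME"] = conda_prefix
            os.environ["JAVA_LD_LIBRARY_PATH"] = os.path.join(conda_prefix, "lib", "server")
    return os.environ.get("JAVA_HOME")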
--- snowflake/snowpark_connect/utils/context.py (0.28.1)
+++ snowflake/snowpark_connect/utils/context.py (0.29.0)
@@ -30,9 +30,6 @@ _sql_aggregate_function_count = ContextVar[int](
     "_contains_aggregate_function", default=0
 )

-# Context for parsing map_partitions
-_map_partitions_stack = ContextVar[int]("_map_partitions_stack", default=0)
-
 # We have to generate our own plan IDs that are different from Spark's.
 # Spark plan IDs start at 0, so pick a "big enough" number to avoid overlaps.
 _STARTING_SQL_PLAN_ID = 0x80000000
@@ -230,16 +227,6 @@ def push_evaluating_join_condition(join_type, left_keys, right_keys):
         _is_evaluating_join_condition.set(prev)


-@contextmanager
-def push_map_partitions():
-    _map_partitions_stack.set(_map_partitions_stack.get() + 1)
-    yield
-
-
-def map_partitions_depth() -> int:
-    return _map_partitions_stack.get()
-
-
 @contextmanager
 def push_sql_scope():
     """
@@ -410,7 +397,6 @@ def clear_context_data() -> None:
     _view_process_context.set([])
     _next_sql_plan_id.set(_STARTING_SQL_PLAN_ID)
     _sql_plan_name_map.set({})
-    _map_partitions_stack.set(0)
     _sql_aggregate_function_count.set(0)
     _sql_named_args.set({})
     _sql_pos_args.set({})
--- snowflake/snowpark_connect/utils/io_utils.py (0.28.1)
+++ snowflake/snowpark_connect/utils/io_utils.py (0.29.0)
@@ -3,10 +3,46 @@
 #
 import contextlib
 import functools
+import re

 from snowflake.snowpark import Session
+from snowflake.snowpark._internal.analyzer.analyzer_utils import (
+    create_file_format_statement,
+)
 from snowflake.snowpark_connect.utils.identifiers import FQN

+_MINUS_AT_THE_BEGINNING_REGEX = re.compile(r"^-")
+
+
+def cached_file_format(
+    session: Session, file_format: str, format_type_options: dict[str, str]
+) -> str:
+    """
+    Cache and return a file format name based on the given options.
+    """
+
+    function_name = _MINUS_AT_THE_BEGINNING_REGEX.sub(
+        "1", str(hash(frozenset(format_type_options.items())))
+    )
+    file_format_name = f"__SNOWPARK_CONNECT_FILE_FORMAT__{file_format}_{function_name}"
+    if file_format_name in session._file_formats:
+        return file_format_name
+
+    session.sql(
+        create_file_format_statement(
+            file_format_name,
+            file_format,
+            format_type_options,
+            temp=True,
+            if_not_exist=True,
+            use_scoped_temp_objects=False,
+            is_generated=True,
+        )
+    ).collect()
+
+    session._file_formats.add(file_format_name)
+    return file_format_name
+

 @functools.cache
 def file_format(
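The new cached_file_format keeps one TEMP FILE FORMAT per distinct option set and remembers it in session._file_formats (initialised in configure_snowpark_session, shown next), so repeated reads with identical options skip the CREATE statement. The naming scheme can be exercised without a connection; a small sketch reproducing just the key construction from the diff:

import re

_MINUS_AT_THE_BEGINNING_REGEX = re.compile(r"^-")


def file_format_cache_key(file_format: str, options: dict[str, str]) -> str:
    # Same naming scheme as cached_file_format: hash the option set and flip a
    # leading minus sign to "1" so the suffix stays identifier-friendly.
    digest = _MINUS_AT_THE_BEGINNING_REGEX.sub(
        "1", str(hash(frozenset(options.items())))
    )
    return f"__SNOWPARK_CONNECT_FILE_FORMAT__{file_format}_{digest}"


# Identical options map to the same name, so the CREATE runs once per session;
# note that hash() is salted per process, so the names differ across runs.
a = file_format_cache_key("CSV", {"FIELD_DELIMITER": ",", "SKIP_HEADER": "1"})
b = file_format_cache_key("CSV", {"SKIP_HEADER": "1", "FIELD_DELIMITER": ","})
assert a == b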
--- snowflake/snowpark_connect/utils/session.py (0.28.1)
+++ snowflake/snowpark_connect/utils/session.py (0.29.0)
@@ -71,6 +71,9 @@ def configure_snowpark_session(session: snowpark.Session):
     init_builtin_udf_cache(session)
     init_external_udxf_cache(session)

+    # file format cache
+    session._file_formats = set()
+
     # Set experimental parameters (warnings globally suppressed)
     session.ast_enabled = False
     session.eliminate_numeric_sql_value_cast_enabled = False
--- snowflake/snowpark_connect/utils/udf_cache.py (0.28.1)
+++ snowflake/snowpark_connect/utils/udf_cache.py (0.29.0)
@@ -98,7 +98,11 @@ def cached_udaf(
     # Register the function outside the lock to avoid contention
     wrapped_func = udaf(
         udaf_type,
-        name=
+        name=[
+            Session.get_active_session().get_current_database(),
+            Session.get_active_session().get_current_schema(),
+            name,
+        ],
         return_type=return_type,
         input_types=input_types,
         imports=imports,
@@ -155,7 +159,11 @@ def cached_udf(
     # but this will not cause any issues.
     wrapped_func = udf(
         _null_safe_wrapper,
-        name=
+        name=[
+            Session.get_active_session().get_current_database(),
+            Session.get_active_session().get_current_schema(),
+            name,
+        ],
         return_type=return_type,
         input_types=input_types,
         imports=imports,
@@ -205,7 +213,11 @@ def cached_udtf(
     # Register the function outside the lock to avoid contention
     wrapped_func = udtf(
         func,
-        name=
+        name=[
+            Session.get_active_session().get_current_database(),
+            Session.get_active_session().get_current_schema(),
+            name,
+        ],
         output_schema=output_schema,
         input_types=input_types,
         imports=imports,
@@ -306,11 +318,20 @@ def register_cached_sql_udf(
         )

         with _lock:
-
+            function_identifier = ".".join(
+                [
+                    Session.get_active_session().get_current_database(),
+                    Session.get_active_session().get_current_schema(),
+                    function_name,
+                ]
+            )
+            cache[function_name] = function_identifier
+    else:
+        function_identifier = cache[function_name]

     return functools.partial(
         call_udf,
-
+        function_identifier,
     )


@@ -384,9 +405,18 @@ def register_cached_java_udf(
         )

         with _lock:
-
+            function_identifier = ".".join(
+                [
+                    Session.get_active_session().get_current_database(),
+                    Session.get_active_session().get_current_schema(),
+                    function_name,
+                ]
+            )
+            cache[function_name] = function_identifier
+    else:
+        function_identifier = cache[function_name]

     return functools.partial(
         call_udf,
-
+        function_identifier,
     )
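All of the cached UDx registration paths now pass a fully qualified [database, schema, name] identifier instead of a bare name, presumably so cached functions still resolve if the session's current schema changes between registration and call. A minimal sketch of that convention, assuming an active Snowpark session with a current database and schema set:

from snowflake.snowpark import Session


def fully_qualified(name: str) -> str:
    # Mirrors the triple the cache now hands to udf()/udtf()/udaf() and call_udf();
    # assumes Session.get_active_session() returns a session with db/schema set.
    session = Session.get_active_session()
    return ".".join(
        [session.get_current_database(), session.get_current_schema(), name]
    )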
--- snowpark_connect-0.28.1.dist-info/METADATA
+++ snowpark_connect-0.29.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: snowpark-connect
-Version: 0.28.1
+Version: 0.29.0
 Summary: Snowpark Connect for Spark
 Author: Snowflake, Inc
 License: Apache License, Version 2.0
@@ -16,7 +16,7 @@ Requires-Dist: jpype1
 Requires-Dist: protobuf<5.0,>=4.25.3
 Requires-Dist: s3fs>=2025.3.0
 Requires-Dist: snowflake.core<2,>=1.0.5
-Requires-Dist: snowflake-snowpark-python[pandas]<1.40.0,==1.39.
+Requires-Dist: snowflake-snowpark-python[pandas]<1.40.0,==1.39.1
 Requires-Dist: sqlglot>=26.3.8
 Requires-Dist: jaydebeapi
 Requires-Dist: aiobotocore~=2.23.0
@@ -27,6 +27,7 @@ Requires-Dist: grpcio<1.63,>=1.56.0
 Requires-Dist: grpcio-status<1.63,>=1.56.0
 Requires-Dist: googleapis-common-protos>=1.56.4
 Requires-Dist: numpy<2,>=1.15
+Requires-Dist: gcsfs>=2025.9.0
 Dynamic: author
 Dynamic: description
 Dynamic: description-content-type