snowpark-connect 0.25.0__py3-none-any.whl → 0.26.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

This version of snowpark-connect might be problematic.

Files changed (32)
  1. snowflake/snowpark_connect/config.py +10 -0
  2. snowflake/snowpark_connect/dataframe_container.py +16 -0
  3. snowflake/snowpark_connect/expression/map_udf.py +68 -27
  4. snowflake/snowpark_connect/expression/map_unresolved_function.py +22 -21
  5. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  6. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  7. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  8. snowflake/snowpark_connect/relation/map_map_partitions.py +9 -4
  9. snowflake/snowpark_connect/relation/map_relation.py +12 -1
  10. snowflake/snowpark_connect/relation/map_row_ops.py +8 -1
  11. snowflake/snowpark_connect/relation/map_udtf.py +96 -44
  12. snowflake/snowpark_connect/relation/utils.py +44 -0
  13. snowflake/snowpark_connect/relation/write/map_write.py +113 -22
  14. snowflake/snowpark_connect/resources_initializer.py +18 -5
  15. snowflake/snowpark_connect/server.py +8 -1
  16. snowflake/snowpark_connect/utils/concurrent.py +4 -0
  17. snowflake/snowpark_connect/utils/external_udxf_cache.py +36 -0
  18. snowflake/snowpark_connect/utils/scala_udf_utils.py +250 -242
  19. snowflake/snowpark_connect/utils/session.py +4 -0
  20. snowflake/snowpark_connect/utils/udf_utils.py +7 -17
  21. snowflake/snowpark_connect/utils/udtf_utils.py +3 -16
  22. snowflake/snowpark_connect/version.py +1 -1
  23. {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.26.0.dist-info}/METADATA +1 -1
  24. {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.26.0.dist-info}/RECORD +32 -28
  25. {snowpark_connect-0.25.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-connect +0 -0
  26. {snowpark_connect-0.25.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-session +0 -0
  27. {snowpark_connect-0.25.0.data → snowpark_connect-0.26.0.data}/scripts/snowpark-submit +0 -0
  28. {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.26.0.dist-info}/WHEEL +0 -0
  29. {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/LICENSE-binary +0 -0
  30. {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/LICENSE.txt +0 -0
  31. {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.26.0.dist-info}/licenses/NOTICE-binary +0 -0
  32. {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.26.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/write/map_write.py

@@ -16,7 +16,7 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     unquote_if_quoted,
 )
 from snowflake.snowpark.exceptions import SnowparkSQLException
-from snowflake.snowpark.functions import col, lit, object_construct
+from snowflake.snowpark.functions import col, lit, object_construct, sql_expr
 from snowflake.snowpark.types import (
     ArrayType,
     DataType,
@@ -40,7 +40,10 @@ from snowflake.snowpark_connect.relation.io_utils import (
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 from snowflake.snowpark_connect.relation.read.reader_config import CsvWriterConfig
 from snowflake.snowpark_connect.relation.stage_locator import get_paths_from_stage
-from snowflake.snowpark_connect.relation.utils import random_string
+from snowflake.snowpark_connect.relation.utils import (
+    generate_spark_compatible_filename,
+    random_string,
+)
 from snowflake.snowpark_connect.type_mapping import snowpark_to_iceberg_type
 from snowflake.snowpark_connect.utils.context import get_session_id
 from snowflake.snowpark_connect.utils.identifiers import (
@@ -133,45 +136,80 @@ def map_write(request: proto_base.ExecutePlanRequest):
        write_op.source = ""

    should_write_to_single_file = str_to_bool(write_op.options.get("single", "false"))
-    if should_write_to_single_file:
-        # providing default size as 1GB
-        max_file_size = int(
-            write_op.options.get("snowflake_max_file_size", "1073741824")
-        )
+
+    # Support Snowflake-specific snowflake_max_file_size option. This is NOT a spark option.
+    max_file_size = None
+    if (
+        "snowflake_max_file_size" in write_op.options
+        and int(write_op.options["snowflake_max_file_size"]) > 0
+    ):
+        max_file_size = int(write_op.options["snowflake_max_file_size"])
+    elif should_write_to_single_file:
+        # providing default size as 1GB for single file write
+        max_file_size = 1073741824

    match write_op.source:
        case "csv" | "parquet" | "json" | "text":
            write_path = get_paths_from_stage(
                [write_op.path],
                session=session,
            )[0]
-            # we need a random prefix to support "append" mode
-            # otherwise copy into with overwrite=False will fail if the file already exists
+            # Generate Spark-compatible filename with proper extension
+            extension = write_op.source if write_op.source != "text" else "txt"
+
+            # Get compression from options for proper filename generation
+            compression_option = write_op.options.get("compression", "none")
+
+            # Generate Spark-compatible filename or prefix
            if should_write_to_single_file:
-                extention = write_op.source if write_op.source != "text" else "txt"
-                temp_file_prefix_on_stage = (
-                    f"{write_path}/{random_string(10, 'sas_file_')}.{extention}"
+                # Single file: generate complete filename with extension
+                spark_filename = generate_spark_compatible_filename(
+                    task_id=0,
+                    attempt_number=0,
+                    compression=compression_option,
+                    format_ext=extension,
                )
+                temp_file_prefix_on_stage = f"{write_path}/{spark_filename}"
            else:
-                temp_file_prefix_on_stage = (
-                    f"{write_path}/{random_string(10, 'sas_file_')}"
+                # Multiple files: generate prefix without extension (Snowflake will add extensions)
+                spark_filename_prefix = generate_spark_compatible_filename(
+                    task_id=0,
+                    attempt_number=0,
+                    compression=compression_option,
+                    format_ext="",  # No extension for prefix
                )
+                temp_file_prefix_on_stage = f"{write_path}/{spark_filename_prefix}"
            overwrite = (
                write_op.mode
                == commands_proto.WriteOperation.SaveMode.SAVE_MODE_OVERWRITE
            )
+
+            default_compression = "NONE" if write_op.source != "parquet" else "snappy"
+            compression = write_op.options.get(
+                "compression", default_compression
+            ).upper()
            parameters = {
                "location": temp_file_prefix_on_stage,
                "file_format_type": write_op.source
                if write_op.source != "text"
                else "csv",
                "format_type_options": {
-                    "COMPRESSION": "NONE",
+                    "COMPRESSION": compression,
                },
                "overwrite": overwrite,
            }
-            if should_write_to_single_file:
-                parameters["single"] = True
+            # By default, download from the same prefix we wrote to.
+            download_stage_path = temp_file_prefix_on_stage
+
+            # Check for partition hint early to determine precedence over single option
+            partition_hint = result.partition_hint
+
+            # Apply max_file_size for both single and multi-file scenarios
+            # This helps control when Snowflake splits files into multiple parts
+            if max_file_size:
                parameters["max_file_size"] = max_file_size
+            # Only apply single option if no partition hint is present (partition hint takes precedence)
+            if should_write_to_single_file and partition_hint is None:
+                parameters["single"] = True
            rewritten_df: snowpark.DataFrame = rewrite_df(input_df, write_op.source)
            get_param_from_options(parameters, write_op.options, write_op.source)
            if write_op.partitioning_columns:
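
For context, here is a minimal client-side sketch of how these write options might be exercised. The option names ("single", "snowflake_max_file_size", "compression") are the ones read from write_op.options above; the connection URL and output path are hypothetical.

    # Hypothetical Spark Connect client usage of the options handled above.
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    df = spark.range(1_000_000)

    (
        df.write
        .option("single", "true")                        # request one output file
        .option("snowflake_max_file_size", "268435456")  # Snowflake-specific, not a Spark option
        .option("compression", "gzip")
        .mode("overwrite")
        .csv("s3://example-bucket/exports/range_data")   # hypothetical output path
    )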
@@ -186,10 +224,50 @@ def map_write(request: proto_base.ExecutePlanRequest):
                    )
                else:
                    parameters["partition_by"] = partitioning_columns[0]
-            rewritten_df.write.copy_into_location(**parameters)
+
+            # If a partition hint is present (from DataFrame.repartition(n)), optionally split the
+            # write into n COPY INTO calls by assigning a synthetic partition id. Controlled by config.
+            # Note: This affects only the number of output files, not computation semantics.
+            # Partition hints take precedence over single option (matches Spark behavior) when enabled.
+            repartition_for_writes_enabled = (
+                global_config.snowflake_repartition_for_writes
+            )
+            if repartition_for_writes_enabled and partition_hint and partition_hint > 0:
+                # Create a stable synthetic file number per row using ROW_NUMBER() over a
+                # randomized order, then modulo partition_hint. We rely on sql_expr to avoid
+                # adding new helpers.
+                file_num_col = "_sas_file_num"
+                partitioned_df = rewritten_df.withColumn(
+                    file_num_col,
+                    sql_expr(
+                        f"(ROW_NUMBER() OVER (ORDER BY RANDOM())) % {partition_hint}"
+                    ),
+                )
+
+                # Execute multiple COPY INTO operations, one per target file.
+                # Since we write per-partition with distinct prefixes, download from the base write path.
+                download_stage_path = write_path
+                for part_idx in range(partition_hint):
+                    part_params = dict(parameters)
+                    # Preserve Spark-like filename prefix per partition so downloaded basenames
+                    # match the expected Spark pattern (with possible Snowflake counters appended).
+                    per_part_prefix = generate_spark_compatible_filename(
+                        task_id=part_idx,
+                        attempt_number=0,
+                        compression=compression_option,
+                        format_ext="",  # prefix only; Snowflake appends extension/counters
+                    )
+                    part_params["location"] = f"{write_path}/{per_part_prefix}"
+                    (
+                        partitioned_df.filter(col(file_num_col) == lit(part_idx))
+                        .drop(file_num_col)
+                        .write.copy_into_location(**part_params)
+                    )
+
+            else:
+                rewritten_df.write.copy_into_location(**parameters)
            if not is_cloud_path(write_op.path):
                store_files_locally(
-                    temp_file_prefix_on_stage,
+                    download_stage_path,
                    write_op.path,
                    overwrite,
                    session,
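
A hedged sketch of what triggers the partition-hint branch from the client side, reusing the spark session from the previous sketch. Whether the fan-out actually happens depends on the server-side snowflake_repartition_for_writes config shown above; the paths are made up.

    # Hypothetical client code: repartition(n) attaches the partition hint that the
    # branch above turns into n separate COPY INTO calls (when the config is enabled).
    df = spark.read.parquet("s3://example-bucket/raw/events")  # hypothetical input

    (
        df.repartition(4)          # partition hint = 4 -> roughly 4 output files
        .write.mode("overwrite")
        .parquet("s3://example-bucket/exports/events")
    )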
@@ -569,7 +647,12 @@ def _validate_schema_and_get_writer(
        col_name = field.name
        renamed = col_name
        matching_field = next(
-            (f for f in table_schema.fields if f.name.lower() == col_name.lower()),
+            (
+                f
+                for f in table_schema.fields
+                if unquote_if_quoted(f.name).lower()
+                == unquote_if_quoted(col_name).lower()
+            ),
            None,
        )
        if matching_field is not None and matching_field != col_name:
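
To illustrate the quote-insensitive match introduced above (illustrative column names only):

    # unquote_if_quoted strips the surrounding double quotes from a quoted Snowflake
    # identifier, so a quoted table column and an unquoted DataFrame column with the
    # same name now match case-insensitively.
    from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted

    table_col = '"order_id"'  # quoted identifier from the table schema
    data_col = "ORDER_ID"     # unquoted column from the incoming DataFrame

    assert unquote_if_quoted(table_col).lower() == unquote_if_quoted(data_col).lower()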
@@ -591,7 +674,10 @@ def _validate_schema_and_get_writer(


 def _validate_schema_for_append(
-    table_schema: DataType, data_schema: DataType, snowpark_table_name: str
+    table_schema: DataType,
+    data_schema: DataType,
+    snowpark_table_name: str,
+    compare_structs: bool = False,
 ):
     match (table_schema, data_schema):
         case (_, _) if table_schema == data_schema:
@@ -600,7 +686,11 @@ def _validate_schema_for_append(
        case (StructType() as table_struct, StructType() as data_struct):

            def _comparable_col_name(col: str) -> str:
-                return col if global_config.spark_sql_caseSensitive else col.lower()
+                name = col if global_config.spark_sql_caseSensitive else col.lower()
+                if compare_structs:
+                    return name
+                else:
+                    return unquote_if_quoted(name)

            def invalid_struct_schema():
                raise AnalysisException(
@@ -640,6 +730,7 @@ def _validate_schema_for_append(
                    matching_table_field.datatype,
                    data_field.datatype,
                    snowpark_table_name,
+                    compare_structs=True,
                )

            return
snowflake/snowpark_connect/resources_initializer.py

@@ -9,6 +9,7 @@ from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_sess
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger

 _resources_initialized = threading.Event()
+_initializer_lock = threading.Lock()
 SPARK_VERSION = "3.5.6"
 RESOURCE_PATH = "/snowflake/snowpark_connect/resources"

@@ -57,6 +58,9 @@ def initialize_resources() -> None:
        f"spark-connect-client-jvm_2.12-{SPARK_VERSION}.jar",
        f"spark-common-utils_2.12-{SPARK_VERSION}.jar",
        "json4s-ast_2.12-3.7.0-M11.jar",
+        "json4s-native_2.12-3.7.0-M11.jar",
+        "json4s-core_2.12-3.7.0-M11.jar",
+        "paranamer-2.8.3.jar",
    ]

    for jar in jar_files:
@@ -94,10 +98,19 @@ def initialize_resources() -> None:
    logger.info(f"All resources initialized in {time.time() - start_time:.2f}s")


+_resource_initializer = threading.Thread(
+    target=initialize_resources, name="ResourceInitializer"
+)
+
+
 def initialize_resources_async() -> threading.Thread:
     """Start resource initialization in background."""
-    thread = threading.Thread(
-        target=initialize_resources, name="ResourceInitializer", daemon=True
-    )
-    thread.start()
-    return thread
+    with _initializer_lock:
+        if not _resource_initializer.is_alive() and _resource_initializer.ident is None:
+            _resource_initializer.start()
+    return _resource_initializer
+
+
+def wait_for_resource_initialization() -> None:
+    with _initializer_lock:
+        _resource_initializer.join()
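
A sketch of the call pattern these new helpers appear to support; the import path matches the file listed above, but where exactly the server calls them is not shown in this diff.

    # The module-level thread plus the lock make initialize_resources_async() safe
    # to call more than once, and wait_for_resource_initialization() blocks until
    # the jar staging finishes.
    from snowflake.snowpark_connect import resources_initializer

    resources_initializer.initialize_resources_async()        # idempotent start
    # ... serve client requests while jars are staged in the background ...
    resources_initializer.wait_for_resource_initialization()  # join before the jars are needed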
snowflake/snowpark_connect/server.py

@@ -88,6 +88,9 @@ from snowflake.snowpark_connect.utils.context import (
     set_spark_version,
 )
 from snowflake.snowpark_connect.utils.env_utils import get_int_from_env
+from snowflake.snowpark_connect.utils.external_udxf_cache import (
+    clear_external_udxf_cache,
+)
 from snowflake.snowpark_connect.utils.interrupt import (
     interrupt_all_queries,
     interrupt_queries_with_tag,
@@ -436,7 +439,8 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
                    lambda: map_local_relation(relation),  # noqa: B023
                    materialize=True,
                )
-            except Exception:
+            except Exception as e:
+                logger.warning("Failed to put df into cache: %s", str(e))
                # fallback - treat as regular artifact
                _handle_regular_artifact()
            else:
@@ -556,6 +560,9 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
        if class_files:
            write_class_files_to_stage(session, class_files)

+        if any(not name.startswith("cache") for name in filenames.keys()):
+            clear_external_udxf_cache(session)
+
        return proto_base.AddArtifactsResponse(artifacts=list(response.values()))

    def ArtifactStatus(self, request, context):
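
An illustration of the invalidation rule in the hunk above, with made-up artifact names; the assumption here is that "cache"-prefixed artifacts are the internal DataFrame-cache blobs, while anything else may ship new UDF/UDTF code.

    # Any uploaded artifact whose name does not start with "cache" clears the
    # external UDXF caches so stale registrations are not reused.
    filenames = {"cache/df_0a1b2c": b"payload", "jars/custom_udfs.jar": b"payload"}
    if any(not name.startswith("cache") for name in filenames):
        pass  # clear_external_udxf_cache(session) would run here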
snowflake/snowpark_connect/utils/concurrent.py

@@ -64,6 +64,10 @@ class SynchronizedDict(Mapping[K, V]):
        with self._lock.reader():
            return iter(list(self._dict.items()))

+    def clear(self) -> None:
+        with self._lock.writer():
+            self._dict.clear()
+

 class ReadWriteLock:
     class _Reader:
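
A minimal usage sketch for the new clear() method, assuming the item-assignment and get() behavior that external_udxf_cache.py (below) relies on.

    from snowflake.snowpark_connect.utils.concurrent import SynchronizedDict

    cache: SynchronizedDict = SynchronizedDict()
    cache["udf-hash-123"] = "registered-udf-handle"  # hypothetical entry
    assert cache.get("udf-hash-123") == "registered-udf-handle"
    cache.clear()                                    # takes the writer lock, drops all entries
    assert cache.get("udf-hash-123") is None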
snowflake/snowpark_connect/utils/external_udxf_cache.py (new file)

@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+from snowflake.snowpark import Session
+from snowflake.snowpark_connect.utils.concurrent import SynchronizedDict
+
+
+def init_external_udxf_cache(session: Session) -> None:
+    session.external_udfs_cache = SynchronizedDict()
+    session.external_udtfs_cache = SynchronizedDict()
+
+
+def clear_external_udxf_cache(session: Session) -> None:
+    session.external_udfs_cache.clear()
+    session.external_udtfs_cache.clear()
+
+
+def get_external_udf_from_cache(hash: str):
+    return Session.get_active_session().external_udfs_cache.get(hash)
+
+
+def cache_external_udf(hash: int, udf):
+    Session.get_active_session().external_udfs_cache[hash] = udf
+
+
+def clear_external_udtf_cache(session: Session) -> None:
+    session.external_udtfs_cache.clear()
+
+
+def get_external_udtf_from_cache(hash: int):
+    return Session.get_active_session().external_udtfs_cache.get(hash)
+
+
+def cache_external_udtf(hash: int, udf):
+    Session.get_active_session().external_udtfs_cache[hash] = udf
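
A short usage sketch for the new module. The session setup and cache key are assumptions; the real call sites live in udf_utils.py and udtf_utils.py, which this release also changes but which are not shown here.

    # Hypothetical round-trip through the new UDF cache; the hash key and the cached
    # value are placeholders for whatever udf_utils.py actually stores.
    from snowflake.snowpark import Session
    from snowflake.snowpark_connect.utils.external_udxf_cache import (
        cache_external_udf,
        get_external_udf_from_cache,
        init_external_udxf_cache,
    )

    session = Session.builder.getOrCreate()  # assumes connection parameters are configured
    init_external_udxf_cache(session)

    udf_hash = hash(("my_udf_name", "serialized-udf-bytes"))  # hypothetical cache key
    udf = get_external_udf_from_cache(udf_hash)
    if udf is None:
        udf = "newly-registered-udf-handle"  # placeholder for the real registration result
        cache_external_udf(udf_hash, udf)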