snowpark-connect 0.25.0__py3-none-any.whl → 0.27.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/config.py +10 -3
- snowflake/snowpark_connect/dataframe_container.py +16 -0
- snowflake/snowpark_connect/expression/map_expression.py +15 -0
- snowflake/snowpark_connect/expression/map_udf.py +68 -27
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +18 -0
- snowflake/snowpark_connect/expression/map_unresolved_function.py +38 -28
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/relation/map_extension.py +9 -7
- snowflake/snowpark_connect/relation/map_map_partitions.py +36 -72
- snowflake/snowpark_connect/relation/map_relation.py +15 -2
- snowflake/snowpark_connect/relation/map_row_ops.py +8 -1
- snowflake/snowpark_connect/relation/map_show_string.py +2 -0
- snowflake/snowpark_connect/relation/map_sql.py +63 -2
- snowflake/snowpark_connect/relation/map_udtf.py +96 -44
- snowflake/snowpark_connect/relation/utils.py +44 -0
- snowflake/snowpark_connect/relation/write/map_write.py +135 -24
- snowflake/snowpark_connect/resources_initializer.py +18 -5
- snowflake/snowpark_connect/server.py +12 -2
- snowflake/snowpark_connect/utils/artifacts.py +4 -5
- snowflake/snowpark_connect/utils/concurrent.py +4 -0
- snowflake/snowpark_connect/utils/context.py +41 -1
- snowflake/snowpark_connect/utils/external_udxf_cache.py +36 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +86 -2
- snowflake/snowpark_connect/utils/scala_udf_utils.py +250 -242
- snowflake/snowpark_connect/utils/session.py +4 -0
- snowflake/snowpark_connect/utils/udf_utils.py +71 -118
- snowflake/snowpark_connect/utils/udtf_helper.py +17 -7
- snowflake/snowpark_connect/utils/udtf_utils.py +3 -16
- snowflake/snowpark_connect/version.py +2 -3
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/RECORD +41 -37
- {snowpark_connect-0.25.0.data → snowpark_connect-0.27.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.25.0.data → snowpark_connect-0.27.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.25.0.data → snowpark_connect-0.27.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/top_level.txt +0 -0
--- a/snowflake/snowpark_connect/relation/write/map_write.py
+++ b/snowflake/snowpark_connect/relation/write/map_write.py
@@ -16,7 +16,7 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     unquote_if_quoted,
 )
 from snowflake.snowpark.exceptions import SnowparkSQLException
-from snowflake.snowpark.functions import col, lit, object_construct
+from snowflake.snowpark.functions import col, lit, object_construct, sql_expr
 from snowflake.snowpark.types import (
     ArrayType,
     DataType,
@@ -40,7 +40,10 @@ from snowflake.snowpark_connect.relation.io_utils import (
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 from snowflake.snowpark_connect.relation.read.reader_config import CsvWriterConfig
 from snowflake.snowpark_connect.relation.stage_locator import get_paths_from_stage
-from snowflake.snowpark_connect.relation.utils import
+from snowflake.snowpark_connect.relation.utils import (
+    generate_spark_compatible_filename,
+    random_string,
+)
 from snowflake.snowpark_connect.type_mapping import snowpark_to_iceberg_type
 from snowflake.snowpark_connect.utils.context import get_session_id
 from snowflake.snowpark_connect.utils.identifiers import (
@@ -48,6 +51,7 @@ from snowflake.snowpark_connect.utils.identifiers import (
     split_fully_qualified_spark_name,
 )
 from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
+from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
     telemetry,
@@ -133,45 +137,99 @@ def map_write(request: proto_base.ExecutePlanRequest):
         write_op.source = ""

     should_write_to_single_file = str_to_bool(write_op.options.get("single", "false"))
-
-
-
-
-
+
+    # Support Snowflake-specific snowflake_max_file_size option. This is NOT a spark option.
+    max_file_size = None
+    if (
+        "snowflake_max_file_size" in write_op.options
+        and int(write_op.options["snowflake_max_file_size"]) > 0
+    ):
+        max_file_size = int(write_op.options["snowflake_max_file_size"])
+    elif should_write_to_single_file:
+        # providing default size as 1GB for single file write
+        max_file_size = 1073741824
     match write_op.source:
         case "csv" | "parquet" | "json" | "text":
             write_path = get_paths_from_stage(
                 [write_op.path],
                 session=session,
             )[0]
+            # Generate Spark-compatible filename with proper extension
+            extension = write_op.source if write_op.source != "text" else "txt"
+
+            # Get compression from options for proper filename generation
+            compression_option = write_op.options.get("compression", "none")
+
+            # Generate Spark-compatible filename or prefix
             # we need a random prefix to support "append" mode
             # otherwise copy into with overwrite=False will fail if the file already exists
-            if should_write_to_single_file:
-                extention = write_op.source if write_op.source != "text" else "txt"
-                temp_file_prefix_on_stage = (
-                    f"{write_path}/{random_string(10, 'sas_file_')}.{extention}"
-                )
-            else:
-                temp_file_prefix_on_stage = (
-                    f"{write_path}/{random_string(10, 'sas_file_')}"
-                )
             overwrite = (
                 write_op.mode
                 == commands_proto.WriteOperation.SaveMode.SAVE_MODE_OVERWRITE
             )
+
+            if overwrite:
+                try:
+                    path_after_stage = (
+                        write_path.split("/", 1)[1] if "/" in write_path else ""
+                    )
+                    if not path_after_stage or path_after_stage == "/":
+                        logger.warning(
+                            f"Skipping REMOVE for root path {write_path} - too broad scope"
+                        )
+                    else:
+                        remove_command = f"REMOVE {write_path}/"
+                        session.sql(remove_command).collect()
+                        logger.info(f"Successfully cleared directory: {write_path}")
+                except Exception as e:
+                    logger.warning(f"Could not clear directory {write_path}: {e}")
+
+            if should_write_to_single_file:
+                # Single file: generate complete filename with extension
+                spark_filename = generate_spark_compatible_filename(
+                    task_id=0,
+                    attempt_number=0,
+                    compression=compression_option,
+                    format_ext=extension,
+                )
+                temp_file_prefix_on_stage = f"{write_path}/{spark_filename}"
+            else:
+                # Multiple files: generate prefix without extension (Snowflake will add extensions)
+                spark_filename_prefix = generate_spark_compatible_filename(
+                    task_id=0,
+                    attempt_number=0,
+                    compression=compression_option,
+                    format_ext="",  # No extension for prefix
+                )
+                temp_file_prefix_on_stage = f"{write_path}/{spark_filename_prefix}"
+
+            default_compression = "NONE" if write_op.source != "parquet" else "snappy"
+            compression = write_op.options.get(
+                "compression", default_compression
+            ).upper()
             parameters = {
                 "location": temp_file_prefix_on_stage,
                 "file_format_type": write_op.source
                 if write_op.source != "text"
                 else "csv",
                 "format_type_options": {
-                    "COMPRESSION":
+                    "COMPRESSION": compression,
                 },
                 "overwrite": overwrite,
             }
-
-
+            # By default, download from the same prefix we wrote to.
+            download_stage_path = temp_file_prefix_on_stage
+
+            # Check for partition hint early to determine precedence over single option
+            partition_hint = result.partition_hint
+
+            # Apply max_file_size for both single and multi-file scenarios
+            # This helps control when Snowflake splits files into multiple parts
+            if max_file_size:
                 parameters["max_file_size"] = max_file_size
+            # Only apply single option if no partition hint is present (partition hint takes precedence)
+            if should_write_to_single_file and partition_hint is None:
+                parameters["single"] = True
             rewritten_df: snowpark.DataFrame = rewrite_df(input_df, write_op.source)
             get_param_from_options(parameters, write_op.options, write_op.source)
             if write_op.partitioning_columns:
@@ -186,10 +244,50 @@ def map_write(request: proto_base.ExecutePlanRequest):
                     )
                 else:
                     parameters["partition_by"] = partitioning_columns[0]
-
+
+            # If a partition hint is present (from DataFrame.repartition(n)), optionally split the
+            # write into n COPY INTO calls by assigning a synthetic partition id. Controlled by config.
+            # Note: This affects only the number of output files, not computation semantics.
+            # Partition hints take precedence over single option (matches Spark behavior) when enabled.
+            repartition_for_writes_enabled = (
+                global_config.snowflake_repartition_for_writes
+            )
+            if repartition_for_writes_enabled and partition_hint and partition_hint > 0:
+                # Create a stable synthetic file number per row using ROW_NUMBER() over a
+                # randomized order, then modulo partition_hint. We rely on sql_expr to avoid
+                # adding new helpers.
+                file_num_col = "_sas_file_num"
+                partitioned_df = rewritten_df.withColumn(
+                    file_num_col,
+                    sql_expr(
+                        f"(ROW_NUMBER() OVER (ORDER BY RANDOM())) % {partition_hint}"
+                    ),
+                )
+
+                # Execute multiple COPY INTO operations, one per target file.
+                # Since we write per-partition with distinct prefixes, download from the base write path.
+                download_stage_path = write_path
+                for part_idx in range(partition_hint):
+                    part_params = dict(parameters)
+                    # Preserve Spark-like filename prefix per partition so downloaded basenames
+                    # match the expected Spark pattern (with possible Snowflake counters appended).
+                    per_part_prefix = generate_spark_compatible_filename(
+                        task_id=part_idx,
+                        attempt_number=0,
+                        compression=compression_option,
+                        format_ext="",  # prefix only; Snowflake appends extension/counters
+                    )
+                    part_params["location"] = f"{write_path}/{per_part_prefix}"
+                    (
+                        partitioned_df.filter(col(file_num_col) == lit(part_idx))
+                        .drop(file_num_col)
+                        .write.copy_into_location(**part_params)
+                    )
+            else:
+                rewritten_df.write.copy_into_location(**parameters)
             if not is_cloud_path(write_op.path):
                 store_files_locally(
-
+                    download_stage_path,
                     write_op.path,
                     overwrite,
                     session,
@@ -569,7 +667,12 @@ def _validate_schema_and_get_writer(
         col_name = field.name
         renamed = col_name
         matching_field = next(
-            (
+            (
+                f
+                for f in table_schema.fields
+                if unquote_if_quoted(f.name).lower()
+                == unquote_if_quoted(col_name).lower()
+            ),
             None,
         )
         if matching_field is not None and matching_field != col_name:
@@ -591,7 +694,10 @@ def _validate_schema_and_get_writer(


 def _validate_schema_for_append(
-    table_schema: DataType,
+    table_schema: DataType,
+    data_schema: DataType,
+    snowpark_table_name: str,
+    compare_structs: bool = False,
 ):
     match (table_schema, data_schema):
         case (_, _) if table_schema == data_schema:
@@ -600,7 +706,11 @@ def _validate_schema_for_append(
         case (StructType() as table_struct, StructType() as data_struct):

             def _comparable_col_name(col: str) -> str:
-
+                name = col if global_config.spark_sql_caseSensitive else col.lower()
+                if compare_structs:
+                    return name
+                else:
+                    return unquote_if_quoted(name)

             def invalid_struct_schema():
                 raise AnalysisException(
@@ -640,6 +750,7 @@ def _validate_schema_for_append(
                         matching_table_field.datatype,
                         data_field.datatype,
                         snowpark_table_name,
+                        compare_structs=True,
                     )

             return

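Note on the repartition-for-writes path above: rows are tagged with a synthetic file number via `ROW_NUMBER() OVER (ORDER BY RANDOM()) % n`, and one `COPY INTO <location>` runs per number, so a `repartition(n)` hint maps to roughly n output files. Below is a minimal standalone Snowpark sketch of that splitting idea; the table name `MY_TABLE` and stage path `@my_stage/out` are hypothetical placeholders, not part of this package.

```python
# Sketch only: split one logical write into n output files by tagging rows with a
# synthetic file number and issuing one COPY INTO per number.
from snowflake.snowpark import Session
from snowflake.snowpark.functions import col, lit, sql_expr


def write_in_n_files(session: Session, n: int) -> None:
    df = session.table("MY_TABLE")  # hypothetical source table
    # Stable synthetic file number in [0, n) per row, over a randomized order.
    numbered = df.with_column(
        "FILE_NUM", sql_expr(f"(ROW_NUMBER() OVER (ORDER BY RANDOM())) % {n}")
    )
    for i in range(n):
        (
            numbered.filter(col("FILE_NUM") == lit(i))
            .drop("FILE_NUM")
            .write.copy_into_location(
                f"@my_stage/out/part-{i:05d}",  # hypothetical per-file prefix
                file_format_type="parquet",
                overwrite=True,
            )
        )
```
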
--- a/snowflake/snowpark_connect/resources_initializer.py
+++ b/snowflake/snowpark_connect/resources_initializer.py
@@ -9,6 +9,7 @@ from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_sess
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger

 _resources_initialized = threading.Event()
+_initializer_lock = threading.Lock()
 SPARK_VERSION = "3.5.6"
 RESOURCE_PATH = "/snowflake/snowpark_connect/resources"

@@ -57,6 +58,9 @@ def initialize_resources() -> None:
         f"spark-connect-client-jvm_2.12-{SPARK_VERSION}.jar",
         f"spark-common-utils_2.12-{SPARK_VERSION}.jar",
         "json4s-ast_2.12-3.7.0-M11.jar",
+        "json4s-native_2.12-3.7.0-M11.jar",
+        "json4s-core_2.12-3.7.0-M11.jar",
+        "paranamer-2.8.3.jar",
     ]

     for jar in jar_files:
@@ -94,10 +98,19 @@ def initialize_resources() -> None:
     logger.info(f"All resources initialized in {time.time() - start_time:.2f}s")


+_resource_initializer = threading.Thread(
+    target=initialize_resources, name="ResourceInitializer"
+)
+
+
 def initialize_resources_async() -> threading.Thread:
     """Start resource initialization in background."""
-
-
-
-
-
+    with _initializer_lock:
+        if not _resource_initializer.is_alive() and _resource_initializer.ident is None:
+            _resource_initializer.start()
+        return _resource_initializer
+
+
+def wait_for_resource_initialization() -> None:
+    with _initializer_lock:
+        _resource_initializer.join()

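The initializer change above starts a single module-level thread at most once and lets callers either kick it off or block until it finishes. A self-contained sketch of the same start-once pattern, with illustrative names only:

```python
# Start-once background initializer guarded by a lock (illustrative names only).
import threading
import time


def _do_init() -> None:
    time.sleep(0.1)  # stand-in for downloading jars and other setup


_worker = threading.Thread(target=_do_init, name="InitWorker")
_lock = threading.Lock()


def start_init_async() -> threading.Thread:
    with _lock:
        # Thread.ident stays None until start(), so this guards against
        # calling start() twice, which would raise RuntimeError.
        if not _worker.is_alive() and _worker.ident is None:
            _worker.start()
        return _worker


def wait_for_init() -> None:
    with _lock:
        _worker.join()


if __name__ == "__main__":
    start_init_async()
    wait_for_init()
```
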
--- a/snowflake/snowpark_connect/server.py
+++ b/snowflake/snowpark_connect/server.py
@@ -88,6 +88,9 @@ from snowflake.snowpark_connect.utils.context import (
     set_spark_version,
 )
 from snowflake.snowpark_connect.utils.env_utils import get_int_from_env
+from snowflake.snowpark_connect.utils.external_udxf_cache import (
+    clear_external_udxf_cache,
+)
 from snowflake.snowpark_connect.utils.interrupt import (
     interrupt_all_queries,
     interrupt_queries_with_tag,
@@ -436,7 +439,8 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
                     lambda: map_local_relation(relation),  # noqa: B023
                     materialize=True,
                 )
-            except Exception:
+            except Exception as e:
+                logger.warning("Failed to put df into cache: %s", str(e))
                 # fallback - treat as regular artifact
                 _handle_regular_artifact()
             else:
@@ -527,7 +531,10 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
                 if name.endswith(".class"):
                     # name is <dir>/<package>/<class_name>
                     # we don't need the dir name, but require the package, so only remove dir
-
+                    if os.name != "nt":
+                        class_files[name.split("/", 1)[-1]] = filepath
+                    else:
+                        class_files[name.split("\\", 1)[-1]] = filepath
                     continue
                 session.file.put(
                     filepath,
@@ -556,6 +563,9 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
         if class_files:
             write_class_files_to_stage(session, class_files)

+        if any(not name.startswith("cache") for name in filenames.keys()):
+            clear_external_udxf_cache(session)
+
         return proto_base.AddArtifactsResponse(artifacts=list(response.values()))

     def ArtifactStatus(self, request, context):

--- a/snowflake/snowpark_connect/utils/artifacts.py
+++ b/snowflake/snowpark_connect/utils/artifacts.py
@@ -39,7 +39,7 @@ def write_temporary_artifact(
     if os.name != "nt":
         filepath = f"/tmp/sas-{session.session_id}/{name}"
     else:
-        filepath = f"{tempfile.gettempdir()}
+        filepath = f"{tempfile.gettempdir()}\\sas-{session.session_id}\\{name}"
     # The name comes to us as a path (e.g. cache/<name>), so we need to create
     # the parent directory if it doesn't exist to avoid errors during writing.
     pathlib.Path(filepath).parent.mkdir(parents=True, exist_ok=True)
@@ -55,11 +55,10 @@ def write_class_files_to_stage(
 ) -> None:
     if os.name != "nt":
         filepath = f"/tmp/sas-{session.session_id}"
+        jar_name = f'{filepath}/{hashlib.sha256(str(files).encode("utf-8")).hexdigest()[:10]}.jar'
     else:
-        filepath = f"{tempfile.gettempdir()}
-
-        f'{filepath}/{hashlib.sha256(str(files).encode("utf-8")).hexdigest()[:10]}.jar'
-    )
+        filepath = f"{tempfile.gettempdir()}\\sas-{session.session_id}"
+        jar_name = f'{filepath}\\{hashlib.sha256(str(files).encode("utf-8")).hexdigest()[:10]}.jar'
     with zipfile.ZipFile(jar_name, "w", zipfile.ZIP_DEFLATED) as jar:
         for name, path in files.items():
             jar.write(path, name)

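The packaging change above derives the jar name from a short SHA-256 digest of the class-file mapping and zips each file under its package-qualified name. A compact sketch of that naming-plus-zipping step, using hypothetical inputs:

```python
# Sketch: pack .class files into a jar whose name is derived from the file mapping.
import hashlib
import os
import tempfile
import zipfile


def pack_class_files(files: dict[str, str]) -> str:
    """files maps archive names (e.g. 'com/acme/Foo.class') to local file paths."""
    digest = hashlib.sha256(str(files).encode("utf-8")).hexdigest()[:10]
    jar_name = os.path.join(tempfile.gettempdir(), f"{digest}.jar")
    with zipfile.ZipFile(jar_name, "w", zipfile.ZIP_DEFLATED) as jar:
        for name, path in files.items():
            # Store each file under its package-qualified archive name.
            jar.write(path, name)
    return jar_name
```
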
--- a/snowflake/snowpark_connect/utils/context.py
+++ b/snowflake/snowpark_connect/utils/context.py
@@ -30,6 +30,9 @@ _sql_aggregate_function_count = ContextVar[int](
     "_contains_aggregate_function", default=0
 )

+# Context for parsing map_partitions
+_map_partitions_stack = ContextVar[int]("_map_partitions_stack", default=0)
+
 # We have to generate our own plan IDs that are different from Spark's.
 # Spark plan IDs start at 0, so pick a "big enough" number to avoid overlaps.
 _STARTING_SQL_PLAN_ID = 0x80000000
@@ -49,6 +52,7 @@ _spark_client_type_regex = re.compile(r"spark/(?P<spark_version>\d+\.\d+\.\d+)")
 _current_operation = ContextVar[str]("_current_operation", default="default")
 _resolving_fun_args = ContextVar[bool]("_resolving_fun_args", default=False)
 _resolving_lambda_fun = ContextVar[bool]("_resolving_lambdas", default=False)
+_current_lambda_params = ContextVar[list[str]]("_current_lambda_params", default=[])

 _is_window_enabled = ContextVar[bool]("_is_window_enabled", default=False)
 _is_in_pivot = ContextVar[bool]("_is_in_pivot", default=False)
@@ -206,6 +210,16 @@ def push_evaluating_join_condition(join_type, left_keys, right_keys):
         _is_evaluating_join_condition.set(prev)


+@contextmanager
+def push_map_partitions():
+    _map_partitions_stack.set(_map_partitions_stack.get() + 1)
+    yield
+
+
+def map_partitions_depth() -> int:
+    return _map_partitions_stack.get()
+
+
 @contextmanager
 def push_sql_scope():
     """
@@ -238,16 +252,21 @@ def push_operation_scope(operation: str):


 @contextmanager
-def resolving_lambda_function():
+def resolving_lambda_function(param_names: list[str] = None):
     """
     Context manager that sets a flag indicating lambda function is being resolved.
+    Also tracks the lambda parameter names for validation.
     """
     prev = _resolving_lambda_fun.get()
+    prev_params = _current_lambda_params.get()
     try:
         _resolving_lambda_fun.set(True)
+        if param_names is not None:
+            _current_lambda_params.set(param_names)
         yield
     finally:
         _resolving_lambda_fun.set(prev)
+        _current_lambda_params.set(prev_params)


 def is_lambda_being_resolved() -> bool:
@@ -257,6 +276,13 @@ def is_lambda_being_resolved() -> bool:
     return _resolving_lambda_fun.get()


+def get_current_lambda_params() -> list[str]:
+    """
+    Returns the current lambda parameter names.
+    """
+    return _current_lambda_params.get()
+
+
 @contextmanager
 def resolving_fun_args():
     """
@@ -270,6 +296,19 @@ def resolving_fun_args():
         _resolving_fun_args.set(prev)


+@contextmanager
+def not_resolving_fun_args():
+    """
+    Context manager that sets a flag indicating function arguments are *not* being resolved.
+    """
+    prev = _resolving_fun_args.get()
+    try:
+        _resolving_fun_args.set(False)
+        yield
+    finally:
+        _resolving_fun_args.set(prev)
+
+
 def is_function_argument_being_resolved() -> bool:
     """
     Returns True if function arguments are being resolved.
@@ -350,6 +389,7 @@ def clear_context_data() -> None:

     _next_sql_plan_id.set(_STARTING_SQL_PLAN_ID)
     _sql_plan_name_map.set({})
+    _map_partitions_stack.set(0)
     _sql_aggregate_function_count.set(0)
     _sql_named_args.set({})
     _sql_pos_args.set({})

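The new context helpers above all follow the same ContextVar discipline: read the previous value, set a scoped value, and restore the previous value in a `finally` block so nesting and exceptions behave correctly. A generic sketch of that pattern with illustrative names, not the module's actual API:

```python
# Generic save/set/restore pattern for a ContextVar-backed scope (illustrative names).
from contextlib import contextmanager
from contextvars import ContextVar

_current_params: ContextVar[list[str]] = ContextVar("_current_params", default=[])


@contextmanager
def scoped_params(params: list[str]):
    prev = _current_params.get()
    try:
        _current_params.set(params)
        yield
    finally:
        # Restore the previous value even if the body raised.
        _current_params.set(prev)


def current_params() -> list[str]:
    return _current_params.get()


# Values set inside the block are visible to code called from it,
# and the previous value is restored afterwards.
with scoped_params(["x", "y"]):
    assert current_params() == ["x", "y"]
assert current_params() == []
```
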
--- /dev/null
+++ b/snowflake/snowpark_connect/utils/external_udxf_cache.py
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+from snowflake.snowpark import Session
+from snowflake.snowpark_connect.utils.concurrent import SynchronizedDict
+
+
+def init_external_udxf_cache(session: Session) -> None:
+    session.external_udfs_cache = SynchronizedDict()
+    session.external_udtfs_cache = SynchronizedDict()
+
+
+def clear_external_udxf_cache(session: Session) -> None:
+    session.external_udfs_cache.clear()
+    session.external_udtfs_cache.clear()
+
+
+def get_external_udf_from_cache(hash: str):
+    return Session.get_active_session().external_udfs_cache.get(hash)
+
+
+def cache_external_udf(hash: int, udf):
+    Session.get_active_session().external_udfs_cache[hash] = udf
+
+
+def clear_external_udtf_cache(session: Session) -> None:
+    session.external_udtfs_cache.clear()
+
+
+def get_external_udtf_from_cache(hash: int):
+    return Session.get_active_session().external_udtfs_cache.get(hash)
+
+
+def cache_external_udtf(hash: int, udf):
+    Session.get_active_session().external_udtfs_cache[hash] = udf

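The new cache module depends on `SynchronizedDict` from `utils/concurrent.py`, whose body is not part of this diff (that file changes by only +4 lines). The sketch below is an assumption about what such a lock-guarded dict might provide, covering just the operations used above (`[key] = value`, `.get()`, `.clear()`); the real implementation may differ:

```python
# Assumed sketch of a lock-guarded dict; the real SynchronizedDict is not shown here.
import threading
from typing import Any


class SynchronizedDict:
    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._data: dict[Any, Any] = {}

    def __setitem__(self, key: Any, value: Any) -> None:
        with self._lock:
            self._data[key] = value

    def get(self, key: Any, default: Any = None) -> Any:
        with self._lock:
            return self._data.get(key, default)

    def clear(self) -> None:
        with self._lock:
            self._data.clear()
```
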
--- a/snowflake/snowpark_connect/utils/pandas_udtf_utils.py
+++ b/snowflake/snowpark_connect/utils/pandas_udtf_utils.py
@@ -87,9 +87,93 @@ def get_map_in_arrow_udtf(
 def create_pandas_udtf(
     udtf_proto: CommonInlineUserDefinedFunction,
     spark_column_names: list[str],
-    input_schema: StructType
-    return_schema: StructType
+    input_schema: StructType,
+    return_schema: StructType,
+):
+    user_function, _ = cloudpickle.loads(udtf_proto.python_udf.command)
+    output_column_names = [field.name for field in return_schema.fields]
+    output_column_original_names = [
+        field.original_column_identifier for field in return_schema.fields
+    ]
+
+    class MapPandasUDTF:
+        def __init__(self) -> None:
+            self.user_function = user_function
+            self.output_column_names = output_column_names
+            self.spark_column_names = spark_column_names
+            self.output_column_original_names = output_column_original_names
+
+        def end_partition(self, df: pd.DataFrame):
+            if df.empty:
+                empty_df = pd.DataFrame(columns=self.output_column_names)
+                yield empty_df
+                return
+
+            df_without_dummy = df.drop(
+                columns=["_DUMMY_PARTITION_KEY"], errors="ignore"
+            )
+            df_without_dummy.columns = self.spark_column_names
+            result_iterator = self.user_function(
+                [pd.DataFrame([row]) for _, row in df_without_dummy.iterrows()]
+            )
+
+            if not isinstance(result_iterator, Iterator) and not hasattr(
+                result_iterator, "__iter__"
+            ):
+                raise RuntimeError(
+                    f"snowpark_connect::UDF_RETURN_TYPE Return type of the user-defined function should be "
+                    f"iterator of pandas.DataFrame, but is {type(result_iterator).__name__}"
+                )
+
+            output_df = pd.concat(result_iterator)
+            generated_output_column_names = list(output_df.columns)
+
+            missing_columns = []
+            for original_column in self.output_column_original_names:
+                if original_column not in generated_output_column_names:
+                    missing_columns.append(original_column)
+
+            if missing_columns:
+                unexpected_columns = [
+                    column
+                    for column in generated_output_column_names
+                    if column not in self.output_column_original_names
+                ]
+                raise RuntimeError(
+                    f"[RESULT_COLUMNS_MISMATCH_FOR_PANDAS_UDF] Column names of the returned pandas.DataFrame do not match specified schema. Missing: {', '.join(sorted(missing_columns))}. Unexpected: {', '.join(sorted(unexpected_columns))}"
+                    "."
+                )
+            reordered_df = output_df[self.output_column_original_names]
+            reordered_df.columns = self.output_column_names
+            yield reordered_df
+
+    return snowpark_fn.pandas_udtf(
+        MapPandasUDTF,
+        output_schema=PandasDataFrameType(
+            [field.datatype for field in return_schema.fields],
+            [field.name for field in return_schema.fields],
+        ),
+        input_types=[
+            PandasDataFrameType(
+                [field.datatype for field in input_schema.fields] + [IntegerType()]
+            )
+        ],
+        input_names=[field.name for field in input_schema.fields]
+        + ["_DUMMY_PARTITION_KEY"],
+        name="map_pandas_udtf",
+        replace=True,
+        packages=["pandas"],
+        is_permanent=False,
+    )
+
+
+def create_pandas_udtf_with_arrow(
+    udtf_proto: CommonInlineUserDefinedFunction,
+    spark_column_names: list[str],
+    input_schema: StructType,
+    return_schema: StructType,
 ) -> str | snowpark.udtf.UserDefinedTableFunction:
+
     user_function, _ = cloudpickle.loads(udtf_proto.python_udf.command)
     output_column_names = [field.name for field in return_schema.fields]
|