snowpark-connect 0.25.0__py3-none-any.whl → 0.27.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/config.py +10 -3
- snowflake/snowpark_connect/dataframe_container.py +16 -0
- snowflake/snowpark_connect/expression/map_expression.py +15 -0
- snowflake/snowpark_connect/expression/map_udf.py +68 -27
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +18 -0
- snowflake/snowpark_connect/expression/map_unresolved_function.py +38 -28
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/relation/map_extension.py +9 -7
- snowflake/snowpark_connect/relation/map_map_partitions.py +36 -72
- snowflake/snowpark_connect/relation/map_relation.py +15 -2
- snowflake/snowpark_connect/relation/map_row_ops.py +8 -1
- snowflake/snowpark_connect/relation/map_show_string.py +2 -0
- snowflake/snowpark_connect/relation/map_sql.py +63 -2
- snowflake/snowpark_connect/relation/map_udtf.py +96 -44
- snowflake/snowpark_connect/relation/utils.py +44 -0
- snowflake/snowpark_connect/relation/write/map_write.py +135 -24
- snowflake/snowpark_connect/resources_initializer.py +18 -5
- snowflake/snowpark_connect/server.py +12 -2
- snowflake/snowpark_connect/utils/artifacts.py +4 -5
- snowflake/snowpark_connect/utils/concurrent.py +4 -0
- snowflake/snowpark_connect/utils/context.py +41 -1
- snowflake/snowpark_connect/utils/external_udxf_cache.py +36 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +86 -2
- snowflake/snowpark_connect/utils/scala_udf_utils.py +250 -242
- snowflake/snowpark_connect/utils/session.py +4 -0
- snowflake/snowpark_connect/utils/udf_utils.py +71 -118
- snowflake/snowpark_connect/utils/udtf_helper.py +17 -7
- snowflake/snowpark_connect/utils/udtf_utils.py +3 -16
- snowflake/snowpark_connect/version.py +2 -3
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/RECORD +41 -37
- {snowpark_connect-0.25.0.data → snowpark_connect-0.27.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.25.0.data → snowpark_connect-0.27.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.25.0.data → snowpark_connect-0.27.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/top_level.txt +0 -0
@@ -15,6 +15,7 @@ from snowflake.snowpark_connect.utils.cache import (
 from snowflake.snowpark_connect.utils.context import (
     get_plan_id_map,
     get_session_id,
+    push_map_partitions,
     push_operation_scope,
     set_is_aggregate_function,
     set_plan_id_map,
@@ -90,6 +91,7 @@ def map_relation(
             table_name=copy.deepcopy(cached_container.table_name),
             alias=cached_container.alias,
             cached_schema_getter=lambda: cached_df.schema,
+            partition_hint=cached_container.partition_hint,
         )
         # If we don't make a copy of the df._output, the expression IDs for attributes in Snowpark DataFrames will differ from those stored in the cache,
         # leading to errors during query execution.
@@ -179,7 +181,8 @@ def map_relation(
             )
             return cached_df
         case "map_partitions":
-            result = map_map_partitions.map_map_partitions(rel)
+            with push_map_partitions():
+                result = map_map_partitions.map_map_partitions(rel)
         case "offset":
             result = map_row_ops.map_offset(rel)
         case "project":
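
push_map_partitions is imported from utils/context.py, whose +41-line change is not expanded in this diff. As an assumption about its shape only, a contextvar-backed scope flag like the sketch below would fit the usage here (the is_in_map_partitions accessor is hypothetical):

    # Hypothetical sketch; the real helper lives in snowflake/snowpark_connect/utils/context.py.
    import contextlib
    import contextvars

    _in_map_partitions = contextvars.ContextVar("in_map_partitions", default=False)


    @contextlib.contextmanager
    def push_map_partitions():
        token = _in_map_partitions.set(True)  # mark that a mapPartitions relation is being mapped
        try:
            yield
        finally:
            _in_map_partitions.reset(token)   # restore the previous value on exit


    def is_in_map_partitions() -> bool:
        return _in_map_partitions.get()
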
@@ -189,13 +192,23 @@ def map_relation(
         case "read":
             result = read.map_read(rel)
         case "repartition":
-            #
+            # Preserve partition hint for file output control
+            # This handles both repartition(n) with shuffle=True and coalesce(n) with shuffle=False
             result = map_relation(rel.repartition.input)
+            if rel.repartition.num_partitions > 0:
+                result.partition_hint = rel.repartition.num_partitions
         case "repartition_by_expression":
             # This is a no-op operation in SAS as Snowpark doesn't have the concept of partitions.
             # All the data in the dataframe will be treated as a single partition, and this will not
             # have any side effects.
             result = map_relation(rel.repartition_by_expression.input)
+            # Only preserve partition hint if num_partitions is explicitly specified and > 0
+            # Column-based repartitioning without count should clear any existing partition hints
+            if rel.repartition_by_expression.num_partitions > 0:
+                result.partition_hint = rel.repartition_by_expression.num_partitions
+            else:
+                # Column-based repartitioning clears partition hint (resets to default behavior)
+                result.partition_hint = None
         case "replace":
             result = map_row_ops.map_replace(rel)
         case "sample":
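
To make the intent above concrete, here is a hedged client-side sketch of how the hint would be produced. It assumes a Spark Connect session pointed at a Snowpark Connect server (the URL is a placeholder) and that the hint is consumed later by the write path in write/map_write.py, which is not expanded in this excerpt:

    from pyspark.sql import SparkSession

    # Placeholder endpoint; replace with a real Snowpark Connect server address.
    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    df = spark.range(100)

    df.coalesce(1).write.mode("overwrite").parquet("out/single")      # num_partitions=1 -> partition_hint=1
    df.repartition(4).write.mode("overwrite").parquet("out/four")     # num_partitions=4 -> partition_hint=4
    df.repartition("id").write.mode("overwrite").parquet("out/byid")  # no count -> partition_hint cleared
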
@@ -553,7 +553,14 @@ def map_filter(
         rel.filter.condition, input_container.column_map, typer
     )

-    result = input_df.filter(condition.col)
+    if rel.filter.input.WhichOneof("rel_type") == "subquery_alias":
+        # map_subquery_alias does not actually wrap the DataFrame in an alias or subquery.
+        # Apparently, there are cases (e.g., TpcdsQ53) where this is required, without it, we get
+        # SQL compilation error.
+        # To mitigate it, we are doing .select("*"), .alias() introduces additional describe queries
+        result = input_df.select("*").filter(condition.col)
+    else:
+        result = input_df.filter(condition.col)

     return DataFrameContainer(
         result,
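
A minimal Snowpark sketch of the mitigation described in the comments above, assuming `session` is an existing snowflake.snowpark.Session; .select("*") adds a projection boundary without the extra describe queries that .alias() would issue:

    from snowflake.snowpark.functions import col

    # Assumes `session` is an existing snowflake.snowpark.Session.
    df = session.create_dataframe([[1], [2], [3]], schema=["A"])

    result_plain = df.filter(col("A") > 1)                # previous behavior for all inputs
    result_wrapped = df.select("*").filter(col("A") > 1)  # new behavior when the input is a subquery_alias
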
@@ -12,6 +12,7 @@ from snowflake.snowpark._internal.analyzer import analyzer_utils
 from snowflake.snowpark.functions import col
 from snowflake.snowpark.types import DateType, StringType, StructField, StructType
 from snowflake.snowpark_connect.column_name_handler import set_schema_getter
+from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.map_relation import map_relation

@@ -33,6 +34,7 @@ def map_show_string(rel: relation_proto.Relation) -> pandas.DataFrame:
         truncate=rel.show_string.truncate,
         vertical=rel.show_string.vertical,
         _spark_column_names=input_df_container.column_map.get_spark_columns(),
+        _spark_session_tz=global_config.spark_sql_session_timeZone,
     )
     return pandas.DataFrame({"show_string": [show_string]})

@@ -56,6 +56,7 @@ from snowflake.snowpark_connect.utils.context import (
     _accessing_temp_object,
     gen_sql_plan_id,
     get_session_id,
+    get_sql_plan,
     push_evaluating_sql_scope,
     push_sql_scope,
     set_sql_args,
@@ -542,6 +543,7 @@ def map_sql_to_pandas_df(
             rows = session.sql(f"DESCRIBE TABLE {name}").collect()
         case "DescribeNamespace":
             name = get_relation_identifier_name(logical_plan.namespace(), True)
+            name = change_default_to_public(name)
             rows = session.sql(f"DESCRIBE SCHEMA {name}").collect()
             if not rows:
                 rows = None
@@ -793,6 +795,7 @@ def map_sql_to_pandas_df(
         case "SetCatalogAndNamespace":
             # TODO: add catalog setting here
             name = get_relation_identifier_name(logical_plan.child(), True)
+            name = change_default_to_public(name)
             session.sql(f"USE SCHEMA {name}").collect()
         case "SetCommand":
             kv_result_tuple = logical_plan.kv().get()
@@ -801,6 +804,7 @@ def map_sql_to_pandas_df(
             set_config_param(get_session_id(), key, val, session)
         case "SetNamespaceCommand":
             name = _spark_to_snowflake(logical_plan.namespace())
+            name = change_default_to_public(name)
             session.sql(f"USE SCHEMA {name}").collect()
         case "SetNamespaceLocation" | "SetNamespaceProperties":
             raise SnowparkConnectNotImplementedError(
@@ -997,6 +1001,20 @@ def get_sql_passthrough() -> bool:
     return get_boolean_session_config_param("snowpark.connect.sql.passthrough")


+def change_default_to_public(name: str) -> str:
+    """
+    Change the namespace to PUBLIC when given name is DEFAULT
+    :param name: Given namespace
+    :return: if name is DEFAULT return PUBLIC otherwise name
+    """
+    if name.startswith('"'):
+        if name.upper() == '"DEFAULT"':
+            return name.replace("DEFAULT", "PUBLIC")
+    elif name.upper() == "DEFAULT":
+        return "PUBLIC"
+    return name
+
+
 def map_sql(
     rel: relation_proto.Relation,
 ) -> DataFrameContainer:
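
Based directly on the function body above, the helper behaves as follows (illustrative calls, not part of the diff):

    change_default_to_public("DEFAULT")      # -> "PUBLIC"
    change_default_to_public("default")      # -> "PUBLIC"   (unquoted comparison is case-insensitive)
    change_default_to_public('"DEFAULT"')    # -> '"PUBLIC"'  (quoted identifier keeps its quotes)
    change_default_to_public("ANALYTICS")    # -> "ANALYTICS" (anything else is returned unchanged)
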
@@ -1008,7 +1026,6 @@ def map_sql(
     In passthough mode as True, SAS calls session.sql() and not calling Spark Parser.
     This is to mitigate any issue not covered by spark logical plan to protobuf conversion.
     """
-
     snowpark_connect_sql_passthrough = get_sql_passthrough()

     if not snowpark_connect_sql_passthrough:
@@ -1353,6 +1370,7 @@ def map_logical_plan_relation(
                 left_input=map_logical_plan_relation(children[0]),
                 right_input=map_logical_plan_relation(children[1]),
                 set_op_type=relation_proto.SetOperation.SET_OP_TYPE_UNION,
+                is_all=True,
                 by_name=rel.byName(),
                 allow_missing_columns=rel.allowMissingCol(),
             )
@@ -1701,7 +1719,50 @@ def map_logical_plan_relation(
             _window_specs.get()[key] = window_spec
             proto = map_logical_plan_relation(rel.child())
         case "Generate":
-            input_relation = map_logical_plan_relation(rel.child())
+            # Generate creates a nested Project relation (see lines 1785-1790) without
+            # setting its plan_id field. When this Project is later processed by map_project
+            # (map_column_ops.py), it uses rel.common.plan_id which defaults to 0 for unset
+            # protobuf fields. This means all columns from the Generate operation (both exploded
+            # columns and passthrough columns) will have plan_id=0 in their names.
+            #
+            # If Generate's child is a SubqueryAlias whose inner relation was processed
+            # with a non-zero plan_id, there will be a mismatch between:
+            # - The columns referenced in the Project (expecting plan_id from SubqueryAlias's child)
+            # - The actual column names created by Generate's Project (using plan_id=0)
+
+            # Therefore, when Generate has a SubqueryAlias child, we explicitly process the inner
+            # relation with plan_id=0 to match what Generate's Project will use. This only applies when
+            # the immediate child of Generate is a SubqueryAlias and preserves existing registrations (like CTEs),
+            # so it won't affect other patterns.
+
+            child_class = str(rel.child().getClass().getSimpleName())
+
+            if child_class == "SubqueryAlias":
+                alias = str(rel.child().alias())
+
+                # Check if this alias was already registered during initial SQL parsing
+                existing_plan_id = get_sql_plan(alias)
+
+                if existing_plan_id is not None:
+                    # Use the existing plan_id to maintain consistency with prior registration
+                    used_plan_id = existing_plan_id
+                else:
+                    # Use plan_id=0 to match what the nested Project will use (protobuf default)
+                    used_plan_id = 0
+                    set_sql_plan_name(alias, used_plan_id)
+
+                # Process the inner child with the determined plan_id
+                inner_child = map_logical_plan_relation(
+                    rel.child().child(), plan_id=used_plan_id
+                )
+                input_relation = relation_proto.Relation(
+                    subquery_alias=relation_proto.SubqueryAlias(
+                        input=inner_child,
+                        alias=alias,
+                    )
+                )
+            else:
+                input_relation = map_logical_plan_relation(rel.child())
             generator_output_list = as_java_list(rel.generatorOutput())
             generator_output_list_expressions = [
                 map_logical_plan_expression(e) for e in generator_output_list
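
The comment block above hinges on a proto3 detail: an unset scalar field reads back as its zero value, so a Project relation whose plan_id was never assigned reports plan_id == 0. A small illustration with the same protobuf module this file already imports:

    import pyspark.sql.connect.proto.relations_pb2 as relation_proto

    rel = relation_proto.Relation()  # plan_id is never assigned anywhere
    print(rel.common.plan_id)        # 0 -- proto3 default for an unset int64 field
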
@@ -31,6 +31,10 @@ from snowflake.snowpark_connect.type_mapping import (
     proto_to_snowpark_type,
 )
 from snowflake.snowpark_connect.utils.context import push_udtf_context
+from snowflake.snowpark_connect.utils.external_udxf_cache import (
+    cache_external_udtf,
+    get_external_udtf_from_cache,
+)
 from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
 from snowflake.snowpark_connect.utils.udtf_helper import (
     SnowparkUDTF,
@@ -44,6 +48,34 @@ from snowflake.snowpark_connect.utils.udxf_import_utils import (
 )


+def cache_external_udtf_wrapper(from_register_udtf: bool):
+    def outer_wrapper(wrapper_func):
+        def wrapper(
+            udtf_proto: relation_proto.CommonInlineUserDefinedTableFunction,
+            spark_column_names,
+        ) -> SnowparkUDTF | None:
+            udf_hash = hash(str(udtf_proto))
+            cached_udtf = get_external_udtf_from_cache(udf_hash)
+
+            if cached_udtf:
+                if from_register_udtf:
+                    session = get_or_create_snowpark_session()
+                    session._udtfs[udtf_proto.function_name.lower()] = (
+                        cached_udtf,
+                        spark_column_names,
+                    )
+
+                return cached_udtf
+
+            snowpark_udf = wrapper_func(udtf_proto, spark_column_names)
+            cache_external_udtf(udf_hash, snowpark_udf)
+            return snowpark_udf
+
+        return wrapper
+
+    return outer_wrapper
+
+
 def build_expected_types_from_parsed(
     parsed_return: types_proto.DataType,
 ) -> List[Tuple[str, Any]]:
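
get_external_udtf_from_cache and cache_external_udtf come from the new utils/external_udxf_cache.py module (+36 lines, not expanded in this diff). A minimal sketch of a compatible cache, offered only as an assumption about its shape:

    # Hypothetical sketch of utils/external_udxf_cache.py; the real module is not shown in this diff.
    from typing import Any, Dict, Optional

    _EXTERNAL_UDTF_CACHE: Dict[int, Any] = {}


    def cache_external_udtf(udf_hash: int, udtf: Any) -> None:
        """Remember a created UDTF keyed by the hash of its serialized proto."""
        _EXTERNAL_UDTF_CACHE[udf_hash] = udtf


    def get_external_udtf_from_cache(udf_hash: int) -> Optional[Any]:
        """Return the cached UDTF for this proto hash, or None if none was registered yet."""
        return _EXTERNAL_UDTF_CACHE.get(udf_hash)
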
@@ -165,26 +197,37 @@ def register_udtf(
     ) = process_return_type(python_udft.return_type)
     function_name = udtf_proto.function_name

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @cache_external_udtf_wrapper(from_register_udtf=True)
+    def _register_udtf(
+        udtf_proto: relation_proto.CommonInlineUserDefinedTableFunction,
+        spark_column_names,
+    ):
+        kwargs = {
+            "session": session,
+            "udtf_proto": udtf_proto,
+            "expected_types": expected_types,
+            "output_schema": output_schema,
+            "packages": global_config.get("snowpark.connect.udf.packages", ""),
+            "imports": get_python_udxf_import_files(session),
+            "called_from": "register_udtf",
+            "is_arrow_enabled": is_arrow_enabled_in_udtf(),
+            "is_spark_compatible_udtf_mode_enabled": is_spark_compatible_udtf_mode_enabled(),
+        }
+
+        if require_creating_udtf_in_sproc(udtf_proto):
+            snowpark_udtf = create_udtf_in_sproc(**kwargs)
+        else:
+            udtf = create_udtf(**kwargs)
+            snowpark_udtf = SnowparkUDTF(
+                name=udtf.name,
+                input_types=udtf._input_types,
+                output_schema=output_schema,
+            )
+
+        return snowpark_udtf

+    snowpark_udtf = _register_udtf(udtf_proto, spark_column_names)
+    # We have to update cached _udtfs here, because function could have been cached in map_common_inline_user_defined_table_function
     session._udtfs[function_name.lower()] = (snowpark_udtf, spark_column_names)
     return snowpark_udtf

@@ -213,32 +256,41 @@ def map_common_inline_user_defined_table_function(
         spark_column_names,
     ) = process_return_type(python_udft.return_type)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @cache_external_udtf_wrapper(from_register_udtf=False)
+    def _get_udtf(
+        udtf_proto: relation_proto.CommonInlineUserDefinedTableFunction,
+        spark_column_names,
+    ):
+        kwargs = {
+            "session": session,
+            "udtf_proto": udtf_proto,
+            "expected_types": expected_types,
+            "output_schema": output_schema,
+            "packages": global_config.get("snowpark.connect.udf.packages", ""),
+            "imports": get_python_udxf_import_files(session),
+            "called_from": "map_common_inline_user_defined_table_function",
+            "is_arrow_enabled": is_arrow_enabled_in_udtf(),
+            "is_spark_compatible_udtf_mode_enabled": is_spark_compatible_udtf_mode_enabled(),
+        }
+
+        if require_creating_udtf_in_sproc(udtf_proto):
+            snowpark_udtf_or_error = create_udtf_in_sproc(**kwargs)
+            if isinstance(snowpark_udtf_or_error, str):
+                raise PythonException(snowpark_udtf_or_error)
+            snowpark_udtf = snowpark_udtf_or_error
+        else:
+            udtf_or_error = create_udtf(**kwargs)
+            if isinstance(udtf_or_error, str):
+                raise PythonException(udtf_or_error)
+            udtf = udtf_or_error
+            snowpark_udtf = SnowparkUDTF(
+                name=udtf.name,
+                input_types=udtf._input_types,
+                output_schema=output_schema,
+            )
+        return snowpark_udtf

+    snowpark_udtf = _get_udtf(rel, spark_column_names)
     column_map = ColumnNameMap([], [])
     snowpark_udtf_args = []

@@ -6,6 +6,7 @@ import random
 import re
 import string
 import time
+import uuid
 from typing import Sequence

 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
@@ -153,6 +154,49 @@ def random_string(
     return "".join([prefix, random_part, suffix])


+def generate_spark_compatible_filename(
+    task_id: int = 0,
+    attempt_number: int = 0,
+    compression: str = None,
+    format_ext: str = "parquet",
+) -> str:
+    """Generate a Spark-compatible filename following the convention:
+    part-<task-id>-<uuid>-c<attempt-number>.<compression>.<format>
+
+    Args:
+        task_id: Task ID (usually 0 for single partition)
+        attempt_number: Attempt number (usually 0)
+        compression: Compression type (e.g., 'snappy', 'gzip', 'none')
+        format_ext: File format extension (e.g., 'parquet', 'csv', 'json')
+
+    Returns:
+        A filename string following Spark's naming convention
+    """
+    # Generate a UUID for uniqueness
+    file_uuid = str(uuid.uuid4())
+
+    # Format task ID with leading zeros (5 digits)
+    formatted_task_id = f"{task_id:05d}"
+
+    # Format attempt number with leading zeros (3 digits)
+    formatted_attempt = f"{attempt_number:03d}"
+
+    # Build the base filename
+    base_name = f"part-{formatted_task_id}-{file_uuid}-c{formatted_attempt}"
+
+    # Add compression if specified and not 'none'
+    if compression and compression.lower() not in ("none", "uncompressed"):
+        compression_part = f".{compression.lower()}"
+    else:
+        compression_part = ""
+
+    # Add format extension if specified
+    if format_ext:
+        return f"{base_name}{compression_part}.{format_ext}"
+    else:
+        return f"{base_name}{compression_part}"
+
+
 def _normalize_query_for_semantic_hash(query_str: str) -> str:
     """
     Normalize a query string for semantic comparison by extracting original names from
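
A quick usage sketch of the new helper (the UUID segment differs on every call):

    generate_spark_compatible_filename(compression="snappy", format_ext="parquet")
    # e.g. 'part-00000-1b9e6e0e-5a4d-4c33-9a6a-1f2d3c4b5a69-c000.snappy.parquet'

    generate_spark_compatible_filename(task_id=3, compression="none", format_ext="csv")
    # e.g. 'part-00003-<uuid>-c000.csv'  (compression "none" is dropped from the name)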
|