PyPI - snowpark-connect - Versions diffs - 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl - Mend

snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (192) hide show

snowflake/snowpark_connect/relation/map_local_relation.py CHANGED Viewed

@@ -11,6 +11,7 @@ import pyarrow as pa
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 from snowflake import snowpark
+from snowflake.snowpark._internal.analyzer.analyzer import ARRAY_BIND_THRESHOLD
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
 from snowflake.snowpark._internal.utils import is_in_stored_procedure
 from snowflake.snowpark.types import LongType, StructField, StructType
@@ -18,7 +19,10 @@ from snowflake.snowpark_connect import tcm
 from snowflake.snowpark_connect.column_name_handler import (
     make_column_names_snowpark_compatible,
 )
+from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.type_mapping import (
     get_python_sql_utils_class,
     map_json_schema_to_snowpark,
@@ -242,6 +246,16 @@ def map_local_relation(
         # _create_temp_stage() changes were not ported to the internal connector, leading to this
         # error on TCM and in notebooks (sproc):
         # TypeError: _create_temp_stage() takes 7 positional arguments but 8 were given
+        #
+        # For large local relations (rows * cols >= ARRAY_BIND_THRESHOLD), use PyArrow path for better performance.
+        # PyArrow uses stage operations (5-6 queries) which is more efficient for large data than batch inserts.
+        enable_optimization = global_config._get_config_setting(
+            "snowpark.connect.localRelation.optimizeSmallData"
+        )
+        use_vectorized_scanner = global_config._get_config_setting(
+            "snowpark.connect.parquet.useVectorizedScanner"
+        )
         use_pyarrow = (
             not is_in_stored_procedure()
             # TODO: SNOW-2220726 investigate why use_pyarrow failed in TCM:
@@ -253,12 +267,19 @@ def map_local_relation(
                 current_schema.strip('"') if current_schema is not None else "",
             )
             is not None
+            and (
+                # When optimization is disabled, always use PyArrow (preserves row ordering that some tests rely on)
+                not enable_optimization
+                # When optimization is enabled, use PyArrow only for large data for better performance.
+                or (table.num_rows * table.num_columns >= ARRAY_BIND_THRESHOLD)
+            )
         )
         if use_pyarrow:
             snowpark_df: snowpark.DataFrame = session.create_dataframe(
                 # Rename the columns to match the Snowpark schema before creating.
                 data=table.rename_columns([unquote_if_quoted(c) for c in new_columns]),
+                use_vectorized_scanner=use_vectorized_scanner,
             )
             # Cast the columns to the correct types based on the schema as create_dataframe will
@@ -273,6 +294,9 @@ def map_local_relation(
             snowpark_df = snowpark_df.select(*casted_columns)
         else:
+            # For small datasets (< ARRAY_BIND_THRESHOLD), use List[Row] path.
+            # Snowpark's SnowflakeValues will use inline VALUES clause (lazy, no queries) for small data,
+            # or temp table with batch insert (lazy, 3 queries on action) if it grows larger.
             pylist_df = [
                 list(row)
                 for row in zip(*(col.to_pylist() for col in table.itercolumns()))
@@ -325,11 +349,14 @@ def map_local_relation(
             spark_column_names=spark_column_names,
             snowpark_column_names=new_columns,
             column_metadata=column_metadata,
+            snowpark_column_types=[f.datatype for f in snowpark_schema.fields],
         )
     else:
-        raise SnowparkConnectNotImplementedError(
+        exception = SnowparkConnectNotImplementedError(
             "LocalRelation without data & schema is not supported"
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
 def map_range(

snowflake/snowpark_connect/relation/map_map_partitions.py CHANGED Viewed

@@ -8,11 +8,19 @@ from pyspark.sql.connect.proto.expressions_pb2 import CommonInlineUserDefinedFun
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
 from snowflake.snowpark.types import StructType
+from snowflake.snowpark_connect.column_name_handler import make_unique_snowpark_name
 from snowflake.snowpark_connect.constants import MAP_IN_ARROW_EVAL_TYPE
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.expression.map_unresolved_star import (
+    map_unresolved_star_as_single_column,
+)
+from snowflake.snowpark_connect.expression.typer import ExpressionTyper
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 from snowflake.snowpark_connect.type_mapping import proto_to_snowpark_type
-from snowflake.snowpark_connect.utils.context import map_partitions_depth
+from snowflake.snowpark_connect.utils.java_udtf_utils import (
+    JAVA_UDTF_PREFIX,
+    create_java_udtf_for_scala_flatmap_handling,
+)
 from snowflake.snowpark_connect.utils.pandas_udtf_utils import (
     create_pandas_udtf,
     create_pandas_udtf_with_arrow,
@@ -53,18 +61,18 @@ def _call_udtf(
         ).cast("int"),
     )
-    udtf_columns = input_df.columns + [snowpark_fn.col("_DUMMY_PARTITION_KEY")]
+    udtf_columns = [f"snowflake_jtf_{column}" for column in input_df.columns] + [
+        "_DUMMY_PARTITION_KEY"
+    ]
     tfc = snowpark_fn.call_table_function(udtf_name, *udtf_columns).over(
         partition_by=[snowpark_fn.col("_DUMMY_PARTITION_KEY")]
     )
-    # Use map_partitions_depth only when mapping non nested map_partitions
-    # When mapping chained functions additional column casting is necessary
-    if map_partitions_depth() == 1:
-        result_df_with_dummy = input_df_with_dummy.join_table_function(tfc)
-    else:
-        result_df_with_dummy = input_df_with_dummy.select(tfc)
+    # Overwrite the input_df columns to prevent name conflicts with UDTF output columns
+    result_df_with_dummy = input_df_with_dummy.to_df(udtf_columns).join_table_function(
+        tfc
+    )
     output_cols = [field.name for field in return_type.fields]
@@ -95,6 +103,73 @@ def _map_with_pandas_udtf(
         else udf_proto.scalar_scala_udf.outputType
     )
+    if udf_proto.WhichOneof("function") == "scalar_scala_udf":
+        assert (
+            len(udf_proto.scalar_scala_udf.inputTypes) == 1
+        ), "len(inputTypes) should be 1 for map and flatMap operations"
+        udtf_name = create_java_udtf_for_scala_flatmap_handling(udf_proto)
+        if udf_proto.scalar_scala_udf.inputTypes[0].WhichOneof("kind") == "struct":
+            spark_col_name, typed_col = map_unresolved_star_as_single_column(
+                udf_proto.arguments[0],
+                input_df_container.column_map,
+                ExpressionTyper(input_df),
+            )
+            udtf_arg_column = typed_col.col
+        else:
+            udtf_arg_column = snowpark_fn.col(
+                input_df_container.column_map.get_snowpark_columns()[0]
+            )
+            spark_col_name = input_df_container.column_map.get_spark_columns()[0]
+        if udf_proto.scalar_scala_udf.inputTypes[0].WhichOneof("kind") in (
+            "map",
+            "array",
+        ):
+            udtf_arg_column = snowpark_fn.to_variant(udtf_arg_column)
+        new_snowpark_col_name = make_unique_snowpark_name(spark_col_name)
+        df = input_df.join_table_function(
+            snowpark_fn.call_table_function(udtf_name, udtf_arg_column)
+        )
+        df = df.select(
+            snowpark_fn.cast(
+                snowpark_fn.col(JAVA_UDTF_PREFIX + "C1"), return_type
+            ).alias(new_snowpark_col_name)
+        )
+        if udf_proto.scalar_scala_udf.outputType.WhichOneof("kind") == "struct":
+            spark_names = [field.name for field in return_type.fields]
+            output_snowpark_names = [
+                make_unique_snowpark_name(name) for name in spark_names
+            ]
+            output_types = [field.datatype for field in return_type.fields]
+            cols = [
+                snowpark_fn.get(
+                    snowpark_fn.col(new_snowpark_col_name), snowpark_fn.lit(spark_name)
+                ).alias(snowpark_name)
+                for spark_name, snowpark_name in zip(spark_names, output_snowpark_names)
+            ]
+            if cols:
+                df = df.select(*cols)
+        else:
+            output_types = [return_type]
+            output_snowpark_names = [new_snowpark_col_name]
+            spark_names = [spark_col_name]
+        return DataFrameContainer.create_with_column_mapping(
+            dataframe=df,
+            spark_column_names=spark_names,
+            snowpark_column_names=output_snowpark_names,
+            snowpark_column_types=output_types,
+        )
     # Check if this is mapInArrow (eval_type == 207)
     map_in_arrow = (
         udf_proto.WhichOneof("function") == "python_udf"

snowflake/snowpark_connect/relation/map_relation.py CHANGED Viewed

@@ -8,14 +8,16 @@ import pandas
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.utils.cache import (
     df_cache_map_get,
     df_cache_map_put_if_absent,
 )
 from snowflake.snowpark_connect.utils.context import (
     get_plan_id_map,
-    get_session_id,
-    push_map_partitions,
+    get_spark_session_id,
+    not_resolving_fun_args,
     push_operation_scope,
     set_is_aggregate_function,
     set_plan_id_map,
@@ -73,7 +75,7 @@ def map_relation(
     if reuse_parsed_plan and rel.HasField("common") and rel.common.HasField("plan_id"):
         # TODO: remove get_session_id() when we host SAS in Snowflake server
         # Check for cached relation
-        cache_entry = df_cache_map_get((get_session_id(), rel.common.plan_id))
+        cache_entry = df_cache_map_get((get_spark_session_id(), rel.common.plan_id))
         if cache_entry is not None:
             if isinstance(cache_entry, DataFrameContainer):
                 set_plan_id_map(rel.common.plan_id, cache_entry)
@@ -103,7 +105,9 @@ def map_relation(
     else:
         # This happens when the relation is empty, usually because the incoming message
         # type was incorrectly routed here.
-        raise SnowparkConnectNotImplementedError("No Relation Type")
+        exception = SnowparkConnectNotImplementedError("No Relation Type")
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
     result: DataFrameContainer | pandas.DataFrame
     operation = rel.WhichOneof("rel_type")
@@ -121,11 +125,19 @@ def map_relation(
                     case relation_proto.Aggregate.GroupType.GROUP_TYPE_PIVOT:
                         result = map_aggregate.map_pivot_aggregate(rel)
                     case other:
-                        raise SnowparkConnectNotImplementedError(f"AGGREGATE {other}")
+                        exception = SnowparkConnectNotImplementedError(
+                            f"AGGREGATE {other}"
+                        )
+                        attach_custom_error_code(
+                            exception, ErrorCodes.UNSUPPORTED_OPERATION
+                        )
+                        raise exception
             case "approx_quantile":
                 result = map_stats.map_approx_quantile(rel)
             case "as_of_join":
-                raise SnowparkConnectNotImplementedError("AS_OF_JOIN")
+                exception = SnowparkConnectNotImplementedError("AS_OF_JOIN")
+                attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+                raise exception
             case "catalog":  # TODO: order these alphabetically
                 result = map_catalog.map_catalog(rel.catalog)
             case "collect_metrics":
@@ -150,7 +162,10 @@ def map_relation(
             case "drop_na":
                 result = map_row_ops.map_dropna(rel)
             case "extension":
-                result = map_extension.map_extension(rel)
+                # Extensions can be passed as function args, and we need to reset the context here.
+                # Matters only for resolving alias expressions in the extensions rel.
+                with not_resolving_fun_args():
+                    result = map_extension.map_extension(rel)
             case "fill_na":
                 result = map_row_ops.map_fillna(rel)
             case "filter":
@@ -167,22 +182,25 @@ def map_relation(
             case "limit":
                 result = map_row_ops.map_limit(rel)
             case "local_relation":
-                result = map_local_relation.map_local_relation(rel)
+                result = map_local_relation.map_local_relation(
+                    rel
+                ).without_materialization()
                 df_cache_map_put_if_absent(
-                    (get_session_id(), rel.common.plan_id), lambda: result, False
+                    (get_spark_session_id(), rel.common.plan_id), lambda: result
                 )
             case "cached_local_relation":
                 cached_df = df_cache_map_get(
-                    (get_session_id(), rel.cached_local_relation.hash)
+                    (get_spark_session_id(), rel.cached_local_relation.hash)
                 )
                 if cached_df is None:
-                    raise ValueError(
+                    exception = ValueError(
                         f"Local relation with hash {rel.cached_local_relation.hash} not found in cache."
                     )
+                    attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+                    raise exception
                 return cached_df
             case "map_partitions":
-                with push_map_partitions():
-                    result = map_map_partitions.map_map_partitions(rel)
+                result = map_map_partitions.map_map_partitions(rel)
             case "offset":
                 result = map_row_ops.map_offset(rel)
             case "project":
@@ -214,14 +232,13 @@ def map_relation(
             case "sample":
                 sampled_df_not_evaluated = map_row_ops.map_sample(rel)
                 df_cache_map_put_if_absent(
-                    (get_session_id(), rel.common.plan_id),
+                    (get_spark_session_id(), rel.common.plan_id),
                     lambda: sampled_df_not_evaluated,
-                    True,
                 )
                 # We will retrieve from cache and return that, because insertion to cache
                 # triggers evaluation.
-                result = df_cache_map_get((get_session_id(), rel.common.plan_id))
+                result = df_cache_map_get((get_spark_session_id(), rel.common.plan_id))
             case "sample_by":
                 result = map_sample_by.map_sample_by(rel)
             case "set_op":
@@ -233,7 +250,13 @@ def map_relation(
                     case relation_proto.SetOperation.SetOpType.SET_OP_TYPE_EXCEPT:
                         result = map_row_ops.map_except(rel)
                     case other:
-                        raise SnowparkConnectNotImplementedError(f"SET_OP {other}")
+                        exception = SnowparkConnectNotImplementedError(
+                            f"SET_OP {other}"
+                        )
+                        attach_custom_error_code(
+                            exception, ErrorCodes.UNSUPPORTED_OPERATION
+                        )
+                        raise exception
             case "show_string":
                 result = map_show_string.map_show_string(rel)
             case "sort":
@@ -259,11 +282,17 @@ def map_relation(
             case "with_columns_renamed":
                 result = map_column_ops.map_with_columns_renamed(rel)
             case "with_relations":
-                raise SnowparkConnectNotImplementedError("WITH_RELATIONS")
+                exception = SnowparkConnectNotImplementedError("WITH_RELATIONS")
+                attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+                raise exception
             case "group_map":
                 result = map_column_ops.map_group_map(rel)
             case other:
-                raise SnowparkConnectNotImplementedError(f"Other Relation {other}")
+                exception = SnowparkConnectNotImplementedError(
+                    f"Other Relation {other}"
+                )
+                attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+                raise exception
         # Store container in plan cache
         if isinstance(result, DataFrameContainer):

snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl