snowpark-connect 0.22.1__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of snowpark-connect has been flagged as potentially problematic.

Files changed (42)
  1. snowflake/snowpark_connect/config.py +0 -11
  2. snowflake/snowpark_connect/error/error_utils.py +7 -0
  3. snowflake/snowpark_connect/error/exceptions.py +4 -0
  4. snowflake/snowpark_connect/expression/hybrid_column_map.py +192 -0
  5. snowflake/snowpark_connect/expression/literal.py +9 -12
  6. snowflake/snowpark_connect/expression/map_cast.py +20 -4
  7. snowflake/snowpark_connect/expression/map_expression.py +8 -1
  8. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +32 -5
  9. snowflake/snowpark_connect/expression/map_unresolved_function.py +66 -6
  10. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +8 -8
  11. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +4 -2
  12. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +127 -21
  13. snowflake/snowpark_connect/relation/map_aggregate.py +57 -5
  14. snowflake/snowpark_connect/relation/map_column_ops.py +38 -6
  15. snowflake/snowpark_connect/relation/map_extension.py +58 -24
  16. snowflake/snowpark_connect/relation/map_local_relation.py +8 -1
  17. snowflake/snowpark_connect/relation/map_row_ops.py +30 -1
  18. snowflake/snowpark_connect/relation/map_sql.py +22 -5
  19. snowflake/snowpark_connect/relation/read/map_read.py +2 -1
  20. snowflake/snowpark_connect/relation/read/map_read_parquet.py +8 -1
  21. snowflake/snowpark_connect/relation/read/reader_config.py +9 -0
  22. snowflake/snowpark_connect/relation/read/utils.py +7 -6
  23. snowflake/snowpark_connect/relation/utils.py +170 -1
  24. snowflake/snowpark_connect/relation/write/map_write.py +243 -68
  25. snowflake/snowpark_connect/server.py +25 -5
  26. snowflake/snowpark_connect/type_mapping.py +2 -2
  27. snowflake/snowpark_connect/utils/env_utils.py +55 -0
  28. snowflake/snowpark_connect/utils/session.py +21 -0
  29. snowflake/snowpark_connect/version.py +1 -1
  30. snowflake/snowpark_decoder/spark_decoder.py +1 -1
  31. {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/METADATA +2 -2
  32. {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/RECORD +40 -40
  33. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +0 -4
  34. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +0 -4
  35. {snowpark_connect-0.22.1.data → snowpark_connect-0.23.0.data}/scripts/snowpark-connect +0 -0
  36. {snowpark_connect-0.22.1.data → snowpark_connect-0.23.0.data}/scripts/snowpark-session +0 -0
  37. {snowpark_connect-0.22.1.data → snowpark_connect-0.23.0.data}/scripts/snowpark-submit +0 -0
  38. {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/WHEEL +0 -0
  39. {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/LICENSE-binary +0 -0
  40. {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/LICENSE.txt +0 -0
  41. {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/NOTICE-binary +0 -0
  42. {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,10 @@ import typing
  import pandas
  import pyspark.sql.connect.proto.common_pb2 as common_proto
  import pyspark.sql.connect.proto.types_pb2 as types_proto
- from snowflake.core.exceptions import NotFoundError
+ from pyspark.sql.connect.client.core import Retrying
+ from snowflake.core.exceptions import APIError, NotFoundError
+ from snowflake.core.schema import Schema
+ from snowflake.core.table import Table, TableColumn

  from snowflake.snowpark import functions
  from snowflake.snowpark._internal.analyzer.analyzer_utils import (
@@ -22,6 +25,7 @@ from snowflake.snowpark_connect.config import (
      global_config,
  )
  from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+ from snowflake.snowpark_connect.error.exceptions import MaxRetryExceeded
  from snowflake.snowpark_connect.relation.catalogs.abstract_spark_catalog import (
      AbstractSparkCatalog,
      _get_current_snowflake_schema,
@@ -39,6 +43,37 @@ from snowflake.snowpark_connect.utils.telemetry import (
  from snowflake.snowpark_connect.utils.udf_cache import cached_udf


+ def _is_retryable_api_error(e: Exception) -> bool:
+     """
+     Determine if an APIError should be retried.
+
+     Only retry on server errors, rate limiting, and transient network issues.
+     Don't retry on client errors like authentication, authorization, or validation failures.
+     """
+     if not isinstance(e, APIError):
+         return False
+
+     # Check if the error has a status_code attribute
+     if hasattr(e, "status_code"):
+         # Retry on server errors (5xx), rate limiting (429), and some client errors (400)
+         # 400 can be transient in some cases (like the original error trace shows)
+         return e.status_code in [400, 429, 500, 502, 503, 504]
+
+     # For APIErrors without explicit status codes, check the message
+     error_msg = str(e).lower()
+     retryable_patterns = [
+         "timeout",
+         "connection",
+         "network",
+         "unavailable",
+         "temporary",
+         "rate limit",
+         "throttle",
+     ]
+
+     return any(pattern in error_msg for pattern in retryable_patterns)
+
+
  def _normalize_identifier(identifier: str | None) -> str | None:
      if identifier is None:
          return None
@@ -73,10 +108,25 @@ class SnowflakeCatalog(AbstractSparkCatalog):
          )
          sp_catalog = get_or_create_snowpark_session().catalog

-         dbs = sp_catalog.list_schemas(
-             database=sf_quote(sf_database),
-             pattern=_normalize_identifier(sf_schema),
-         )
+         dbs: list[Schema] | None = None
+         for attempt in Retrying(
+             max_retries=5,
+             initial_backoff=100, # 100ms
+             max_backoff=5000, # 5 s
+             backoff_multiplier=2.0,
+             jitter=100,
+             min_jitter_threshold=200,
+             can_retry=_is_retryable_api_error,
+         ):
+             with attempt:
+                 dbs = sp_catalog.list_schemas(
+                     database=sf_quote(sf_database),
+                     pattern=_normalize_identifier(sf_schema),
+                 )
+         if dbs is None:
+             raise MaxRetryExceeded(
+                 f"Failed to fetch databases {f'with pattern {pattern} ' if pattern is not None else ''}after all retry attempts"
+             )
          names: list[str] = list()
          catalogs: list[str] = list()
          descriptions: list[str | None] = list()
@@ -112,9 +162,24 @@ class SnowflakeCatalog(AbstractSparkCatalog):
          )
          sp_catalog = get_or_create_snowpark_session().catalog

-         db = sp_catalog.get_schema(
-             schema=sf_quote(sf_schema), database=sf_quote(sf_database)
-         )
+         db: Schema | None = None
+         for attempt in Retrying(
+             max_retries=5,
+             initial_backoff=100, # 100ms
+             max_backoff=5000, # 5 s
+             backoff_multiplier=2.0,
+             jitter=100,
+             min_jitter_threshold=200,
+             can_retry=_is_retryable_api_error,
+         ):
+             with attempt:
+                 db = sp_catalog.get_schema(
+                     schema=sf_quote(sf_schema), database=sf_quote(sf_database)
+                 )
+         if db is None:
+             raise MaxRetryExceeded(
+                 f"Failed to fetch database {spark_dbName} after all retry attempts"
+             )

          name = unquote_if_quoted(db.name)
          return pandas.DataFrame(
@@ -241,11 +306,27 @@ class SnowflakeCatalog(AbstractSparkCatalog):
                  "Calling into another catalog is not currently supported"
              )

-         table = sp_catalog.get_table(
-             database=sf_quote(sf_database),
-             schema=sf_quote(sf_schema),
-             table_name=sf_quote(table_name),
-         )
+         table: Table | None = None
+         for attempt in Retrying(
+             max_retries=5,
+             initial_backoff=100, # 100ms
+             max_backoff=5000, # 5 s
+             backoff_multiplier=2.0,
+             jitter=100,
+             min_jitter_threshold=200,
+             can_retry=_is_retryable_api_error,
+         ):
+             with attempt:
+                 table = sp_catalog.get_table(
+                     database=sf_quote(sf_database),
+                     schema=sf_quote(sf_schema),
+                     table_name=sf_quote(table_name),
+                 )
+
+         if table is None:
+             raise MaxRetryExceeded(
+                 f"Failed to fetch table {spark_tableName} after all retry attempts"
+             )

          return pandas.DataFrame(
              {
@@ -286,6 +367,7 @@ class SnowflakeCatalog(AbstractSparkCatalog):
      ) -> pandas.DataFrame:
          """List all columns in a table/view, optionally database name filter can be provided."""
          sp_catalog = get_or_create_snowpark_session().catalog
+         columns: list[TableColumn] | None = None
          if spark_dbName is None:
              catalog, sf_database, sf_schema, sf_table = _process_multi_layer_identifier(
                  spark_tableName
@@ -294,15 +376,39 @@
                  raise SnowparkConnectNotImplementedError(
                      "Calling into another catalog is not currently supported"
                  )
-             columns = sp_catalog.list_columns(
-                 database=sf_quote(sf_database),
-                 schema=sf_quote(sf_schema),
-                 table_name=sf_quote(sf_table),
-             )
+             for attempt in Retrying(
+                 max_retries=5,
+                 initial_backoff=100, # 100ms
+                 max_backoff=5000, # 5 s
+                 backoff_multiplier=2.0,
+                 jitter=100,
+                 min_jitter_threshold=200,
+                 can_retry=_is_retryable_api_error,
+             ):
+                 with attempt:
+                     columns = sp_catalog.list_columns(
+                         database=sf_quote(sf_database),
+                         schema=sf_quote(sf_schema),
+                         table_name=sf_quote(sf_table),
+                     )
          else:
-             columns = sp_catalog.list_columns(
-                 schema=sf_quote(spark_dbName),
-                 table_name=sf_quote(spark_tableName),
+             for attempt in Retrying(
+                 max_retries=5,
+                 initial_backoff=100, # 100ms
+                 max_backoff=5000, # 5 s
+                 backoff_multiplier=2.0,
+                 jitter=100,
+                 min_jitter_threshold=200,
+                 can_retry=_is_retryable_api_error,
+             ):
+                 with attempt:
+                     columns = sp_catalog.list_columns(
+                         schema=sf_quote(spark_dbName),
+                         table_name=sf_quote(spark_tableName),
+                     )
+         if columns is None:
+             raise MaxRetryExceeded(
+                 f"Failed to fetch columns of {spark_tableName} after all retry attempts"
              )
          names: list[str] = list()
          descriptions: list[str | None] = list()
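
Note on the retry logic above: the catalog calls (list_schemas, get_schema, get_table, list_columns) are now wrapped in pyspark's Retrying loop with exponential backoff, gated by _is_retryable_api_error. A minimal, self-contained sketch of that classification behaviour follows; it is illustration only, FakeAPIError is a stand-in for snowflake.core.exceptions.APIError so the snippet runs offline, and the real predicate keys off hasattr(e, "status_code") rather than a None check.

    class FakeAPIError(Exception):
        """Illustration only: stands in for snowflake.core.exceptions.APIError."""
        def __init__(self, message="", status_code=None):
            super().__init__(message)
            self.status_code = status_code

    def is_retryable(e):
        # Same classification idea as _is_retryable_api_error above, slightly simplified.
        if not isinstance(e, FakeAPIError):
            return False
        if e.status_code is not None:
            return e.status_code in (400, 429, 500, 502, 503, 504)
        msg = str(e).lower()
        return any(p in msg for p in ("timeout", "connection", "network",
                                      "unavailable", "temporary", "rate limit", "throttle"))

    assert is_retryable(FakeAPIError(status_code=503))      # server error -> retried
    assert not is_retryable(FakeAPIError(status_code=401))  # auth error -> surfaced immediately
    assert is_retryable(FakeAPIError("connection reset"))   # transient message -> retried
    assert not is_retryable(ValueError("bad input"))        # not an APIError -> surfaced
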
@@ -153,11 +153,63 @@ def map_pivot_aggregate(
          used_columns.add(mapped_col[0].snowpark_name)

      if len(columns.grouping_expressions()) == 0:
-         result = (
-             input_df_actual.select(*used_columns)
-             .pivot(pivot_column[1].col, pivot_values if pivot_values else None)
-             .agg(*columns.aggregation_expressions(unalias=True))
-         )
+         # Snowpark doesn't support multiple aggregations in pivot without groupBy
+         # So we need to perform each aggregation separately and then combine results
+         if len(columns.aggregation_expressions(unalias=True)) > 1:
+             agg_expressions = columns.aggregation_expressions(unalias=True)
+             agg_metadata = columns.aggregation_columns
+             num_agg_functions = len(agg_expressions)
+
+             spark_names = []
+             pivot_results = []
+             for i, agg_expr in enumerate(agg_expressions):
+                 pivot_result = (
+                     input_df_actual.select(*used_columns)
+                     .pivot(pivot_column[1].col, pivot_values if pivot_values else None)
+                     .agg(agg_expr)
+                 )
+                 for col_name in pivot_result.columns:
+                     spark_names.append(
+                         f"{pivot_column_name(col_name)}_{agg_metadata[i].spark_name}"
+                     )
+                 pivot_results.append(pivot_result)
+
+             result = pivot_results[0]
+             for pivot_result in pivot_results[1:]:
+                 result = result.cross_join(pivot_result)
+
+             pivot_columns_per_agg = len(pivot_results[0].columns)
+             reordered_spark_names = []
+             reordered_snowpark_names = []
+             reordered_types = []
+             column_selectors = []
+
+             for pivot_idx in range(pivot_columns_per_agg):
+                 for agg_idx in range(num_agg_functions):
+                     current_pos = agg_idx * pivot_columns_per_agg + pivot_idx
+                     if current_pos < len(spark_names):
+                         idx = current_pos + 1 # 1-based indexing for Snowpark
+                         reordered_spark_names.append(spark_names[current_pos])
+                         reordered_snowpark_names.append(f"${idx}")
+                         reordered_types.append(
+                             result.schema.fields[current_pos].datatype
+                         )
+                         column_selectors.append(snowpark_fn.col(f"${idx}"))
+
+             return DataFrameContainer.create_with_column_mapping(
+                 dataframe=result.select(*column_selectors),
+                 spark_column_names=reordered_spark_names,
+                 snowpark_column_names=reordered_snowpark_names,
+                 column_qualifiers=[[]] * len(reordered_spark_names),
+                 parent_column_name_map=input_container.column_map,
+                 snowpark_column_types=reordered_types,
+             )
+         else:
+             result = (
+                 input_df_actual.select(*used_columns)
+                 .pivot(pivot_column[1].col, pivot_values if pivot_values else None)
+                 .agg(*columns.aggregation_expressions(unalias=True))
+             )
      else:
          result = (
              input_df_actual.group_by(*columns.grouping_expressions())
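
The pivot branch above targets Spark usage like the sketch below (illustrative only; assumes an active Snowpark Connect session bound to spark). With no grouping columns and more than one aggregation, each aggregation is now pivoted separately, the per-aggregation results are cross-joined, and columns are reordered and renamed as <pivot_value>_<aggregation>.

    from pyspark.sql import functions as F

    sales = spark.createDataFrame(
        [("2024", "A", 10, 1.5), ("2024", "B", 20, 2.5)],
        ["year", "cat", "qty", "price"],
    )
    # Two aggregations in a pivot without groupBy columns.
    wide = sales.groupBy().pivot("cat", ["A", "B"]).agg(F.sum("qty"), F.avg("price"))
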
@@ -6,10 +6,12 @@ import ast
  import json
  import sys
  from collections import defaultdict
+ from copy import copy

  import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto
  import pyspark.sql.connect.proto.relations_pb2 as relation_proto
  import pyspark.sql.connect.proto.types_pb2 as types_proto
+ from pyspark.errors import PySparkValueError
  from pyspark.errors.exceptions.base import AnalysisException
  from pyspark.serializers import CloudPickleSerializer

@@ -44,6 +46,7 @@ from snowflake.snowpark_connect.expression.typer import ExpressionTyper
  from snowflake.snowpark_connect.relation.map_relation import map_relation
  from snowflake.snowpark_connect.relation.utils import (
      TYPE_MAP_FOR_TO_SCHEMA,
+     can_sort_be_flattened,
      snowpark_functions_col,
  )
  from snowflake.snowpark_connect.type_mapping import (
@@ -266,6 +269,7 @@ def map_project(

          aliased_col = mapper.col.alias(snowpark_column)
          select_list.append(aliased_col)
+
          new_snowpark_columns.append(snowpark_column)
          new_spark_columns.append(spark_name)
          column_types.extend(mapper.types)
@@ -342,6 +346,12 @@ def map_sort(

      sort_order = sort.order

+     if not sort_order:
+         raise PySparkValueError(
+             error_class="CANNOT_BE_EMPTY",
+             message="At least one column must be specified.",
+         )
+
      if len(sort_order) == 1:
          parsed_col_name = split_fully_qualified_spark_name(
              sort_order[0].child.unresolved_attribute.unparsed_identifier
@@ -422,7 +432,30 @@
      # TODO: sort.isglobal.
      if not order_specified:
          ascending = None
-     result = input_df.sort(cols, ascending=ascending)
+
+     select_statement = getattr(input_df, "_select_statement", None)
+     sort_expressions = [c._expression for c in cols]
+     if (
+         can_sort_be_flattened(select_statement, *sort_expressions)
+         and input_df._ops_after_agg is None
+     ):
+         # "flattened" order by that will allow using dropped columns
+         new = copy(select_statement)
+         new.from_ = select_statement.from_.to_subqueryable()
+         new.pre_actions = new.from_.pre_actions
+         new.post_actions = new.from_.post_actions
+         new.order_by = sort_expressions + (select_statement.order_by or [])
+         new.column_states = select_statement.column_states
+         new._merge_projection_complexity_with_subquery = False
+         new.df_ast_ids = (
+             select_statement.df_ast_ids.copy()
+             if select_statement.df_ast_ids is not None
+             else None
+         )
+         new.attributes = select_statement.attributes
+         result = input_df._with_plan(new)
+     else:
+         result = input_df.sort(cols, ascending=ascending)

      return DataFrameContainer(
          result,
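
A usage sketch of what the flattened ORDER BY above is for (illustrative; assumes an active session spark): Spark lets you sort by a column that an earlier projection dropped, and the flattened plan keeps the ORDER BY in the same SELECT so that column stays addressable instead of being hidden behind a subquery.

    from pyspark.sql import functions as F

    people = spark.createDataFrame([("a", 30), ("b", 25)], ["name", "age"])
    names_by_age = people.select("name").orderBy(F.col("age"))  # "age" was dropped by the select
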
@@ -1075,14 +1108,12 @@ def map_group_map(
      snowpark_grouping_expressions: list[snowpark.Column] = []
      typer = ExpressionTyper(input_df)
      group_name_list: list[str] = []
-     qualifiers = []
      for exp in grouping_expressions:
          new_name, snowpark_column = map_single_column_expression(
              exp, input_container.column_map, typer
          )
          snowpark_grouping_expressions.append(snowpark_column.col)
          group_name_list.append(new_name)
-         qualifiers.append(snowpark_column.get_qualifiers())
      if rel.group_map.func.python_udf is None:
          raise ValueError("group_map relation without python udf is not supported")

@@ -1124,13 +1155,14 @@
      result = input_df.group_by(*snowpark_grouping_expressions).apply_in_pandas(
          callable_func, output_type
      )
-
-     qualifiers.extend([[]] * (len(result.columns) - len(group_name_list)))
+     # The UDTF `apply_in_pandas` generates a new table whose output schema
+     # can be entirely different from that of the input Snowpark DataFrame.
+     # As a result, the output DataFrame should not use qualifiers based on the input group by columns.
      return DataFrameContainer.create_with_column_mapping(
          dataframe=result,
          spark_column_names=[field.name for field in output_type],
          snowpark_column_names=result.columns,
-         column_qualifiers=qualifiers,
+         column_qualifiers=None,
          parent_column_name_map=input_container.column_map,
      )

@@ -374,23 +374,31 @@ def map_aggregate(
      snowpark_columns: list[str] = []
      snowpark_column_types: list[snowpark_types.DataType] = []

-     def _add_column(spark_name: str, snowpark_column: TypedColumn) -> snowpark.Column:
-         alias = make_column_names_snowpark_compatible(
-             [spark_name], plan_id, len(spark_columns)
-         )[0]
+     # Use grouping columns directly without aliases
+     groupings = [col.col for _, col in raw_groupings]
+
+     # Create aliases only for aggregation columns
+     aggregations = []
+     for i, (spark_name, snowpark_column) in enumerate(raw_aggregations):
+         alias = make_column_names_snowpark_compatible([spark_name], plan_id, i)[0]

          spark_columns.append(spark_name)
          snowpark_columns.append(alias)
          snowpark_column_types.append(snowpark_column.typ)

-         return snowpark_column.col.alias(alias)
-
-     groupings = [_add_column(name, col) for name, col in raw_groupings]
-     aggregations = [_add_column(name, col) for name, col in raw_aggregations]
+         aggregations.append(snowpark_column.col.alias(alias))

      match aggregate.group_type:
          case snowflake_proto.Aggregate.GROUP_TYPE_GROUPBY:
-             result = input_df.group_by(groupings)
+             if groupings:
+                 # Normal GROUP BY with explicit grouping columns
+                 result = input_df.group_by(groupings)
+             else:
+                 # No explicit GROUP BY - this is an aggregate over the entire table
+                 # Use a dummy constant that will be excluded from the final result
+                 result = input_df.with_column(
+                     "__dummy_group__", snowpark_fn.lit(1)
+                 ).group_by("__dummy_group__")
          case snowflake_proto.Aggregate.GROUP_TYPE_ROLLUP:
              result = input_df.rollup(groupings)
          case snowflake_proto.Aggregate.GROUP_TYPE_CUBE:
@@ -410,28 +418,54 @@
                  f"Unsupported GROUP BY type: {other}"
              )

-     result = result.agg(*aggregations)
+     result = result.agg(*aggregations, exclude_grouping_columns=True)
+
+     # If we added a dummy grouping column, make sure it's excluded
+     if not groupings and "__dummy_group__" in result.columns:
+         result = result.drop("__dummy_group__")
+
+     # Apply HAVING condition if present
+     if aggregate.HasField("having_condition"):
+         from snowflake.snowpark_connect.expression.hybrid_column_map import (
+             create_hybrid_column_map_for_having,
+         )
+
+         # Create aggregated DataFrame column map
+         aggregated_column_map = DataFrameContainer.create_with_column_mapping(
+             dataframe=result,
+             spark_column_names=spark_columns,
+             snowpark_column_names=snowpark_columns,
+             snowpark_column_types=snowpark_column_types,
+         ).column_map
+
+         # Create hybrid column map that can resolve both input and aggregate contexts
+         hybrid_map = create_hybrid_column_map_for_having(
+             input_df=input_df,
+             input_column_map=input_container.column_map,
+             aggregated_df=result,
+             aggregated_column_map=aggregated_column_map,
+             aggregate_expressions=list(aggregate.aggregate_expressions),
+             grouping_expressions=list(aggregate.grouping_expressions),
+             spark_columns=spark_columns,
+             raw_aggregations=raw_aggregations,
+         )
+
+         # Map the HAVING condition using hybrid resolution
+         _, having_column = hybrid_map.resolve_expression(aggregate.having_condition)
+
+         # Apply the HAVING filter
+         result = result.filter(having_column.col)

      if aggregate.group_type == snowflake_proto.Aggregate.GROUP_TYPE_GROUPING_SETS:
          # Immediately drop extra columns. Unlike other GROUP BY operations,
          # grouping sets don't allow ORDER BY with columns that aren't in the aggregate list.
-         result = result.select(result.columns[-len(spark_columns) :])
+         result = result.select(result.columns[-len(aggregations) :])

-     # Build a parent column map that includes groupings.
-     result_container = DataFrameContainer.create_with_column_mapping(
+     # Return only aggregation columns in the column map
+     return DataFrameContainer.create_with_column_mapping(
          dataframe=result,
          spark_column_names=spark_columns,
          snowpark_column_names=snowpark_columns,
          snowpark_column_types=snowpark_column_types,
-     )
-
-     # Drop the groupings.
-     grouping_count = len(groupings)
-
-     return DataFrameContainer.create_with_column_mapping(
-         result.drop(snowpark_columns[:grouping_count]),
-         spark_columns[grouping_count:],
-         snowpark_columns[grouping_count:],
-         snowpark_column_types[grouping_count:],
-         parent_column_name_map=result_container.column_map,
+         parent_column_name_map=input_df._column_map,
      )
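
Two client-side patterns the aggregate changes above target (illustrative only; assumes an active session spark): a global aggregate with no GROUP BY, which is now executed by grouping on a dummy constant that is dropped afterwards, and a HAVING clause, whose predicate is resolved through the new hybrid column map because it can reference both input columns and aggregate expressions.

    from pyspark.sql import functions as F

    df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 5)], ["k", "v"])
    df.createOrReplaceTempView("t")

    # Global aggregate: no grouping expressions at all.
    totals = df.agg(F.sum("v").alias("total"), F.count("*").alias("n"))

    # HAVING over an aggregate expression.
    filtered = spark.sql("SELECT k, SUM(v) AS s FROM t GROUP BY k HAVING SUM(v) > 2")
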
@@ -4,6 +4,7 @@

  import json
  import re
+ from json import JSONDecodeError

  import numpy as np
  import pyarrow as pa
@@ -19,6 +20,7 @@ from snowflake.snowpark_connect.column_name_handler import (
  )
  from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
  from snowflake.snowpark_connect.type_mapping import (
+     get_python_sql_utils_class,
      map_json_schema_to_snowpark,
      map_pyarrow_to_snowpark_types,
      map_simple_types,
@@ -34,7 +36,12 @@ def parse_local_relation_schema_string(rel: relation_proto.Relation):
      # schema_str can be a dict, or just a type string, e.g. INTEGER.
      schema_str = rel.local_relation.schema
      assert schema_str
-     schema_dict = json.loads(schema_str)
+     try:
+         schema_dict = json.loads(schema_str)
+     except JSONDecodeError:
+         # Legacy scala clients sends unparsed struct type strings like "struct<id:bigint,a:int,b:double>"
+         spark_datatype = get_python_sql_utils_class().parseDataType(schema_str)
+         schema_dict = json.loads(spark_datatype.json())

      column_metadata = {}
      if isinstance(schema_dict, dict):
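
A quick, runnable illustration of the fallback above: legacy Scala clients send a DDL-style schema string rather than JSON, so json.loads raises JSONDecodeError and the code falls back to Spark's DataType parser.

    import json

    legacy_schema = "struct<id:bigint,a:int,b:double>"  # not valid JSON
    try:
        json.loads(legacy_schema)
    except json.JSONDecodeError:
        # This is the branch the new except clause handles; the real code re-parses the
        # string with Spark's DataType utilities and converts the result back to JSON.
        print("falling back to Spark's DataType parser")
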
@@ -1,6 +1,7 @@
  #
  # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
  #
+ from copy import copy

  import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto
  import pyspark.sql.connect.proto.relations_pb2 as relation_proto
@@ -8,6 +9,7 @@ from pyspark.errors.exceptions.base import AnalysisException, IllegalArgumentExc

  import snowflake.snowpark_connect.relation.utils as utils
  from snowflake import snowpark
+ from snowflake.snowpark._internal.analyzer.binary_expression import And
  from snowflake.snowpark.functions import col, expr as snowpark_expr
  from snowflake.snowpark.types import (
      BooleanType,
@@ -29,6 +31,7 @@ from snowflake.snowpark_connect.expression.map_expression import (
  )
  from snowflake.snowpark_connect.expression.typer import ExpressionTyper
  from snowflake.snowpark_connect.relation.map_relation import map_relation
+ from snowflake.snowpark_connect.relation.utils import can_filter_be_flattened
  from snowflake.snowpark_connect.utils.telemetry import (
      SnowparkConnectNotImplementedError,
  )
@@ -551,7 +554,33 @@ def map_filter(
      _, condition = map_single_column_expression(
          rel.filter.condition, input_container.column_map, typer
      )
-     result = input_df.filter(condition.col)
+
+     select_statement = getattr(input_df, "_select_statement", None)
+     condition_exp = condition.col._expression
+     if (
+         can_filter_be_flattened(select_statement, condition_exp)
+         and input_df._ops_after_agg is None
+     ):
+         new = copy(select_statement)
+         new.from_ = select_statement.from_.to_subqueryable()
+         new.pre_actions = new.from_.pre_actions
+         new.post_actions = new.from_.post_actions
+         new.column_states = select_statement.column_states
+         new.where = (
+             And(select_statement.where, condition_exp)
+             if select_statement.where is not None
+             else condition_exp
+         )
+         new._merge_projection_complexity_with_subquery = False
+         new.df_ast_ids = (
+             select_statement.df_ast_ids.copy()
+             if select_statement.df_ast_ids is not None
+             else None
+         )
+         new.attributes = select_statement.attributes
+         result = input_df._with_plan(new)
+     else:
+         result = input_df.filter(condition.col)

      return DataFrameContainer(
          result,
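
Illustrative usage for the filter flattening above (assumes an active session spark): when can_filter_be_flattened allows it, chained filters are merged into a single WHERE clause (existing predicate AND new predicate) on the same SELECT instead of nesting one subquery per .filter() call.

    from pyspark.sql import functions as F

    df = spark.range(100).select((F.col("id") * 2).alias("doubled"), "id")
    narrowed = df.filter(F.col("doubled") > 10).filter(F.col("id") < 40)
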
@@ -77,6 +77,9 @@ from ..expression.map_sql_expression import (
  from ..utils.identifiers import spark_to_sf_single_id

  _ctes = ContextVar[dict[str, relation_proto.Relation]]("_ctes", default={})
+ _having_condition = ContextVar[expressions_proto.Expression | None](
+     "_having_condition", default=None
+ )


  def _is_sql_select_statement_helper(sql_string: str) -> bool:
@@ -1146,6 +1149,7 @@ def map_logical_plan_relation(
                          grouping_expressions=grouping_expressions,
                          aggregate_expressions=aggregate_expressions,
                          grouping_sets=grouping_sets,
+                         having_condition=_having_condition.get(),
                      )
                  )
              )
@@ -1389,12 +1393,25 @@
              )
          )
      case "UnresolvedHaving":
-         proto = relation_proto.Relation(
-             filter=relation_proto.Filter(
-                 input=map_logical_plan_relation(rel.child()),
-                 condition=map_logical_plan_expression(rel.havingCondition()),
+         # Store the having condition in context and process the child aggregate
+         child_relation = rel.child()
+         if str(child_relation.getClass().getSimpleName()) != "Aggregate":
+             raise SnowparkConnectNotImplementedError(
+                 "UnresolvedHaving can only be applied to Aggregate relations"
              )
-         )
+
+         # Store having condition in a context variable for the Aggregate case to pick up
+         having_condition = map_logical_plan_expression(rel.havingCondition())
+
+         # Store in thread-local context (similar to how _ctes works)
+         token = _having_condition.set(having_condition)
+
+         try:
+             # Recursively call map_logical_plan_relation on the child Aggregate
+             # The Aggregate case will pick up the having condition from context
+             proto = map_logical_plan_relation(child_relation, plan_id)
+         finally:
+             _having_condition.reset(token)
      case "UnresolvedHint":
          proto = relation_proto.Relation(
              hint=relation_proto.Hint(
@@ -95,7 +95,8 @@ def map_read(
      if len(rel.read.data_source.paths) > 0:
          # Normalize paths to ensure consistent behavior
          clean_source_paths = [
-             str(Path(path)) for path in rel.read.data_source.paths
+             path.rstrip("/") if is_cloud_path(path) else str(Path(path))
+             for path in rel.read.data_source.paths
          ]

          result = _read_file(
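
Why cloud paths are special-cased in the hunk above (runnable illustration): pathlib collapses the double slash after the URI scheme, so only local paths go through Path(), while cloud URIs just have a trailing slash stripped. The sketch uses PurePosixPath to stay platform-independent.

    from pathlib import PurePosixPath

    print(str(PurePosixPath("s3://bucket/data/")))  # 's3:/bucket/data' -- scheme gets mangled
    print("s3://bucket/data/".rstrip("/"))          # 's3://bucket/data' -- what the new code does
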
@@ -54,10 +54,17 @@ def map_read_parquet(
      if len(paths) == 1:
          df = _read_parquet_with_partitions(session, reader, paths[0])
      else:
+         is_merge_schema = options.config.get("mergeschema")
          df = _read_parquet_with_partitions(session, reader, paths[0])
+         schema_cols = df.columns
          for p in paths[1:]:
              reader._user_schema = None
-             df = df.union_all(_read_parquet_with_partitions(session, reader, p))
+             df = df.union_all_by_name(
+                 _read_parquet_with_partitions(session, reader, p),
+                 allow_missing_columns=True,
+             )
+         if not is_merge_schema:
+             df = df.select(*schema_cols)

      renamed_df, snowpark_column_names = rename_columns_as_snowflake_standard(
          df, rel.common.plan_id
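
Client-side reading this targets (illustrative; the paths are hypothetical and an active session spark is assumed): with mergeSchema enabled, multiple Parquet files are now unioned by column name with missing columns allowed; without the option, the result is trimmed back to the first file's columns.

    df_merged = (
        spark.read.option("mergeSchema", "true")
        .parquet("path/to/part1", "path/to/part2")
    )
    df_default = spark.read.parquet("path/to/part1", "path/to/part2")
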
@@ -398,3 +398,12 @@ class ParquetReaderConfig(ReaderWriterConfig):
              ),
              options,
          )
+
+     def convert_to_snowpark_args(self) -> dict[str, Any]:
+         snowpark_args = super().convert_to_snowpark_args()
+
+         # Should be determined by spark.sql.parquet.binaryAsString, but currently Snowpark Connect only supports
+         # the default value (false). TODO: Add support for spark.sql.parquet.binaryAsString equal to "true".
+         snowpark_args["BINARY_AS_TEXT"] = False
+
+         return snowpark_args