snowpark-connect 0.32.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the changes between publicly released versions of this package as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of snowpark-connect might be problematic.

Files changed (106)
  1. snowflake/snowpark_connect/column_name_handler.py +91 -40
  2. snowflake/snowpark_connect/column_qualifier.py +0 -4
  3. snowflake/snowpark_connect/config.py +9 -0
  4. snowflake/snowpark_connect/expression/hybrid_column_map.py +5 -4
  5. snowflake/snowpark_connect/expression/literal.py +12 -12
  6. snowflake/snowpark_connect/expression/map_sql_expression.py +18 -4
  7. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +150 -29
  8. snowflake/snowpark_connect/expression/map_unresolved_function.py +93 -55
  9. snowflake/snowpark_connect/relation/map_aggregate.py +156 -257
  10. snowflake/snowpark_connect/relation/map_column_ops.py +19 -0
  11. snowflake/snowpark_connect/relation/map_join.py +454 -252
  12. snowflake/snowpark_connect/relation/map_row_ops.py +136 -54
  13. snowflake/snowpark_connect/relation/map_sql.py +335 -90
  14. snowflake/snowpark_connect/relation/read/map_read.py +9 -1
  15. snowflake/snowpark_connect/relation/read/map_read_csv.py +19 -2
  16. snowflake/snowpark_connect/relation/read/map_read_json.py +90 -2
  17. snowflake/snowpark_connect/relation/read/map_read_parquet.py +3 -0
  18. snowflake/snowpark_connect/relation/read/map_read_text.py +4 -0
  19. snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
  20. snowflake/snowpark_connect/relation/read/utils.py +41 -0
  21. snowflake/snowpark_connect/relation/utils.py +50 -2
  22. snowflake/snowpark_connect/relation/write/map_write.py +251 -292
  23. snowflake/snowpark_connect/resources_initializer.py +25 -13
  24. snowflake/snowpark_connect/server.py +9 -24
  25. snowflake/snowpark_connect/type_mapping.py +2 -0
  26. snowflake/snowpark_connect/typed_column.py +2 -2
  27. snowflake/snowpark_connect/utils/context.py +0 -14
  28. snowflake/snowpark_connect/utils/expression_transformer.py +163 -0
  29. snowflake/snowpark_connect/utils/sequence.py +21 -0
  30. snowflake/snowpark_connect/utils/session.py +4 -1
  31. snowflake/snowpark_connect/utils/udf_helper.py +1 -0
  32. snowflake/snowpark_connect/utils/udtf_helper.py +3 -0
  33. snowflake/snowpark_connect/version.py +1 -1
  34. {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/METADATA +4 -2
  35. {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/RECORD +43 -104
  36. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  37. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  39. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  43. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  44. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  45. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  46. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  47. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  48. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  49. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  50. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  51. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  52. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  53. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  54. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  55. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  56. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  57. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  58. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  59. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  60. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  61. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  62. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  63. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  64. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  65. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  66. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  67. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  68. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  69. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  70. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  71. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  72. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  73. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  74. snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
  75. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  76. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  77. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  78. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  79. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  80. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  81. snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
  82. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  83. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  84. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  85. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  86. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  87. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  88. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  89. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  90. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  91. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  92. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  93. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  94. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  95. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  96. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  97. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  98. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  99. {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-connect +0 -0
  100. {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-session +0 -0
  101. {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-submit +0 -0
  102. {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/WHEEL +0 -0
  103. {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE-binary +0 -0
  104. {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
  105. {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/NOTICE-binary +0 -0
  106. {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,7 @@ import snowflake.snowpark.functions as snowpark_fn
 import snowflake.snowpark_connect.proto.snowflake_expression_ext_pb2 as snowflake_exp_proto
 import snowflake.snowpark_connect.proto.snowflake_relation_ext_pb2 as snowflake_proto
 from snowflake import snowpark
+from snowflake.snowpark import Session
 from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     quote_name_without_upper_casing,
     unquote_if_quoted,
@@ -61,6 +62,9 @@ from snowflake.snowpark_connect.relation.map_relation import (
     NATURAL_JOIN_TYPE_BASE,
     map_relation,
 )
+
+# Import from utils for consistency
+from snowflake.snowpark_connect.relation.utils import is_aggregate_function
 from snowflake.snowpark_connect.type_mapping import map_snowpark_to_pyspark_types
 from snowflake.snowpark_connect.utils.context import (
     _accessing_temp_object,
@@ -152,6 +156,48 @@ def _push_cte_scope():
         _cte_definitions.reset(def_token)


+def _process_cte_relations(cte_relations):
+    """
+    Process CTE relations and register them in the current CTE scope.
+
+    This function extracts CTE definitions from CTE relations,
+    maps them to protobuf representations, and stores them for later reference.
+
+    Args:
+        cte_relations: Java list of CTE relations (tuples of name and SubqueryAlias)
+    """
+    for cte in as_java_list(cte_relations):
+        name = str(cte._1())
+        # Store the original CTE definition for re-evaluation
+        _cte_definitions.get()[name] = cte._2()
+        # Process CTE definition with a unique plan_id to ensure proper column naming
+        # Clear HAVING condition before processing each CTE to prevent leakage between CTEs
+        saved_having = _having_condition.get()
+        _having_condition.set(None)
+        try:
+            cte_plan_id = gen_sql_plan_id()
+            cte_proto = map_logical_plan_relation(cte._2(), cte_plan_id)
+            _ctes.get()[name] = cte_proto
+        finally:
+            _having_condition.set(saved_having)
+
+
+@contextmanager
+def _with_cte_scope(cte_relations):
+    """
+    Context manager that creates a CTE scope and processes CTE relations.
+
+    This combines _push_cte_scope() and _process_cte_relations() to handle
+    the common pattern of processing CTEs within a new scope.
+
+    Args:
+        cte_relations: Java list of CTE relations (tuples of name and SubqueryAlias)
+    """
+    with (_push_cte_scope()):
+        _process_cte_relations(cte_relations)
+        yield
+
+
 @contextmanager
 def _push_window_specs_scope():
     """
@@ -258,6 +304,130 @@ def _create_table_as_select(logical_plan, mode: str) -> None:
     )


+def _insert_into_table(logical_plan, session: Session) -> None:
+    df_container = execute_logical_plan(logical_plan.query())
+    df = df_container.dataframe
+    queries = df.queries["queries"]
+    if len(queries) != 1:
+        exception = SnowparkConnectNotImplementedError(
+            f"Unexpected number of queries: {len(queries)}"
+        )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
+
+    name = get_relation_identifier_name(logical_plan.table(), True)
+
+    user_columns = [
+        spark_to_sf_single_id(str(col), is_column=True)
+        for col in as_java_list(logical_plan.userSpecifiedCols())
+    ]
+    overwrite_str = "OVERWRITE" if logical_plan.overwrite() else ""
+    cols_str = "(" + ", ".join(user_columns) + ")" if user_columns else ""
+
+    # Extract partition spec if any
+    partition_spec = logical_plan.partitionSpec()
+    partition_map = as_java_map(partition_spec)
+
+    partition_columns = {}
+    for entry in partition_map.entrySet():
+        col_name = str(entry.getKey())
+        value_option = entry.getValue()
+        if value_option.isDefined():
+            partition_columns[col_name] = value_option.get()
+
+    # Add partition columns to the dataframe
+    if partition_columns:
+        """
+        Spark sends them in the partition spec and the values won't be present in the values array.
+        As snowflake does not support static partitions in INSERT INTO statements,
+        we need to add the partition columns to the dataframe as literal columns.
+
+        ex: INSERT INTO TABLE test_table PARTITION (ds='2021-01-01', hr=10) VALUES ('k1', 100), ('k2', 200), ('k3', 300)
+
+        Spark sends: VALUES ('k1', 100), ('k2', 200), ('k3', 300) with partition spec (ds='2021-01-01', hr=10)
+        Snowflake expects: VALUES ('k1', 100, '2021-01-01', 10), ('k2', 200, '2021-01-01', 10), ('k3', 300, '2021-01-01', 10)
+
+        We need to add the partition columns to the dataframe as literal columns.
+
+        ex: df = df.withColumn('ds', snowpark_fn.lit('2021-01-01'))
+            df = df.withColumn('hr', snowpark_fn.lit(10))
+
+        Then the final query will be:
+        INSERT INTO TABLE test_table VALUES ('k1', 100, '2021-01-01', 10), ('k2', 200, '2021-01-01', 10), ('k3', 300, '2021-01-01', 10)
+        """
+        for partition_col, partition_value in partition_columns.items():
+            df = df.withColumn(partition_col, snowpark_fn.lit(partition_value))
+
+    target_table = session.table(name)
+    target_schema = target_table.schema
+
+    expected_number_of_columns = (
+        len(user_columns) if user_columns else len(target_schema.fields)
+    )
+    if expected_number_of_columns != len(df.schema.fields):
+        reason = (
+            "too many data columns"
+            if len(df.schema.fields) > expected_number_of_columns
+            else "not enough data columns"
+        )
+        exception = AnalysisException(
+            f'[INSERT_COLUMN_ARITY_MISMATCH.{reason.replace(" ", "_").upper()}] Cannot write to {name}, the reason is {reason}:\n'
+            f'Table columns: {", ".join(target_schema.names)}.\n'
+            f'Data columns: {", ".join(df.schema.names)}.'
+        )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
+
+    try:
+        # Modify df with type conversions and struct field name mapping
+        modified_columns = []
+        for source_field, target_field in zip(df.schema.fields, target_schema.fields):
+            col_name = source_field.name
+
+            # Handle different type conversions
+            if isinstance(
+                target_field.datatype, snowpark.types.DecimalType
+            ) and isinstance(
+                source_field.datatype,
+                (snowpark.types.FloatType, snowpark.types.DoubleType),
+            ):
+                # Add CASE WHEN to convert NaN to NULL for DECIMAL targets
+                # Only apply this to floating-point source columns
+                modified_col = (
+                    snowpark_fn.when(
+                        snowpark_fn.equal_nan(snowpark_fn.col(col_name)),
+                        snowpark_fn.lit(None),
+                    )
+                    .otherwise(snowpark_fn.col(col_name))
+                    .alias(col_name)
+                )
+                modified_columns.append(modified_col)
+            elif (
+                isinstance(target_field.datatype, snowpark.types.StructType)
+                and source_field.datatype != target_field.datatype
+            ):
+                # Cast struct with field name mapping (e.g., col1,col2 -> i1,i2)
+                # This fixes INSERT INTO table with struct literals like (2, 3)
+                modified_col = (
+                    snowpark_fn.col(col_name)
+                    .cast(target_field.datatype, rename_fields=True)
+                    .alias(col_name)
+                )
+                modified_columns.append(modified_col)
+            else:
+                modified_columns.append(snowpark_fn.col(col_name))
+
+        df = df.select(modified_columns)
+    except Exception:
+        pass
+
+    queries = df.queries["queries"]
+    final_query = queries[0]
+    session.sql(
+        f"INSERT {overwrite_str} INTO {name} {cols_str} {final_query}",
+    ).collect()
+
+
 def _spark_field_to_sql(field: jpype.JObject, is_column: bool) -> str:
     # Column names will be uppercased according to "snowpark.connect.sql.identifiers.auto-uppercase"
     # if present, or to "spark.sql.caseSensitive".
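Beyond moving the old inline InsertIntoStatement logic into a helper, _insert_into_table now adds partition literals, an arity check, and a struct cast; the NaN handling itself is unchanged. A minimal standalone sketch of that NaN-to-NULL rewrite using plain Snowpark (the helper name nan_to_null and the column name are made up for illustration):

    import snowflake.snowpark.functions as snowpark_fn

    def nan_to_null(df, col_name):
        # Rebuild the column as CASE WHEN equal_nan(col) THEN NULL ELSE col END,
        # keeping its original name - the same expression the diff builds for
        # FLOAT/DOUBLE sources written into DECIMAL targets.
        return df.select(
            snowpark_fn.when(
                snowpark_fn.equal_nan(snowpark_fn.col(col_name)),
                snowpark_fn.lit(None),
            )
            .otherwise(snowpark_fn.col(col_name))
            .alias(col_name)
        )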
@@ -588,25 +758,48 @@ def map_sql_to_pandas_df(
                 f"CREATE TABLE {if_not_exists}{name} LIKE {source}"
             ).collect()
         case "CreateTempViewUsing":
+            parsed_sql = sqlglot.parse_one(sql_string, dialect="spark")
+
+            spark_view_name = next(parsed_sql.find_all(sqlglot.exp.Table)).name
+
+            # extract ONLY top-level column definitions (not nested struct fields)
+            column_defs = []
+            schema_node = next(parsed_sql.find_all(sqlglot.exp.Schema), None)
+            if schema_node:
+                for expr in schema_node.expressions:
+                    if isinstance(expr, sqlglot.exp.ColumnDef):
+                        column_defs.append(expr)
+
+            num_columns = len(column_defs)
+            if num_columns > 0:
+                null_list_parts = []
+                for col_def in column_defs:
+                    col_name = spark_to_sf_single_id(col_def.name, is_column=True)
+                    col_type = col_def.kind
+                    if col_type:
+                        null_list_parts.append(
+                            f"CAST(NULL AS {col_type.sql(dialect='snowflake')}) AS {col_name}"
+                        )
+                    else:
+                        null_list_parts.append(f"NULL AS {col_name}")
+                null_list = ", ".join(null_list_parts)
+            else:
+                null_list = "*"
+
             empty_select = (
-                " AS SELECT * WHERE 1 = 0"
+                f" AS SELECT {null_list} WHERE 1 = 0"
                 if logical_plan.options().isEmpty()
                 and logical_plan.children().isEmpty()
                 else ""
             )
-            parsed_sql = (
-                sqlglot.parse_one(sql_string, dialect="spark")
-                .transform(_normalize_identifiers)
+
+            transformed_sql = (
+                parsed_sql.transform(_normalize_identifiers)
                 .transform(_remove_column_data_type)
                 .transform(_remove_file_format_property)
             )
-            snowflake_sql = parsed_sql.sql(dialect="snowflake")
+            snowflake_sql = transformed_sql.sql(dialect="snowflake")
             session.sql(f"{snowflake_sql}{empty_select}").collect()
-            spark_view_name = next(
-                sqlglot.parse_one(sql_string, dialect="spark").find_all(
-                    sqlglot.exp.Table
-                )
-            ).name
             snowflake_view_name = spark_to_sf_single_id_with_unquoting(
                 spark_view_name
             )
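For CREATE TEMPORARY VIEW ... USING with no options and no query, the branch now emits one typed NULL per declared column instead of SELECT * WHERE 1 = 0, so the empty view keeps its declared schema. A hypothetical statement of that shape, issued through an assumed Spark Connect session named spark:

    spark.sql("CREATE TEMPORARY VIEW v (id INT, name STRING) USING csv")
    # The backing Snowflake view is created roughly as
    #   CREATE ... VIEW ... AS SELECT CAST(NULL AS <type>) AS <col>, ... WHERE 1 = 0
    # so each declared column survives with a concrete type.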
@@ -856,65 +1049,7 @@ def map_sql_to_pandas_df(
                 )
                 raise exception
         case "InsertIntoStatement":
-            df_container = execute_logical_plan(logical_plan.query())
-            df = df_container.dataframe
-            queries = df.queries["queries"]
-            if len(queries) != 1:
-                exception = SnowparkConnectNotImplementedError(
-                    f"Unexpected number of queries: {len(queries)}"
-                )
-                attach_custom_error_code(
-                    exception, ErrorCodes.UNSUPPORTED_OPERATION
-                )
-                raise exception
-
-            name = get_relation_identifier_name(logical_plan.table(), True)
-
-            user_columns = [
-                spark_to_sf_single_id(str(col), is_column=True)
-                for col in as_java_list(logical_plan.userSpecifiedCols())
-            ]
-            overwrite_str = "OVERWRITE" if logical_plan.overwrite() else ""
-            cols_str = "(" + ", ".join(user_columns) + ")" if user_columns else ""
-
-            try:
-                target_table = session.table(name)
-                target_schema = target_table.schema
-
-                # Modify df with NaN → NULL conversion for DECIMAL columns
-                modified_columns = []
-                for source_field, target_field in zip(
-                    df.schema.fields, target_schema.fields
-                ):
-                    col_name = source_field.name
-                    if isinstance(
-                        target_field.datatype, snowpark.types.DecimalType
-                    ) and isinstance(
-                        source_field.datatype,
-                        (snowpark.types.FloatType, snowpark.types.DoubleType),
-                    ):
-                        # Add CASE WHEN to convert NaN to NULL for DECIMAL targets
-                        # Only apply this to floating-point source columns
-                        modified_col = (
-                            snowpark_fn.when(
-                                snowpark_fn.equal_nan(snowpark_fn.col(col_name)),
-                                snowpark_fn.lit(None),
-                            )
-                            .otherwise(snowpark_fn.col(col_name))
-                            .alias(col_name)
-                        )
-                        modified_columns.append(modified_col)
-                    else:
-                        modified_columns.append(snowpark_fn.col(col_name))
-
-                df = df.select(modified_columns)
-            except Exception:
-                pass
-            queries = df.queries["queries"]
-            final_query = queries[0]
-            session.sql(
-                f"INSERT {overwrite_str} INTO {name} {cols_str} {final_query}",
-            ).collect()
+            _insert_into_table(logical_plan, session)
         case "MergeIntoTable":
             source_df_container = map_relation(
                 map_logical_plan_relation(logical_plan.sourceTable())
@@ -1345,7 +1480,7 @@ def map_sql_to_pandas_df(

             return pandas.DataFrame({"": [""]}), ""
         case "RepairTable":
-            # No-Op. Snowflake doesn't have explicit partitions to repair.
+            # No-Op: Snowflake doesn't have explicit partitions to repair.
             table_relation = logical_plan.child()
             db_and_table_name = as_java_list(table_relation.multipartIdentifier())
             multi_part_len = len(db_and_table_name)
@@ -1371,6 +1506,16 @@ def map_sql_to_pandas_df(
                 raise exception

             return pandas.DataFrame({"": [""]}), ""
+        case "UnresolvedWith":
+            child = logical_plan.child()
+            child_class = str(child.getClass().getSimpleName())
+            match child_class:
+                case "InsertIntoStatement":
+                    with _with_cte_scope(logical_plan.cteRelations()):
+                        _insert_into_table(child, get_or_create_snowpark_session())
+                case _:
+                    execute_logical_plan(logical_plan)
+            return None, None
         case _:
             execute_logical_plan(logical_plan)
             return None, None
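This branch handles a WITH clause wrapped around an INSERT, which previously fell through to the generic case above; the CTEs are registered via _with_cte_scope and the insert itself reuses _insert_into_table. A hypothetical statement of that shape (table and CTE names are made up), issued through an assumed Spark Connect session named spark:

    spark.sql("""
        WITH recent AS (
            SELECT id, amount FROM orders WHERE order_date >= '2024-01-01'
        )
        INSERT INTO order_summary
        SELECT id, amount FROM recent
    """)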
@@ -1598,7 +1743,19 @@ def map_logical_plan_relation(
             attr_parts = as_java_list(expr.nameParts())
             if len(attr_parts) == 1:
                 attr_name = str(attr_parts[0])
-                return alias_map.get(attr_name, expr)
+                if attr_name in alias_map:
+                    # Check if the alias references an aggregate function
+                    # If so, don't substitute because you can't GROUP BY an aggregate
+                    aliased_expr = alias_map[attr_name]
+                    aliased_expr_class = str(
+                        aliased_expr.getClass().getSimpleName()
+                    )
+                    if aliased_expr_class == "UnresolvedFunction":
+                        func_name = str(aliased_expr.nameParts().head())
+                        if is_aggregate_function(func_name):
+                            return expr
+                    return aliased_expr
+                return expr

             return expr

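The added guard matters when a SELECT alias shadows a real column: expanding the alias would put an aggregate inside GROUP BY, which is invalid, so the original attribute is kept and resolves to the underlying column. A hypothetical query of that shape (table and column names are made up), issued through an assumed Spark Connect session named spark:

    # GROUP BY price must mean the column price, not the alias for max(price);
    # is_aggregate_function("max") is what makes the substitution skip it.
    spark.sql("SELECT category, max(price) AS price FROM sales GROUP BY category, price")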
@@ -1837,13 +1994,104 @@ def map_logical_plan_relation(
                 )
             )
         case "Sort":
+            # Process the input first
+            input_proto = map_logical_plan_relation(rel.child())
+
+            # Check if child is a Project - if so, build an alias map for ORDER BY resolution
+            # This handles: SELECT o.date AS order_date ... ORDER BY o.date
+            child_class = str(rel.child().getClass().getSimpleName())
+            alias_map = {}
+
+            if child_class == "Project":
+                # Extract aliases from SELECT clause
+                for proj_expr in list(as_java_list(rel.child().projectList())):
+                    if str(proj_expr.getClass().getSimpleName()) == "Alias":
+                        alias_name = str(proj_expr.name())
+                        child_expr = proj_expr.child()
+
+                        # Store mapping from original expression to alias name
+                        # Use string representation for matching
+                        expr_str = str(child_expr)
+                        alias_map[expr_str] = alias_name
+
+                        # Also handle UnresolvedAttribute specifically to get the qualified name
+                        if (
+                            str(child_expr.getClass().getSimpleName())
+                            == "UnresolvedAttribute"
+                        ):
+                            # Get the qualified name like "o.date"
+                            name_parts = list(as_java_list(child_expr.nameParts()))
+                            qualified_name = ".".join(str(part) for part in name_parts)
+                            if qualified_name not in alias_map:
+                                alias_map[qualified_name] = alias_name
+
+            # Process ORDER BY expressions, substituting aliases where needed
+            order_list = []
+            for order_expr in as_java_list(rel.order()):
+                # Get the child expression from the SortOrder
+                child_expr = order_expr.child()
+                expr_class = str(child_expr.getClass().getSimpleName())
+
+                # Check if this expression matches any aliased expression
+                expr_str = str(child_expr)
+                substituted = False
+
+                if expr_str in alias_map:
+                    # Found a match - substitute with alias reference
+                    alias_name = alias_map[expr_str]
+                    # Create new UnresolvedAttribute for the alias
+                    UnresolvedAttribute = jpype.JClass(
+                        "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute"
+                    )
+                    new_attr = UnresolvedAttribute.quoted(alias_name)
+
+                    # Create new SortOrder with substituted expression
+                    SortOrder = jpype.JClass(
+                        "org.apache.spark.sql.catalyst.expressions.SortOrder"
+                    )
+                    new_order = SortOrder(
+                        new_attr,
+                        order_expr.direction(),
+                        order_expr.nullOrdering(),
+                        order_expr.sameOrderExpressions(),
+                    )
+                    order_list.append(map_logical_plan_expression(new_order).sort_order)
+                    substituted = True
+                elif expr_class == "UnresolvedAttribute":
+                    # Try matching on qualified name
+                    name_parts = list(as_java_list(child_expr.nameParts()))
+                    qualified_name = ".".join(str(part) for part in name_parts)
+                    if qualified_name in alias_map:
+                        alias_name = alias_map[qualified_name]
+                        UnresolvedAttribute = jpype.JClass(
+                            "org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute"
+                        )
+                        new_attr = UnresolvedAttribute.quoted(alias_name)
+
+                        SortOrder = jpype.JClass(
+                            "org.apache.spark.sql.catalyst.expressions.SortOrder"
+                        )
+                        new_order = SortOrder(
+                            new_attr,
+                            order_expr.direction(),
+                            order_expr.nullOrdering(),
+                            order_expr.sameOrderExpressions(),
+                        )
+                        order_list.append(
+                            map_logical_plan_expression(new_order).sort_order
+                        )
+                        substituted = True
+
+                if not substituted:
+                    # No substitution needed - use original
+                    order_list.append(
+                        map_logical_plan_expression(order_expr).sort_order
+                    )
+
             proto = relation_proto.Relation(
                 sort=relation_proto.Sort(
-                    input=map_logical_plan_relation(rel.child()),
-                    order=[
-                        map_logical_plan_expression(e).sort_order
-                        for e in as_java_list(rel.order())
-                    ],
+                    input=input_proto,
+                    order=order_list,
                 )
             )
         case "SubqueryAlias":
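The rewritten Sort case resolves ORDER BY expressions against SELECT-list aliases, per the inline comment (SELECT o.date AS order_date ... ORDER BY o.date). Filling in that example with made-up table names, through an assumed Spark Connect session named spark:

    spark.sql("SELECT o.date AS order_date, o.total FROM orders o ORDER BY o.date")
    # The sort expression o.date is replaced with an UnresolvedAttribute for the
    # alias order_date, since only the projected (aliased) column exists after the SELECT.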
@@ -2030,10 +2278,16 @@ def map_logical_plan_relation(
                 )

                 # Re-evaluate the CTE definition with a fresh plan_id
-                fresh_plan_id = gen_sql_plan_id()
-                fresh_cte_proto = map_logical_plan_relation(
-                    cte_definition, fresh_plan_id
-                )
+                # Clear HAVING condition to prevent leakage from outer CTEs
+                saved_having = _having_condition.get()
+                _having_condition.set(None)
+                try:
+                    fresh_plan_id = gen_sql_plan_id()
+                    fresh_cte_proto = map_logical_plan_relation(
+                        cte_definition, fresh_plan_id
+                    )
+                finally:
+                    _having_condition.set(saved_having)

                 # Use SubqueryColumnAliases to ensure consistent column names across CTE references
                 # This is crucial for CTEs that reference other CTEs
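The save/clear/restore dance around _having_condition now appears both here and in _process_cte_relations; the package inlines it each time. A generic standalone sketch of the pattern, assuming _having_condition is a ContextVar (the helper name cleared_having is made up and not part of the package):

    from contextlib import contextmanager
    from contextvars import ContextVar

    _having_condition: ContextVar = ContextVar("having_condition", default=None)

    @contextmanager
    def cleared_having():
        # Save the current value, clear it for the nested mapping, always restore it.
        saved = _having_condition.get()
        _having_condition.set(None)
        try:
            yield
        finally:
            _having_condition.set(saved)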
@@ -2188,16 +2442,7 @@ def map_logical_plan_relation(
                 ),
             )
         case "UnresolvedWith":
-            with _push_cte_scope():
-                for cte in as_java_list(rel.cteRelations()):
-                    name = str(cte._1())
-                    # Store the original CTE definition for re-evaluation
-                    _cte_definitions.get()[name] = cte._2()
-                    # Process CTE definition with a unique plan_id to ensure proper column naming
-                    cte_plan_id = gen_sql_plan_id()
-                    cte_proto = map_logical_plan_relation(cte._2(), cte_plan_id)
-                    _ctes.get()[name] = cte_proto
-
+            with _with_cte_scope(rel.cteRelations()):
                 proto = map_logical_plan_relation(rel.child())
         case "LateralJoin":
             left = map_logical_plan_relation(rel.left())
@@ -225,12 +225,20 @@ def _get_supported_read_file_format(unparsed_identifier: str) -> str | None:
     return None


+# TODO: [SNOW-2465948] Remove this once Snowpark fixes the issue with stage paths.
+class StagePathStr(str):
+    def partition(self, __sep):
+        if str(self)[0] == "'":
+            return str(self)[1:].partition(__sep)
+        return str(self).partition(__sep)
+
+
 def _quote_stage_path(stage_path: str) -> str:
     """
     Quote stage paths to escape any special characters.
     """
     if stage_path.startswith("@"):
-        return f"'{stage_path}'"
+        return StagePathStr(f"'{stage_path}'")
     return stage_path


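StagePathStr only changes how the quoted path responds to str.partition, presumably because Snowpark splits stage paths with partition() internally and would otherwise treat the leading quote as part of the stage name. A standalone illustration (the stage path is made up):

    class StagePathStr(str):  # copy of the class added above, for demonstration only
        def partition(self, __sep):
            if str(self)[0] == "'":
                return str(self)[1:].partition(__sep)
            return str(self).partition(__sep)

    p = StagePathStr("'@my_stage/data/part-0.csv'")
    print("'@my_stage/data/part-0.csv'".partition("/"))  # ("'@my_stage", '/', "data/part-0.csv'")
    print(p.partition("/"))                              # ('@my_stage', '/', "data/part-0.csv'")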
@@ -6,6 +6,7 @@ import copy
 from typing import Any

 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
+from pyspark.errors.exceptions.base import AnalysisException

 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
@@ -20,6 +21,7 @@ from snowflake.snowpark_connect.relation.read.metadata_utils import (
     get_non_metadata_fields,
 )
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
@@ -62,6 +64,8 @@ def map_read_csv(
     snowpark_read_options["INFER_SCHEMA"] = snowpark_options.get(
         "INFER_SCHEMA", False
     )
+
+    apply_metadata_exclusion_pattern(snowpark_options)
     snowpark_read_options["PATTERN"] = snowpark_options.get("PATTERN", None)

     raw_options = rel.read.data_source.options
@@ -157,6 +161,7 @@ def get_header_names(
     path: list[str],
     file_format_options: dict,
     snowpark_read_options: dict,
+    raw_options: dict,
 ) -> list[str]:
     no_header_file_format_options = copy.copy(file_format_options)
     no_header_file_format_options["PARSE_HEADER"] = False
@@ -168,7 +173,19 @@ def get_header_names(
     no_header_snowpark_read_options.pop("INFER_SCHEMA", None)

     header_df = session.read.options(no_header_snowpark_read_options).csv(path).limit(1)
-    header_data = header_df.collect()[0]
+    collected_data = header_df.collect()
+
+    if len(collected_data) == 0:
+        error_msg = f"Path does not exist or contains no data: {path}"
+        user_pattern = raw_options.get("pathGlobFilter", None)
+        if user_pattern:
+            error_msg += f" (with pathGlobFilter: {user_pattern})"
+
+        exception = AnalysisException(error_msg)
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
+
+    header_data = collected_data[0]
     return [
         f'"{header_data[i]}"'
         for i in range(len(header_df.schema.fields))
@@ -207,7 +224,7 @@ def read_data(
         return df

     headers = get_header_names(
-        session, path, file_format_options, snowpark_read_options
+        session, path, file_format_options, snowpark_read_options, raw_options
     )

     df_schema_fields = non_metadata_fields
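The empty-result check in get_header_names turns what used to be a bare IndexError (from indexing collect()[0] on an empty result) into an AnalysisException that also echoes any pathGlobFilter. A hypothetical reader call of the kind that can hit it (stage path and pattern are made up), through an assumed Spark Connect session named spark:

    df = (
        spark.read.option("header", True)
        .option("pathGlobFilter", "*.csv")
        .csv("@my_stage/empty_dir/")
    )
    # With no matching files, this now fails with roughly:
    #   AnalysisException: Path does not exist or contains no data: [...] (with pathGlobFilter: *.csv)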