snowpark-connect 0.26.0__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of snowpark-connect was flagged as potentially problematic by the registry's analysis (see the registry page for details).
- snowflake/snowpark_connect/column_name_handler.py +3 -93
- snowflake/snowpark_connect/config.py +99 -4
- snowflake/snowpark_connect/dataframe_container.py +0 -6
- snowflake/snowpark_connect/expression/map_expression.py +31 -1
- snowflake/snowpark_connect/expression/map_sql_expression.py +22 -18
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +22 -26
- snowflake/snowpark_connect/expression/map_unresolved_function.py +28 -10
- snowflake/snowpark_connect/expression/map_unresolved_star.py +2 -3
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/relation/map_extension.py +7 -1
- snowflake/snowpark_connect/relation/map_join.py +62 -258
- snowflake/snowpark_connect/relation/map_map_partitions.py +36 -77
- snowflake/snowpark_connect/relation/map_relation.py +8 -2
- snowflake/snowpark_connect/relation/map_show_string.py +2 -0
- snowflake/snowpark_connect/relation/map_sql.py +413 -15
- snowflake/snowpark_connect/relation/write/map_write.py +195 -114
- snowflake/snowpark_connect/resources_initializer.py +20 -5
- snowflake/snowpark_connect/server.py +20 -18
- snowflake/snowpark_connect/utils/artifacts.py +4 -5
- snowflake/snowpark_connect/utils/concurrent.py +4 -0
- snowflake/snowpark_connect/utils/context.py +41 -1
- snowflake/snowpark_connect/utils/describe_query_cache.py +57 -51
- snowflake/snowpark_connect/utils/identifiers.py +120 -0
- snowflake/snowpark_connect/utils/io_utils.py +21 -1
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +86 -2
- snowflake/snowpark_connect/utils/scala_udf_utils.py +34 -43
- snowflake/snowpark_connect/utils/session.py +16 -26
- snowflake/snowpark_connect/utils/telemetry.py +53 -0
- snowflake/snowpark_connect/utils/udf_utils.py +66 -103
- snowflake/snowpark_connect/utils/udtf_helper.py +17 -7
- snowflake/snowpark_connect/version.py +2 -3
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/RECORD +41 -42
- snowflake/snowpark_connect/hidden_column.py +0 -39
- {snowpark_connect-0.26.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.26.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.26.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.26.0.dist-info → snowpark_connect-0.28.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/map_show_string.py

@@ -12,6 +12,7 @@ from snowflake.snowpark._internal.analyzer import analyzer_utils
 from snowflake.snowpark.functions import col
 from snowflake.snowpark.types import DateType, StringType, StructField, StructType
 from snowflake.snowpark_connect.column_name_handler import set_schema_getter
+from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 
@@ -33,6 +34,7 @@ def map_show_string(rel: relation_proto.Relation) -> pandas.DataFrame:
         truncate=rel.show_string.truncate,
         vertical=rel.show_string.vertical,
         _spark_column_names=input_df_container.column_map.get_spark_columns(),
+        _spark_session_tz=global_config.spark_sql_session_timeZone,
     )
     return pandas.DataFrame({"show_string": [show_string]})
 
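Note (not part of the diff): the new _spark_session_tz argument threads the Spark session time zone into show-string rendering. A minimal usage sketch, assuming an already-created snowpark-connect Spark session named spark (the column name ts is invented):

    from pyspark.sql.functions import current_timestamp

    # Standard Spark SQL setting; the diff reads it via global_config.spark_sql_session_timeZone.
    spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
    df = spark.range(1).select(current_timestamp().alias("ts"))
    df.show()  # timestamp values are rendered in the configured session time zone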
snowflake/snowpark_connect/relation/map_sql.py

@@ -4,7 +4,7 @@
 
 import re
 from collections.abc import MutableMapping, MutableSequence
-from contextlib import contextmanager
+from contextlib import contextmanager, suppress
 from contextvars import ContextVar
 from functools import reduce
 
@@ -30,10 +30,13 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
 )
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 from snowflake.snowpark._internal.utils import is_sql_select_statement, quote_name
+from snowflake.snowpark.functions import when_matched, when_not_matched
 from snowflake.snowpark_connect.config import (
     auto_uppercase_non_column_identifiers,
+    check_table_supports_operation,
     get_boolean_session_config_param,
     global_config,
+    record_table_metadata,
     set_config_param,
     unset_config_param,
 )
@@ -56,8 +59,10 @@ from snowflake.snowpark_connect.utils.context import (
     _accessing_temp_object,
     gen_sql_plan_id,
     get_session_id,
+    get_sql_plan,
     push_evaluating_sql_scope,
     push_sql_scope,
+    set_plan_id_map,
     set_sql_args,
     set_sql_plan_name,
 )
@@ -67,6 +72,7 @@ from snowflake.snowpark_connect.utils.telemetry import (
     telemetry,
 )
 
+from .. import column_name_handler
 from ..expression.map_sql_expression import (
     _window_specs,
     as_java_list,
@@ -202,6 +208,9 @@ def _rename_columns(
 def _create_table_as_select(logical_plan, mode: str) -> None:
     # TODO: for as select create tables we'd map multi layer identifier here
     name = get_relation_identifier_name(logical_plan.name())
+    full_table_identifier = get_relation_identifier_name(
+        logical_plan.name(), is_multi_part=True
+    )
     comment = logical_plan.tableSpec().comment()
 
     container = execute_logical_plan(logical_plan.query())
@@ -222,6 +231,15 @@ def _create_table_as_select(logical_plan, mode: str) -> None:
         mode=mode,
     )
 
+    # Record table metadata for CREATE TABLE AS SELECT
+    # These are typically considered v2 tables and support RENAME COLUMN
+    record_table_metadata(
+        table_identifier=full_table_identifier,
+        table_type="v2",
+        data_source="default",
+        supports_column_rename=True,
+    )
+
 
 def _spark_field_to_sql(field: jpype.JObject, is_column: bool) -> str:
     # Column names will be uppercased according to "snowpark.connect.sql.identifiers.auto-uppercase",
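Note (not part of the diff): record_table_metadata and check_table_supports_operation are new helpers imported from snowflake/snowpark_connect/config.py, which grows by 99 lines in this release. Their implementation is not shown in this section; the sketch below is only an assumption of the shape such a registry could take, inferred from the call sites in map_sql.py.

    # Hypothetical sketch; the real config.py implementation may differ.
    _table_metadata: dict = {}

    def record_table_metadata(
        table_identifier: str,
        table_type: str,
        data_source: str,
        supports_column_rename: bool,
    ) -> None:
        # Remember how a table was created so later ALTER TABLE handling can
        # decide whether an operation is allowed for Spark compatibility.
        _table_metadata[table_identifier] = {
            "table_type": table_type,
            "data_source": data_source,
            "supports_column_rename": supports_column_rename,
        }

    def check_table_supports_operation(table_identifier: str, operation: str) -> bool:
        # Unknown tables are not restricted; known v1 tables may reject renames.
        meta = _table_metadata.get(table_identifier)
        if meta is None:
            return True
        if operation == "rename_column":
            return meta["supports_column_rename"]
        return True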
@@ -299,6 +317,65 @@ def _remove_column_data_type(node):
     return node
 
 
+def _get_condition_from_action(action, column_mapping, typer):
+    condition = None
+    if action.condition().isDefined():
+        (_, condition_typed_col,) = map_single_column_expression(
+            map_logical_plan_expression(action.condition().get()),
+            column_mapping,
+            typer,
+        )
+        condition = condition_typed_col.col
+    return condition
+
+
+def _get_assignments_from_action(
+    action,
+    column_mapping_source,
+    column_mapping_target,
+    typer_source,
+    typer_target,
+):
+    assignments = dict()
+    if (
+        action.getClass().getSimpleName() == "InsertAction"
+        or action.getClass().getSimpleName() == "UpdateAction"
+    ):
+        incoming_assignments = as_java_list(action.assignments())
+        for assignment in incoming_assignments:
+            (_, key_typ_col) = map_single_column_expression(
+                map_logical_plan_expression(assignment.key()),
+                column_mapping=column_mapping_target,
+                typer=typer_target,
+            )
+            key_name = typer_target.df.select(key_typ_col.col).columns[0]
+
+            (_, val_typ_col) = map_single_column_expression(
+                map_logical_plan_expression(assignment.value()),
+                column_mapping=column_mapping_source,
+                typer=typer_source,
+            )
+
+            assignments[key_name] = val_typ_col.col
+    elif (
+        action.getClass().getSimpleName() == "InsertStarAction"
+        or action.getClass().getSimpleName() == "UpdateStarAction"
+    ):
+        if len(column_mapping_source.columns) != len(column_mapping_target.columns):
+            raise ValueError(
+                "source and target must have the same number of columns for InsertStarAction or UpdateStarAction"
+            )
+        for i, col in enumerate(column_mapping_target.columns):
+            if assignments.get(col.snowpark_name) is not None:
+                raise SnowparkConnectNotImplementedError(
+                    "UpdateStarAction or InsertStarAction is not supported with duplicate columns."
+                )
+            assignments[col.snowpark_name] = snowpark_fn.col(
+                column_mapping_source.columns[i].snowpark_name
+            )
+    return assignments
+
+
 def map_sql_to_pandas_df(
     sql_string: str,
     named_args: MutableMapping[str, expressions_proto.Expression.Literal],
@@ -420,6 +497,9 @@ def map_sql_to_pandas_df(
             )
 
             name = get_relation_identifier_name(logical_plan.name())
+            full_table_identifier = get_relation_identifier_name(
+                logical_plan.name(), is_multi_part=True
+            )
             columns = ", ".join(
                 _spark_field_to_sql(f, True)
                 for f in logical_plan.tableSchema().fields()
@@ -430,10 +510,48 @@ def map_sql_to_pandas_df(
                 if comment_opt.isDefined()
                 else ""
             )
+
+            # Extract data source for metadata tracking
+            data_source = "default"
+
+            with suppress(Exception):
+                # Get data source from tableSpec.provider() (for USING clause)
+                if hasattr(logical_plan, "tableSpec"):
+                    table_spec = logical_plan.tableSpec()
+                    if hasattr(table_spec, "provider"):
+                        provider_opt = table_spec.provider()
+                        if provider_opt.isDefined():
+                            data_source = str(provider_opt.get()).lower()
+                        else:
+                            # Fall back to checking properties for FORMAT
+                            table_properties = table_spec.properties()
+                            if not table_properties.isEmpty():
+                                for prop in table_properties.get():
+                                    if str(prop.key()) == "FORMAT":
+                                        data_source = str(prop.value()).lower()
+                                        break
+
             # NOTE: We are intentionally ignoring any FORMAT=... parameters here.
             session.sql(
                 f"CREATE {replace_table} TABLE {if_not_exists}{name} ({columns}) {comment}"
             ).collect()
+
+            # Record table metadata for Spark compatibility
+            # Tables created with explicit schema are considered v1 tables
+            # v1 tables with certain data sources don't support RENAME COLUMN in OSS Spark
+            supports_rename = data_source not in (
+                "parquet",
+                "csv",
+                "json",
+                "orc",
+                "avro",
+            )
+            record_table_metadata(
+                table_identifier=full_table_identifier,
+                table_type="v1",
+                data_source=data_source,
+                supports_column_rename=supports_rename,
+            )
         case "CreateTableAsSelect":
             mode = "ignore" if logical_plan.ignoreIfExists() else "errorifexists"
             _create_table_as_select(logical_plan, mode=mode)
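Note (not part of the diff): the data-source detection above only affects the recorded metadata; the CREATE TABLE statement sent to Snowflake is unchanged. Illustrative statements (table and column names are invented), assuming a snowpark-connect session named spark:

    # Explicit schema plus a file-based USING clause is recorded as a v1 table
    # whose columns cannot be renamed later (matching OSS Spark behavior):
    spark.sql("CREATE TABLE sales (id INT, amount DOUBLE) USING parquet")

    # CREATE TABLE AS SELECT is recorded as a v2 table that keeps rename support:
    spark.sql("CREATE TABLE sales_copy AS SELECT * FROM sales")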
@@ -727,15 +845,147 @@ def map_sql_to_pandas_df(
                 f"INSERT {overwrite_str} INTO {name} {cols_str} {final_query}",
             ).collect()
         case "MergeIntoTable":
-            …
-            …
-                + "Reason: This command is a platform-specific SQL extension and is not part of the standard Apache Spark specification that this interface uses."
+            source_df_container = map_relation(
+                map_logical_plan_relation(logical_plan.sourceTable())
             )
+            source_df = source_df_container.dataframe
+            plan_id = gen_sql_plan_id()
+            target_df_container = map_relation(
+                map_logical_plan_relation(logical_plan.targetTable(), plan_id)
+            )
+            target_df = target_df_container.dataframe
+
+            if (
+                logical_plan.targetTable().getClass().getSimpleName()
+                == "UnresolvedRelation"
+            ):
+                target_table_name = _spark_to_snowflake(
+                    logical_plan.targetTable().multipartIdentifier()
+                )
+            else:
+                target_table_name = _spark_to_snowflake(
+                    logical_plan.targetTable().child().multipartIdentifier()
+                )
+
+            target_table = session.table(target_table_name)
+            target_table_columns = target_table.columns
+            target_df_spark_names = []
+            for target_table_col, target_df_col in zip(
+                target_table_columns, target_df_container.column_map.columns
+            ):
+                target_df = target_df.with_column_renamed(
+                    target_df_col.snowpark_name,
+                    target_table_col,
+                )
+                target_df_spark_names.append(target_df_col.spark_name)
+            target_df_container = DataFrameContainer.create_with_column_mapping(
+                dataframe=target_df,
+                spark_column_names=target_df_spark_names,
+                snowpark_column_names=target_table_columns,
+            )
+
+            set_plan_id_map(plan_id, target_df_container)
+
+            joined_df_before_condition: snowpark.DataFrame = source_df.join(
+                target_df
+            )
+
+            column_mapping_for_conditions = column_name_handler.JoinColumnNameMap(
+                source_df_container.column_map,
+                target_df_container.column_map,
+            )
+            typer_for_expressions = ExpressionTyper(joined_df_before_condition)
+
+            (_, merge_condition_typed_col,) = map_single_column_expression(
+                map_logical_plan_expression(logical_plan.mergeCondition()),
+                column_mapping=column_mapping_for_conditions,
+                typer=typer_for_expressions,
+            )
+
+            clauses = []
+
+            for matched_action in as_java_list(logical_plan.matchedActions()):
+                condition = _get_condition_from_action(
+                    matched_action,
+                    column_mapping_for_conditions,
+                    typer_for_expressions,
+                )
+                if matched_action.getClass().getSimpleName() == "DeleteAction":
+                    clauses.append(when_matched(condition).delete())
+                elif (
+                    matched_action.getClass().getSimpleName() == "UpdateAction"
+                    or matched_action.getClass().getSimpleName()
+                    == "UpdateStarAction"
+                ):
+                    assignments = _get_assignments_from_action(
+                        matched_action,
+                        source_df_container.column_map,
+                        target_df_container.column_map,
+                        ExpressionTyper(source_df),
+                        ExpressionTyper(target_df),
+                    )
+                    clauses.append(when_matched(condition).update(assignments))
+
+            for not_matched_action in as_java_list(
+                logical_plan.notMatchedActions()
+            ):
+                condition = _get_condition_from_action(
+                    not_matched_action,
+                    column_mapping_for_conditions,
+                    typer_for_expressions,
+                )
+                if (
+                    not_matched_action.getClass().getSimpleName() == "InsertAction"
+                    or not_matched_action.getClass().getSimpleName()
+                    == "InsertStarAction"
+                ):
+                    assignments = _get_assignments_from_action(
+                        not_matched_action,
+                        source_df_container.column_map,
+                        target_df_container.column_map,
+                        ExpressionTyper(source_df),
+                        ExpressionTyper(target_df),
+                    )
+                    clauses.append(when_not_matched(condition).insert(assignments))
+
+            if not as_java_list(logical_plan.notMatchedBySourceActions()).isEmpty():
+                raise SnowparkConnectNotImplementedError(
+                    "Snowflake does not support 'not matched by source' actions in MERGE statements."
+                )
+
+            target_table.merge(source_df, merge_condition_typed_col.col, clauses)
         case "DeleteFromTable":
-            …
-            …
-            …
+            df_container = map_relation(
+                map_logical_plan_relation(logical_plan.table())
+            )
+            name = get_relation_identifier_name(logical_plan.table(), True)
+            table = session.table(name)
+            table_columns = table.columns
+            df = df_container.dataframe
+            spark_names = []
+            for table_col, df_col in zip(
+                table_columns, df_container.column_map.columns
+            ):
+                df = df.with_column_renamed(
+                    df_col.snowpark_name,
+                    table_col,
+                )
+                spark_names.append(df_col.spark_name)
+            df_container = DataFrameContainer.create_with_column_mapping(
+                dataframe=df,
+                spark_column_names=spark_names,
+                snowpark_column_names=table_columns,
+            )
+            df = df_container.dataframe
+            (
+                condition_column_name,
+                condition_typed_col,
+            ) = map_single_column_expression(
+                map_logical_plan_expression(logical_plan.condition()),
+                df_container.column_map,
+                ExpressionTyper(df),
             )
+            table.delete(condition_typed_col.col)
         case "UpdateTable":
             # Databricks/Delta-specific extension not supported by SAS.
             # Provide an actionable, clear error.
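Note (not part of the diff): the effect of this hunk is that MERGE INTO no longer raises a not-implemented error; it is translated into a Snowpark Table.merge call built from when_matched/when_not_matched clauses, while WHEN NOT MATCHED BY SOURCE clauses still raise SnowparkConnectNotImplementedError. A statement of the kind this path now handles (invented table and column names):

    spark.sql("""
        MERGE INTO target t
        USING source s
        ON t.id = s.id
        WHEN MATCHED AND s.op = 'D' THEN DELETE
        WHEN MATCHED THEN UPDATE SET t.amount = s.amount
        WHEN NOT MATCHED THEN INSERT (id, amount) VALUES (s.id, s.amount)
    """)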
@@ -744,7 +994,20 @@ def map_sql_to_pandas_df(
                 + "Reason: This command is a platform-specific SQL extension and is not part of the standard Apache Spark specification that this interface uses."
             )
         case "RenameColumn":
-            …
+            full_table_identifier = get_relation_identifier_name(
+                logical_plan.table(), True
+            )
+
+            # Check Spark compatibility for RENAME COLUMN operation
+            if not check_table_supports_operation(
+                full_table_identifier, "rename_column"
+            ):
+                raise AnalysisException(
+                    f"ALTER TABLE RENAME COLUMN is not supported for table '{full_table_identifier}'. "
+                    f"This table was created as a v1 table with a data source that doesn't support column renaming. "
+                    f"To enable this operation, set 'enable_snowflake_extension_behavior' to 'true'."
+                )
+
             column_obj = logical_plan.column()
             old_column_name = ".".join(
                 spark_to_sf_single_id(str(part), is_column=True)
@@ -754,7 +1017,7 @@ def map_sql_to_pandas_df(
             case_insensitive_name = next(
                 (
                     f.name
-                    for f in session.table(
+                    for f in session.table(full_table_identifier).schema.fields
                     if f.name.lower() == old_column_name.lower()
                 ),
                 None,
@@ -766,7 +1029,7 @@ def map_sql_to_pandas_df(
             )
 
             # Pass through to Snowflake
-            snowflake_sql = f"ALTER TABLE {
+            snowflake_sql = f"ALTER TABLE {full_table_identifier} RENAME COLUMN {old_column_name} TO {new_column_name}"
             session.sql(snowflake_sql).collect()
         case "RenameTable":
             name = get_relation_identifier_name(logical_plan.child(), True)
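Note (not part of the diff): combined with the table metadata recorded at CREATE TABLE time, the rename path now behaves roughly as follows (invented names):

    spark.sql("CREATE TABLE t1 (c1 INT) USING parquet")   # recorded as v1, rename not supported
    spark.sql("ALTER TABLE t1 RENAME COLUMN c1 TO c2")    # raises AnalysisException

    spark.sql("CREATE TABLE t2 AS SELECT 1 AS c1")        # recorded as v2, rename supported
    spark.sql("ALTER TABLE t2 RENAME COLUMN c1 TO c2")    # passes through to Snowflake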
@@ -997,6 +1260,90 @@ def get_sql_passthrough() -> bool:
     return get_boolean_session_config_param("snowpark.connect.sql.passthrough")
 
 
+def change_default_to_public(name: str) -> str:
+    """
+    Change the namespace to PUBLIC when given name is DEFAULT
+    :param name: Given namespace
+    :return: if name is DEFAULT return PUBLIC otherwise name
+    """
+    if name.startswith('"'):
+        if name.upper() == '"DEFAULT"':
+            return name.replace("DEFAULT", "PUBLIC")
+    elif name.upper() == "DEFAULT":
+        return "PUBLIC"
+    return name
+
+
+def _preprocess_identifier_calls(sql_query: str) -> str:
+    """
+    Pre-process SQL query to resolve IDENTIFIER() calls before Spark parsing.
+
+    Transforms: IDENTIFIER('abs')(c2) -> abs(c2)
+    Transforms: IDENTIFIER('COAL' || 'ESCE')(NULL, 1) -> COALESCE(NULL, 1)
+
+    This preserves all function arguments in their original positions, eliminating
+    the need to reconstruct them at the expression level.
+    """
+    import re
+
+    # Pattern to match IDENTIFIER(...) followed by optional function call arguments
+    # This captures both the identifier expression and any trailing arguments
+    # Note: We need to be careful about whitespace preservation
+    identifier_pattern = r"IDENTIFIER\s*\(\s*([^)]+)\s*\)(\s*)(\([^)]*\))?"
+
+    def resolve_identifier_match(match):
+        identifier_expr_str = match.group(1).strip()
+        whitespace = match.group(2) if match.group(2) else ""
+        function_args = match.group(3) if match.group(3) else ""
+
+        try:
+            # Handle string concatenation FIRST: IDENTIFIER('COAL' || 'ESCE')
+            # (Must check this before simple strings since it also starts/ends with quotes)
+            if "||" in identifier_expr_str:
+                # Parse basic string concatenation with proper quote handling
+                parts = []
+                split_parts = identifier_expr_str.split("||")
+                for part in split_parts:
+                    part = part.strip()
+                    if part.startswith("'") and part.endswith("'"):
+                        unquoted = part[1:-1]  # Remove quotes from each part
+                        parts.append(unquoted)
+                    else:
+                        # Non-string parts - return original for safety
+                        return match.group(0)
+                resolved_name = "".join(parts)  # Concatenate the unquoted parts
+
+            # Handle simple string literals: IDENTIFIER('abs')
+            elif identifier_expr_str.startswith("'") and identifier_expr_str.endswith(
+                "'"
+            ):
+                resolved_name = identifier_expr_str[1:-1]  # Remove quotes
+
+            else:
+                # Complex expressions not supported yet - return original
+                return match.group(0)
+
+            # Return resolved function call with preserved arguments and whitespace
+            if function_args:
+                # Function call case: IDENTIFIER('abs')(c1) -> abs(c1)
+                result = f"{resolved_name}{function_args}"
+            else:
+                # Column reference case: IDENTIFIER('c1') FROM -> c1 FROM (preserve whitespace)
+                result = f"{resolved_name}{whitespace}"
+            return result
+
+        except Exception:
+            # Return original to avoid breaking the query
+            return match.group(0)
+
+    # Apply the transformation
+    processed_query = re.sub(
+        identifier_pattern, resolve_identifier_match, sql_query, flags=re.IGNORECASE
+    )
+
+    return processed_query
+
+
 def map_sql(
     rel: relation_proto.Relation,
 ) -> DataFrameContainer:
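Note (not part of the diff): _preprocess_identifier_calls rewrites the SQL text before it reaches the Spark parser; the examples below follow directly from its docstring and the regex shown above:

    _preprocess_identifier_calls("SELECT IDENTIFIER('abs')(c2) FROM t")
    # -> "SELECT abs(c2) FROM t"

    _preprocess_identifier_calls("SELECT IDENTIFIER('COAL' || 'ESCE')(NULL, 1)")
    # -> "SELECT COALESCE(NULL, 1)"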
@@ -1008,7 +1355,6 @@ def map_sql(
     In passthough mode as True, SAS calls session.sql() and not calling Spark Parser.
     This is to mitigate any issue not covered by spark logical plan to protobuf conversion.
     """
-
     snowpark_connect_sql_passthrough = get_sql_passthrough()
 
     if not snowpark_connect_sql_passthrough:
@@ -1353,6 +1699,7 @@ def map_logical_plan_relation(
                 left_input=map_logical_plan_relation(children[0]),
                 right_input=map_logical_plan_relation(children[1]),
                 set_op_type=relation_proto.SetOperation.SET_OP_TYPE_UNION,
+                is_all=True,
                 by_name=rel.byName(),
                 allow_missing_columns=rel.allowMissingCol(),
             )
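Note (not part of the diff): is_all=True matches the semantics of Spark's logical Union node, which keeps duplicate rows; deduplication only happens when the plan wraps the union in a Distinct. A quick illustration with the DataFrame API (invented data):

    df1 = spark.createDataFrame([(1, "a")], ["id", "v"])
    df2 = spark.createDataFrame([("a", 1)], ["v", "id"])
    # Columns are matched by name and duplicate rows are kept (UNION ALL semantics).
    df1.unionByName(df2).count()  # 2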
@@ -1701,7 +2048,50 @@ def map_logical_plan_relation(
             _window_specs.get()[key] = window_spec
             proto = map_logical_plan_relation(rel.child())
         case "Generate":
-            …
+            # Generate creates a nested Project relation (see lines 1785-1790) without
+            # setting its plan_id field. When this Project is later processed by map_project
+            # (map_column_ops.py), it uses rel.common.plan_id which defaults to 0 for unset
+            # protobuf fields. This means all columns from the Generate operation (both exploded
+            # columns and passthrough columns) will have plan_id=0 in their names.
+            #
+            # If Generate's child is a SubqueryAlias whose inner relation was processed
+            # with a non-zero plan_id, there will be a mismatch between:
+            # - The columns referenced in the Project (expecting plan_id from SubqueryAlias's child)
+            # - The actual column names created by Generate's Project (using plan_id=0)
+
+            # Therefore, when Generate has a SubqueryAlias child, we explicitly process the inner
+            # relation with plan_id=0 to match what Generate's Project will use. This only applies when
+            # the immediate child of Generate is a SubqueryAlias and preserves existing registrations (like CTEs),
+            # so it won't affect other patterns.
+
+            child_class = str(rel.child().getClass().getSimpleName())
+
+            if child_class == "SubqueryAlias":
+                alias = str(rel.child().alias())
+
+                # Check if this alias was already registered during initial SQL parsing
+                existing_plan_id = get_sql_plan(alias)
+
+                if existing_plan_id is not None:
+                    # Use the existing plan_id to maintain consistency with prior registration
+                    used_plan_id = existing_plan_id
+                else:
+                    # Use plan_id=0 to match what the nested Project will use (protobuf default)
+                    used_plan_id = 0
+                    set_sql_plan_name(alias, used_plan_id)
+
+                # Process the inner child with the determined plan_id
+                inner_child = map_logical_plan_relation(
+                    rel.child().child(), plan_id=used_plan_id
+                )
+                input_relation = relation_proto.Relation(
+                    subquery_alias=relation_proto.SubqueryAlias(
+                        input=inner_child,
+                        alias=alias,
+                    )
+                )
+            else:
+                input_relation = map_logical_plan_relation(rel.child())
             generator_output_list = as_java_list(rel.generatorOutput())
             generator_output_list_expressions = [
                 map_logical_plan_expression(e) for e in generator_output_list
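Note (not part of the diff): the pattern this guards is a generator (for example explode) applied on top of an aliased subquery, roughly of the following shape (invented names):

    spark.sql("""
        SELECT t.id, x
        FROM (SELECT id, arr FROM base_table) AS t
        LATERAL VIEW explode(t.arr) ex AS x
    """)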
@@ -1784,8 +2174,11 @@ def map_logical_plan_relation(
 
 
 def get_relation_identifier_name(name_obj, is_multi_part: bool = False) -> str:
-    if name_obj.getClass().getSimpleName()
-        …
+    if name_obj.getClass().getSimpleName() in (
+        "PlanWithUnresolvedIdentifier",
+        "ExpressionWithUnresolvedIdentifier",
+    ):
+        # IDENTIFIER(<table_name>), or IDENTIFIER(<method name>)
         expr_proto = map_logical_plan_expression(name_obj.identifierExpr())
         session = snowpark.Session.get_active_session()
         m = ColumnNameMap([], [], None)
@@ -1797,7 +2190,12 @@ def get_relation_identifier_name(name_obj, is_multi_part: bool = False) -> str:
         )
     else:
         if is_multi_part:
-            …
+            try:
+                # Try multipartIdentifier first for full catalog.database.table
+                name = _spark_to_snowflake(name_obj.multipartIdentifier())
+            except AttributeError:
+                # Fallback to nameParts if multipartIdentifier not available
+                name = _spark_to_snowflake(name_obj.nameParts())
         else:
             name = _spark_to_snowflake(name_obj.nameParts())
 