snowpark-connect 0.27.0__py3-none-any.whl → 0.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/column_name_handler.py +3 -93
- snowflake/snowpark_connect/config.py +99 -1
- snowflake/snowpark_connect/dataframe_container.py +0 -6
- snowflake/snowpark_connect/expression/map_expression.py +22 -7
- snowflake/snowpark_connect/expression/map_sql_expression.py +22 -18
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +4 -26
- snowflake/snowpark_connect/expression/map_unresolved_function.py +12 -3
- snowflake/snowpark_connect/expression/map_unresolved_star.py +2 -3
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/relation/map_extension.py +14 -10
- snowflake/snowpark_connect/relation/map_join.py +62 -258
- snowflake/snowpark_connect/relation/map_relation.py +5 -1
- snowflake/snowpark_connect/relation/map_sql.py +353 -16
- snowflake/snowpark_connect/relation/write/map_write.py +171 -110
- snowflake/snowpark_connect/resources_initializer.py +20 -5
- snowflake/snowpark_connect/server.py +16 -17
- snowflake/snowpark_connect/utils/concurrent.py +4 -0
- snowflake/snowpark_connect/utils/describe_query_cache.py +57 -51
- snowflake/snowpark_connect/utils/identifiers.py +120 -0
- snowflake/snowpark_connect/utils/io_utils.py +21 -1
- snowflake/snowpark_connect/utils/scala_udf_utils.py +34 -43
- snowflake/snowpark_connect/utils/session.py +16 -26
- snowflake/snowpark_connect/utils/telemetry.py +53 -0
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/RECORD +34 -35
- snowflake/snowpark_connect/hidden_column.py +0 -39
- {snowpark_connect-0.27.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-0.28.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-0.28.0.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,7 @@
 
 import re
 from collections.abc import MutableMapping, MutableSequence
-from contextlib import contextmanager
+from contextlib import contextmanager, suppress
 from contextvars import ContextVar
 from functools import reduce
 
@@ -30,10 +30,13 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
 )
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
 from snowflake.snowpark._internal.utils import is_sql_select_statement, quote_name
+from snowflake.snowpark.functions import when_matched, when_not_matched
 from snowflake.snowpark_connect.config import (
     auto_uppercase_non_column_identifiers,
+    check_table_supports_operation,
     get_boolean_session_config_param,
     global_config,
+    record_table_metadata,
     set_config_param,
     unset_config_param,
 )
@@ -59,6 +62,7 @@ from snowflake.snowpark_connect.utils.context import (
     get_sql_plan,
     push_evaluating_sql_scope,
     push_sql_scope,
+    set_plan_id_map,
     set_sql_args,
     set_sql_plan_name,
 )
@@ -68,6 +72,7 @@ from snowflake.snowpark_connect.utils.telemetry import (
     telemetry,
 )
 
+from .. import column_name_handler
 from ..expression.map_sql_expression import (
     _window_specs,
     as_java_list,
@@ -203,6 +208,9 @@ def _rename_columns(
 def _create_table_as_select(logical_plan, mode: str) -> None:
     # TODO: for as select create tables we'd map multi layer identifier here
     name = get_relation_identifier_name(logical_plan.name())
+    full_table_identifier = get_relation_identifier_name(
+        logical_plan.name(), is_multi_part=True
+    )
     comment = logical_plan.tableSpec().comment()
 
     container = execute_logical_plan(logical_plan.query())
@@ -223,6 +231,15 @@ def _create_table_as_select(logical_plan, mode: str) -> None:
         mode=mode,
     )
 
+    # Record table metadata for CREATE TABLE AS SELECT
+    # These are typically considered v2 tables and support RENAME COLUMN
+    record_table_metadata(
+        table_identifier=full_table_identifier,
+        table_type="v2",
+        data_source="default",
+        supports_column_rename=True,
+    )
+
 
 def _spark_field_to_sql(field: jpype.JObject, is_column: bool) -> str:
     # Column names will be uppercased according to "snowpark.connect.sql.identifiers.auto-uppercase",
@@ -300,6 +317,65 @@ def _remove_column_data_type(node):
     return node
 
 
+def _get_condition_from_action(action, column_mapping, typer):
+    condition = None
+    if action.condition().isDefined():
+        (_, condition_typed_col,) = map_single_column_expression(
+            map_logical_plan_expression(action.condition().get()),
+            column_mapping,
+            typer,
+        )
+        condition = condition_typed_col.col
+    return condition
+
+
+def _get_assignments_from_action(
+    action,
+    column_mapping_source,
+    column_mapping_target,
+    typer_source,
+    typer_target,
+):
+    assignments = dict()
+    if (
+        action.getClass().getSimpleName() == "InsertAction"
+        or action.getClass().getSimpleName() == "UpdateAction"
+    ):
+        incoming_assignments = as_java_list(action.assignments())
+        for assignment in incoming_assignments:
+            (_, key_typ_col) = map_single_column_expression(
+                map_logical_plan_expression(assignment.key()),
+                column_mapping=column_mapping_target,
+                typer=typer_target,
+            )
+            key_name = typer_target.df.select(key_typ_col.col).columns[0]
+
+            (_, val_typ_col) = map_single_column_expression(
+                map_logical_plan_expression(assignment.value()),
+                column_mapping=column_mapping_source,
+                typer=typer_source,
+            )
+
+            assignments[key_name] = val_typ_col.col
+    elif (
+        action.getClass().getSimpleName() == "InsertStarAction"
+        or action.getClass().getSimpleName() == "UpdateStarAction"
+    ):
+        if len(column_mapping_source.columns) != len(column_mapping_target.columns):
+            raise ValueError(
+                "source and target must have the same number of columns for InsertStarAction or UpdateStarAction"
+            )
+        for i, col in enumerate(column_mapping_target.columns):
+            if assignments.get(col.snowpark_name) is not None:
+                raise SnowparkConnectNotImplementedError(
+                    "UpdateStarAction or InsertStarAction is not supported with duplicate columns."
+                )
+            assignments[col.snowpark_name] = snowpark_fn.col(
+                column_mapping_source.columns[i].snowpark_name
+            )
+    return assignments
+
+
 def map_sql_to_pandas_df(
     sql_string: str,
     named_args: MutableMapping[str, expressions_proto.Expression.Literal],
@@ -421,6 +497,9 @@ def map_sql_to_pandas_df(
             )
 
             name = get_relation_identifier_name(logical_plan.name())
+            full_table_identifier = get_relation_identifier_name(
+                logical_plan.name(), is_multi_part=True
+            )
             columns = ", ".join(
                 _spark_field_to_sql(f, True)
                 for f in logical_plan.tableSchema().fields()
@@ -431,10 +510,48 @@ def map_sql_to_pandas_df(
                 if comment_opt.isDefined()
                 else ""
             )
+
+            # Extract data source for metadata tracking
+            data_source = "default"
+
+            with suppress(Exception):
+                # Get data source from tableSpec.provider() (for USING clause)
+                if hasattr(logical_plan, "tableSpec"):
+                    table_spec = logical_plan.tableSpec()
+                    if hasattr(table_spec, "provider"):
+                        provider_opt = table_spec.provider()
+                        if provider_opt.isDefined():
+                            data_source = str(provider_opt.get()).lower()
+                        else:
+                            # Fall back to checking properties for FORMAT
+                            table_properties = table_spec.properties()
+                            if not table_properties.isEmpty():
+                                for prop in table_properties.get():
+                                    if str(prop.key()) == "FORMAT":
+                                        data_source = str(prop.value()).lower()
+                                        break
+
             # NOTE: We are intentionally ignoring any FORMAT=... parameters here.
             session.sql(
                 f"CREATE {replace_table} TABLE {if_not_exists}{name} ({columns}) {comment}"
             ).collect()
+
+            # Record table metadata for Spark compatibility
+            # Tables created with explicit schema are considered v1 tables
+            # v1 tables with certain data sources don't support RENAME COLUMN in OSS Spark
+            supports_rename = data_source not in (
+                "parquet",
+                "csv",
+                "json",
+                "orc",
+                "avro",
+            )
+            record_table_metadata(
+                table_identifier=full_table_identifier,
+                table_type="v1",
+                data_source=data_source,
+                supports_column_rename=supports_rename,
+            )
         case "CreateTableAsSelect":
             mode = "ignore" if logical_plan.ignoreIfExists() else "errorifexists"
             _create_table_as_select(logical_plan, mode=mode)
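
The two record_table_metadata calls above encode a simple compatibility rule: CTAS tables are tracked as v2 and always renameable, while v1 tables (explicit schema) backed by file formats are not. The sketch below only restates that rule for illustration; supports_column_rename() here is a hypothetical stand-in, not an API exposed by the package.

```python
# Illustrative sketch of the rule recorded via record_table_metadata above.
UNSUPPORTED_RENAME_SOURCES = {"parquet", "csv", "json", "orc", "avro"}

def supports_column_rename(table_type: str, data_source: str) -> bool:
    if table_type == "v2":  # CREATE TABLE AS SELECT path
        return True
    # v1 tables (explicit schema) depend on the data source
    return data_source.lower() not in UNSUPPORTED_RENAME_SOURCES

assert supports_column_rename("v2", "default") is True
assert supports_column_rename("v1", "default") is True
assert supports_column_rename("v1", "parquet") is False
```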
@@ -543,7 +660,6 @@ def map_sql_to_pandas_df(
             rows = session.sql(f"DESCRIBE TABLE {name}").collect()
         case "DescribeNamespace":
             name = get_relation_identifier_name(logical_plan.namespace(), True)
-            name = change_default_to_public(name)
             rows = session.sql(f"DESCRIBE SCHEMA {name}").collect()
             if not rows:
                 rows = None
@@ -729,15 +845,147 @@ def map_sql_to_pandas_df(
                 f"INSERT {overwrite_str} INTO {name} {cols_str} {final_query}",
             ).collect()
         case "MergeIntoTable":
-
-
-                + "Reason: This command is a platform-specific SQL extension and is not part of the standard Apache Spark specification that this interface uses."
+            source_df_container = map_relation(
+                map_logical_plan_relation(logical_plan.sourceTable())
             )
+            source_df = source_df_container.dataframe
+            plan_id = gen_sql_plan_id()
+            target_df_container = map_relation(
+                map_logical_plan_relation(logical_plan.targetTable(), plan_id)
+            )
+            target_df = target_df_container.dataframe
+
+            if (
+                logical_plan.targetTable().getClass().getSimpleName()
+                == "UnresolvedRelation"
+            ):
+                target_table_name = _spark_to_snowflake(
+                    logical_plan.targetTable().multipartIdentifier()
+                )
+            else:
+                target_table_name = _spark_to_snowflake(
+                    logical_plan.targetTable().child().multipartIdentifier()
+                )
+
+            target_table = session.table(target_table_name)
+            target_table_columns = target_table.columns
+            target_df_spark_names = []
+            for target_table_col, target_df_col in zip(
+                target_table_columns, target_df_container.column_map.columns
+            ):
+                target_df = target_df.with_column_renamed(
+                    target_df_col.snowpark_name,
+                    target_table_col,
+                )
+                target_df_spark_names.append(target_df_col.spark_name)
+            target_df_container = DataFrameContainer.create_with_column_mapping(
+                dataframe=target_df,
+                spark_column_names=target_df_spark_names,
+                snowpark_column_names=target_table_columns,
+            )
+
+            set_plan_id_map(plan_id, target_df_container)
+
+            joined_df_before_condition: snowpark.DataFrame = source_df.join(
+                target_df
+            )
+
+            column_mapping_for_conditions = column_name_handler.JoinColumnNameMap(
+                source_df_container.column_map,
+                target_df_container.column_map,
+            )
+            typer_for_expressions = ExpressionTyper(joined_df_before_condition)
+
+            (_, merge_condition_typed_col,) = map_single_column_expression(
+                map_logical_plan_expression(logical_plan.mergeCondition()),
+                column_mapping=column_mapping_for_conditions,
+                typer=typer_for_expressions,
+            )
+
+            clauses = []
+
+            for matched_action in as_java_list(logical_plan.matchedActions()):
+                condition = _get_condition_from_action(
+                    matched_action,
+                    column_mapping_for_conditions,
+                    typer_for_expressions,
+                )
+                if matched_action.getClass().getSimpleName() == "DeleteAction":
+                    clauses.append(when_matched(condition).delete())
+                elif (
+                    matched_action.getClass().getSimpleName() == "UpdateAction"
+                    or matched_action.getClass().getSimpleName()
+                    == "UpdateStarAction"
+                ):
+                    assignments = _get_assignments_from_action(
+                        matched_action,
+                        source_df_container.column_map,
+                        target_df_container.column_map,
+                        ExpressionTyper(source_df),
+                        ExpressionTyper(target_df),
+                    )
+                    clauses.append(when_matched(condition).update(assignments))
+
+            for not_matched_action in as_java_list(
+                logical_plan.notMatchedActions()
+            ):
+                condition = _get_condition_from_action(
+                    not_matched_action,
+                    column_mapping_for_conditions,
+                    typer_for_expressions,
+                )
+                if (
+                    not_matched_action.getClass().getSimpleName() == "InsertAction"
+                    or not_matched_action.getClass().getSimpleName()
+                    == "InsertStarAction"
+                ):
+                    assignments = _get_assignments_from_action(
+                        not_matched_action,
+                        source_df_container.column_map,
+                        target_df_container.column_map,
+                        ExpressionTyper(source_df),
+                        ExpressionTyper(target_df),
+                    )
+                    clauses.append(when_not_matched(condition).insert(assignments))
+
+            if not as_java_list(logical_plan.notMatchedBySourceActions()).isEmpty():
+                raise SnowparkConnectNotImplementedError(
+                    "Snowflake does not support 'not matched by source' actions in MERGE statements."
+                )
+
+            target_table.merge(source_df, merge_condition_typed_col.col, clauses)
         case "DeleteFromTable":
-
-
-                + "Reason: This command is a platform-specific SQL extension and is not part of the standard Apache Spark specification that this interface uses."
+            df_container = map_relation(
+                map_logical_plan_relation(logical_plan.table())
             )
+            name = get_relation_identifier_name(logical_plan.table(), True)
+            table = session.table(name)
+            table_columns = table.columns
+            df = df_container.dataframe
+            spark_names = []
+            for table_col, df_col in zip(
+                table_columns, df_container.column_map.columns
+            ):
+                df = df.with_column_renamed(
+                    df_col.snowpark_name,
+                    table_col,
+                )
+                spark_names.append(df_col.spark_name)
+            df_container = DataFrameContainer.create_with_column_mapping(
+                dataframe=df,
+                spark_column_names=spark_names,
+                snowpark_column_names=table_columns,
+            )
+            df = df_container.dataframe
+            (
+                condition_column_name,
+                condition_typed_col,
+            ) = map_single_column_expression(
+                map_logical_plan_expression(logical_plan.condition()),
+                df_container.column_map,
+                ExpressionTyper(df),
+            )
+            table.delete(condition_typed_col.col)
         case "UpdateTable":
            # Databricks/Delta-specific extension not supported by SAS.
            # Provide an actionable, clear error.
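
For context, the two branches above replace "not implemented" errors with real translations: MERGE INTO becomes a snowpark Table.merge with when_matched()/when_not_matched() clauses, and DELETE FROM becomes Table.delete. The rough sketch below shows the kind of client statements this enables; the endpoint URL and table names are assumptions for illustration, and it presumes the source and target tables already exist.

```python
from pyspark.sql import SparkSession

# Sketch only: endpoint URL and table names are illustrative assumptions.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# Handled by the new "MergeIntoTable" branch.
spark.sql("""
    MERGE INTO target t
    USING source s
    ON t.id = s.id
    WHEN MATCHED THEN UPDATE SET t.value = s.value
    WHEN NOT MATCHED THEN INSERT (id, value) VALUES (s.id, s.value)
""")

# Handled by the new "DeleteFromTable" branch.
spark.sql("DELETE FROM target WHERE value IS NULL")
```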
@@ -746,7 +994,20 @@ def map_sql_to_pandas_df(
                 + "Reason: This command is a platform-specific SQL extension and is not part of the standard Apache Spark specification that this interface uses."
             )
         case "RenameColumn":
-
+            full_table_identifier = get_relation_identifier_name(
+                logical_plan.table(), True
+            )
+
+            # Check Spark compatibility for RENAME COLUMN operation
+            if not check_table_supports_operation(
+                full_table_identifier, "rename_column"
+            ):
+                raise AnalysisException(
+                    f"ALTER TABLE RENAME COLUMN is not supported for table '{full_table_identifier}'. "
+                    f"This table was created as a v1 table with a data source that doesn't support column renaming. "
+                    f"To enable this operation, set 'enable_snowflake_extension_behavior' to 'true'."
+                )
+
             column_obj = logical_plan.column()
             old_column_name = ".".join(
                 spark_to_sf_single_id(str(part), is_column=True)
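
Combined with the metadata recorded at CREATE TABLE time, this check makes RENAME COLUMN mirror OSS Spark's restriction on v1 file-format tables. A sketch, assuming the same kind of Spark Connect session as above and an illustrative table name:

```python
from pyspark.sql import SparkSession

# Sketch only: endpoint URL and table name are illustrative assumptions.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# Recorded as a v1 parquet table by the CreateTable branch above.
spark.sql("CREATE TABLE events (id INT, label STRING) USING parquet")

# Per the new check, this is rejected with AnalysisException unless
# 'enable_snowflake_extension_behavior' is set to 'true' (quoting the error message).
spark.sql("ALTER TABLE events RENAME COLUMN label TO name")
```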
@@ -756,7 +1017,7 @@ def map_sql_to_pandas_df(
             case_insensitive_name = next(
                 (
                     f.name
-                    for f in session.table(
+                    for f in session.table(full_table_identifier).schema.fields
                     if f.name.lower() == old_column_name.lower()
                 ),
                 None,
@@ -768,7 +1029,7 @@ def map_sql_to_pandas_df(
             )
 
             # Pass through to Snowflake
-            snowflake_sql = f"ALTER TABLE {
+            snowflake_sql = f"ALTER TABLE {full_table_identifier} RENAME COLUMN {old_column_name} TO {new_column_name}"
             session.sql(snowflake_sql).collect()
         case "RenameTable":
             name = get_relation_identifier_name(logical_plan.child(), True)
@@ -795,7 +1056,6 @@ def map_sql_to_pandas_df(
         case "SetCatalogAndNamespace":
             # TODO: add catalog setting here
             name = get_relation_identifier_name(logical_plan.child(), True)
-            name = change_default_to_public(name)
             session.sql(f"USE SCHEMA {name}").collect()
         case "SetCommand":
             kv_result_tuple = logical_plan.kv().get()
@@ -804,7 +1064,6 @@ def map_sql_to_pandas_df(
             set_config_param(get_session_id(), key, val, session)
         case "SetNamespaceCommand":
             name = _spark_to_snowflake(logical_plan.namespace())
-            name = change_default_to_public(name)
             session.sql(f"USE SCHEMA {name}").collect()
         case "SetNamespaceLocation" | "SetNamespaceProperties":
             raise SnowparkConnectNotImplementedError(
@@ -1015,6 +1274,76 @@ def change_default_to_public(name: str) -> str:
     return name
 
 
+def _preprocess_identifier_calls(sql_query: str) -> str:
+    """
+    Pre-process SQL query to resolve IDENTIFIER() calls before Spark parsing.
+
+    Transforms: IDENTIFIER('abs')(c2) -> abs(c2)
+    Transforms: IDENTIFIER('COAL' || 'ESCE')(NULL, 1) -> COALESCE(NULL, 1)
+
+    This preserves all function arguments in their original positions, eliminating
+    the need to reconstruct them at the expression level.
+    """
+    import re
+
+    # Pattern to match IDENTIFIER(...) followed by optional function call arguments
+    # This captures both the identifier expression and any trailing arguments
+    # Note: We need to be careful about whitespace preservation
+    identifier_pattern = r"IDENTIFIER\s*\(\s*([^)]+)\s*\)(\s*)(\([^)]*\))?"
+
+    def resolve_identifier_match(match):
+        identifier_expr_str = match.group(1).strip()
+        whitespace = match.group(2) if match.group(2) else ""
+        function_args = match.group(3) if match.group(3) else ""
+
+        try:
+            # Handle string concatenation FIRST: IDENTIFIER('COAL' || 'ESCE')
+            # (Must check this before simple strings since it also starts/ends with quotes)
+            if "||" in identifier_expr_str:
+                # Parse basic string concatenation with proper quote handling
+                parts = []
+                split_parts = identifier_expr_str.split("||")
+                for part in split_parts:
+                    part = part.strip()
+                    if part.startswith("'") and part.endswith("'"):
+                        unquoted = part[1:-1]  # Remove quotes from each part
+                        parts.append(unquoted)
+                    else:
+                        # Non-string parts - return original for safety
+                        return match.group(0)
+                resolved_name = "".join(parts)  # Concatenate the unquoted parts
+
+            # Handle simple string literals: IDENTIFIER('abs')
+            elif identifier_expr_str.startswith("'") and identifier_expr_str.endswith(
+                "'"
+            ):
+                resolved_name = identifier_expr_str[1:-1]  # Remove quotes
+
+            else:
+                # Complex expressions not supported yet - return original
+                return match.group(0)
+
+            # Return resolved function call with preserved arguments and whitespace
+            if function_args:
+                # Function call case: IDENTIFIER('abs')(c1) -> abs(c1)
+                result = f"{resolved_name}{function_args}"
+            else:
+                # Column reference case: IDENTIFIER('c1') FROM -> c1 FROM (preserve whitespace)
+                result = f"{resolved_name}{whitespace}"
+            return result
+
+        except Exception:
+            # Return original to avoid breaking the query
+            return match.group(0)
+
+    # Apply the transformation
+    processed_query = re.sub(
+        identifier_pattern, resolve_identifier_match, sql_query, flags=re.IGNORECASE
+    )
+
+    return processed_query
+
+
 def map_sql(
     rel: relation_proto.Relation,
 ) -> DataFrameContainer:
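
The new helper is plain string preprocessing, so its effect is easy to see in isolation. The snippet below is a standalone illustration (not an import from the package) that reuses the same regex, reduced to the simple-literal case; anything more complex is left untouched, as in the function above.

```python
import re

# Same pattern as _preprocess_identifier_calls above, applied standalone.
pattern = r"IDENTIFIER\s*\(\s*([^)]+)\s*\)(\s*)(\([^)]*\))?"

def _resolve(m: re.Match) -> str:
    expr, ws, args = m.group(1).strip(), m.group(2) or "", m.group(3) or ""
    if expr.startswith("'") and expr.endswith("'") and "||" not in expr:
        name = expr[1:-1]  # strip the quotes around the literal
        return f"{name}{args}" if args else f"{name}{ws}"
    return m.group(0)  # leave anything more complex untouched

print(re.sub(pattern, _resolve, "SELECT IDENTIFIER('abs')(c2) FROM t", flags=re.IGNORECASE))
# -> SELECT abs(c2) FROM t
```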
@@ -1845,8 +2174,11 @@ def map_logical_plan_relation(
 
 
 def get_relation_identifier_name(name_obj, is_multi_part: bool = False) -> str:
-    if name_obj.getClass().getSimpleName()
-
+    if name_obj.getClass().getSimpleName() in (
+        "PlanWithUnresolvedIdentifier",
+        "ExpressionWithUnresolvedIdentifier",
+    ):
+        # IDENTIFIER(<table_name>), or IDENTIFIER(<method name>)
         expr_proto = map_logical_plan_expression(name_obj.identifierExpr())
         session = snowpark.Session.get_active_session()
         m = ColumnNameMap([], [], None)
@@ -1858,7 +2190,12 @@ def get_relation_identifier_name(name_obj, is_multi_part: bool = False) -> str:
         )
     else:
         if is_multi_part:
-
+            try:
+                # Try multipartIdentifier first for full catalog.database.table
+                name = _spark_to_snowflake(name_obj.multipartIdentifier())
+            except AttributeError:
+                # Fallback to nameParts if multipartIdentifier not available
+                name = _spark_to_snowflake(name_obj.nameParts())
         else:
             name = _spark_to_snowflake(name_obj.nameParts())
 