snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +680 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +237 -23
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +123 -5
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +85 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
- snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
- snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +110 -48
- snowflake/snowpark_connect/server.py +546 -456
- snowflake/snowpark_connect/server_common/__init__.py +500 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +187 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +163 -22
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/utils.py

@@ -7,16 +7,18 @@ import re
 import string
 import time
 import uuid
-from typing import Sequence
+from typing import Any, Sequence

 import pyspark.sql.connect.proto.relations_pb2 as relation_proto

 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
+from snowflake.snowpark import Column
 from snowflake.snowpark.types import (
     BinaryType,
     BooleanType,
     ByteType,
+    DataType,
     DateType,
     DecimalType,
     DoubleType,
@@ -92,6 +94,21 @@ TYPE_MAP_FOR_TO_SCHEMA = {
 }


+# This mapping is used to map the compression type to the extension of the file.
+FILE_COMPRESSION_TO_EXTENSION = {
+    "GZIP": "gz",
+    "BZ2": "bz2",
+    "BROTLI": "br",
+    "ZSTD": "zst",
+    "DEFLATE": "deflate",
+    "RAW_DEFLATE": "raw_deflate",
+    "SNAPPY": "snappy",
+    "LZO": "lzo",
+    "LZ4": "lz4",
+    "BZIP2": "bz2",
+}
+
+
 def get_df_with_partition_row_number(
     container: DataFrameContainer,
     plan_id: int | None,
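The new mapping feeds the compression-suffix lookup used later in generate_spark_compatible_filename: known codecs resolve to their conventional file extension, anything else falls back to the lowercased codec name via the .get() default. A minimal sketch of that lookup (the dict is trimmed to three entries here; the example codec names are illustrative):

# Sketch of the extension lookup introduced above; entries copied from the
# diff but trimmed for brevity.
FILE_COMPRESSION_TO_EXTENSION = {"GZIP": "gz", "ZSTD": "zst", "BROTLI": "br"}

def extension_for(compression: str) -> str:
    # Known codecs map to their conventional extension; unknown codecs
    # fall back to the lowercased codec name.
    return FILE_COMPRESSION_TO_EXTENSION.get(compression.upper(), compression.lower())

assert extension_for("gzip") == "gz"
assert extension_for("lzma") == "lzma"  # not in the map: lowercased fallback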
@@ -159,6 +176,7 @@ def generate_spark_compatible_filename(
     attempt_number: int = 0,
     compression: str = None,
     format_ext: str = "parquet",
+    shared_uuid: str = None,
 ) -> str:
     """Generate a Spark-compatible filename following the convention:
     part-<task-id>-<uuid>-c<attempt-number>.<compression>.<format>
@@ -168,12 +186,13 @@ def generate_spark_compatible_filename(
         attempt_number: Attempt number (usually 0)
         compression: Compression type (e.g., 'snappy', 'gzip', 'none')
         format_ext: File format extension (e.g., 'parquet', 'csv', 'json')
+        shared_uuid: Shared UUID for the file

     Returns:
         A filename string following Spark's naming convention
     """
-    #
-    file_uuid = str(uuid.uuid4())
+    # Use the shared UUID if provided, otherwise generate a new one for uniqueness
+    file_uuid = shared_uuid or str(uuid.uuid4())

     # Format task ID with leading zeros (5 digits)
     formatted_task_id = f"{task_id:05d}"
@@ -186,13 +205,15 @@ def generate_spark_compatible_filename(

     # Add compression if specified and not 'none'
     if compression and compression.lower() not in ("none", "uncompressed"):
-        compression_part = f".{compression.lower()}"
+        compression_part = f".{FILE_COMPRESSION_TO_EXTENSION.get(compression.upper(), compression.lower())}"
     else:
         compression_part = ""

     # Add format extension if specified
-    if format_ext:
+    if format_ext == "parquet":
         return f"{base_name}{compression_part}.{format_ext}"
+    elif format_ext is not None and format_ext != "":
+        return f"{base_name}.{format_ext}{compression_part}"
     else:
         return f"{base_name}{compression_part}"

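Net effect of the two hunks above: Parquet keeps Spark's codec-before-extension convention (.snappy.parquet), while every other format now places the codec suffix after the extension (.csv.gz rather than the old .gz.csv). A condensed restatement of the new branch, assuming base_name is the part-<task-id>-<uuid>-c<attempt-number> prefix built earlier in the function (its construction is not shown in this diff):

# Condensed restatement of the suffix placement after this change.
def filename_suffix(base_name: str, compression_part: str, format_ext: str) -> str:
    if format_ext == "parquet":
        # Parquet: codec before the extension, e.g. part-...-c000.snappy.parquet
        return f"{base_name}{compression_part}.{format_ext}"
    elif format_ext is not None and format_ext != "":
        # Other formats: codec after the extension, e.g. part-...-c000.csv.gz
        return f"{base_name}.{format_ext}{compression_part}"
    else:
        return f"{base_name}{compression_part}"

print(filename_suffix("part-00000-abc-c000", ".snappy", "parquet"))  # part-00000-abc-c000.snappy.parquet
print(filename_suffix("part-00000-abc-c000", ".gz", "csv"))          # part-00000-abc-c000.csv.gz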
@@ -267,3 +288,105 @@ def snowpark_functions_col(name: str, column_map: ColumnNameMap) -> snowpark.Col
     """
     is_qualified_name = name not in column_map.get_snowpark_columns()
     return snowpark_fn.col(name, _is_qualified_name=is_qualified_name)
+
+
+def is_aggregate_function(func_name: str) -> bool:
+    """
+    Check if a function name is an aggregate function.
+
+    Uses a hybrid approach:
+    1. First checks PySpark's docstring convention (docstrings starting with "Aggregate function:")
+    2. Falls back to a hardcoded list for functions with missing/incorrect docstrings
+
+    This ensures comprehensive coverage while automatically supporting new PySpark aggregate functions.
+
+    Args:
+        func_name: The function name to check (case-insensitive)
+
+    Returns:
+        True if the function is an aggregate function, False otherwise
+    """
+    try:
+        import pyspark.sql.functions as pyspark_functions
+
+        # TODO:
+        """
+        Check we can leverage scala classes to determine agg functions:
+        https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala#L207
+        """
+
+        # Try PySpark docstring approach first (covers most aggregate functions)
+        pyspark_func = getattr(pyspark_functions, func_name.lower(), None)
+        if pyspark_func and pyspark_func.__doc__:
+            if pyspark_func.__doc__.lstrip().startswith("Aggregate function:"):
+                return True
+
+        # Fallback list for aggregate functions with missing/incorrect docstrings
+        # These are known aggregate functions that don't have proper docstring markers
+        fallback_aggregates = {
+            "percentile_cont",
+            "percentile_disc",
+            "any_value",
+            "grouping",
+            "grouping_id",
+        }
+        return func_name.lower() in fallback_aggregates
+
+    except Exception:
+        return False
+
+
+def get_all_dependent_column_names(columns: list[Column]) -> set[str]:
+    all_dependent_column_names = set()
+
+    for col in columns:
+        if hasattr(col, "_expr1"):
+            all_dependent_column_names = all_dependent_column_names.union(
+                col._expr1.dependent_column_names()
+            )
+
+    return all_dependent_column_names
+
+
+def map_pivot_value_to_spark_column_name(pivot_value: Any) -> tuple[str, bool]:
+    """
+    Maps pivot_value to the spark column name, without appending the aggregation suffix.
+
+    Returns:
+        A tuple containing the spark column name and a boolean indicating whether the original_value was null or not.
+    """
+
+    is_null = False
+
+    if pivot_value in (None, "NULL", "None"):
+        spark_name = "null"
+        is_null = True
+    else:
+        if isinstance(pivot_value, tuple):
+            spark_name = str(list(pivot_value))
+        elif isinstance(pivot_value, dict):
+            spark_name = "{" + ", ".join(str(v) for v in pivot_value.values()) + "}"
+        else:
+            spark_name = str(pivot_value)
+
+    return spark_name, is_null
+
+
+def create_pivot_column_condition(
+    col: Column,
+    pivot_value: Any,
+    pivot_value_is_null: bool,
+    cast_literal_to: DataType | None = None,
+) -> snowpark.Column:
+    if isinstance(pivot_value, dict):
+        elements = [
+            snowpark_fn.lit(item) for pair in pivot_value.items() for item in pair
+        ]
+        lit = snowpark_fn.object_construct_keep_null(*elements)
+    else:
+        lit = snowpark_fn.lit(pivot_value)
+
+    if cast_literal_to:
+        lit = snowpark_fn.cast(lit, cast_literal_to)
+
+    return snowpark_fn.is_null(col) if pivot_value_is_null else (col == lit)
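The pivot helper is easiest to read through its outputs. Below is a dependency-free restatement of map_pivot_value_to_spark_column_name with hand-checked expected values; the is_aggregate_function docstring probe is not exercised here because its result depends on the installed PySpark version.

# Restatement of map_pivot_value_to_spark_column_name from the hunk above,
# kept self-contained so the expected outputs can be verified directly.
from typing import Any

def pivot_name(pivot_value: Any) -> tuple[str, bool]:
    if pivot_value in (None, "NULL", "None"):
        return "null", True
    if isinstance(pivot_value, tuple):
        return str(list(pivot_value)), False
    if isinstance(pivot_value, dict):
        return "{" + ", ".join(str(v) for v in pivot_value.values()) + "}", False
    return str(pivot_value), False

assert pivot_name(None) == ("null", True)           # null pivot values map to "null"
assert pivot_name(("a", 1)) == ("['a', 1]", False)  # tuples render as lists
assert pivot_name({"k": "v"}) == ("{v}", False)     # dicts render only their values
assert pivot_name(42) == ("42", False)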
snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py

@@ -11,6 +11,8 @@ from snowflake import snowpark
 from snowflake.snowpark import DataFrameWriter
 from snowflake.snowpark.dataframe import DataFrame
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read import jdbc_read_dbapi
 from snowflake.snowpark_connect.relation.read.jdbc_read_dbapi import JdbcDialect
 from snowflake.snowpark_connect.relation.read.utils import Connection
@@ -65,9 +67,13 @@ class JdbcDataFrameWriter(DataFrameWriter):
                     self._create_table(conn, table, container, jdbc_dialect)
             case "errorifexists":
                 if table_exist:
-                    raise ValueError(
+                    exception = ValueError(
                         "table is already exist and write mode is ERROR_IF_EXISTS"
                     )
+                    attach_custom_error_code(
+                        exception, ErrorCodes.INVALID_OPERATION
+                    )
+                    raise exception
                 else:
                     self._create_table(conn, table, container, jdbc_dialect)
             case "overwrite":
@@ -82,7 +88,9 @@ class JdbcDataFrameWriter(DataFrameWriter):
                 else:
                     self._create_table(conn, table, container, jdbc_dialect)
             case _:
-                raise ValueError(f"Invalid write mode value{write_mode}")
+                exception = ValueError(f"Invalid write mode value{write_mode}")
+                attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+                raise exception

         task_insert_into_data_source_with_retry(
             input_df,
@@ -141,6 +149,7 @@ class JdbcDataFrameWriter(DataFrameWriter):
                 cursor.execute(sql)
             except Exception as e:
                 logger.error(f"failed to drop table {table} from the data source {e}")
+                attach_custom_error_code(e, ErrorCodes.INTERNAL_ERROR)
                 raise e

     def _create_table(
@@ -189,6 +198,7 @@ class JdbcDataFrameWriter(DataFrameWriter):
                 cursor.execute(sql)
             except Exception as e:
                 logger.error(f"failed to create a table {table} from the data source {e}")
+                attach_custom_error_code(e, ErrorCodes.INTERNAL_ERROR)
                 raise e


@@ -218,6 +228,7 @@ def _task_insert_into_data_source(
     except Exception as e:
         logger.debug(f"failed to insert into data source {e}")
         conn.rollback()
+        attach_custom_error_code(e, ErrorCodes.INTERNAL_ERROR)
         raise e
     finally:
         cursor.close()
@@ -274,6 +285,7 @@ def task_insert_into_data_source_with_retry(
         )
     except Exception as e:
         logger.debug(f"failed to insert into data source {e}")
+        attach_custom_error_code(e, ErrorCodes.INTERNAL_ERROR)
         raise e
     finally:
         close_connection(conn)
@@ -339,4 +351,8 @@ def convert_sp_to_sql_type(
                 case _:
                     return "TIMESTAMP"
         case _:
-            raise TypeError(f"Unsupported data type: {datatype.__class__.__name__}")
+            exception = TypeError(
+                f"Unsupported data type: {datatype.__class__.__name__}"
+            )
+            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_TYPE)
+            raise exception
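Every jdbc_write_dbapi.py hunk above applies the same pattern: build or catch the exception, tag it with a stable error code, then re-raise. The real attach_custom_error_code and ErrorCodes live in the new error/error_utils.py and error/error_codes.py modules (listed at the top of this diff but not shown here), so the stand-ins below only sketch the assumed contract.

# Stand-ins illustrating the attach-then-raise shape; the assumption is that
# the helper tags the exception in place so upstream handlers can translate
# it into a machine-readable error code.
from enum import Enum

class ErrorCodes(Enum):  # stand-in for error/error_codes.py
    INVALID_INPUT = "INVALID_INPUT"
    INTERNAL_ERROR = "INTERNAL_ERROR"

def attach_custom_error_code(exc: Exception, code: ErrorCodes) -> None:
    exc.custom_error_code = code  # hypothetical attribute name

def write_with_mode(write_mode: str) -> None:
    if write_mode not in ("append", "overwrite", "errorifexists", "ignore"):
        # Same shape as the `case _:` branches in the hunks above.
        exception = ValueError(f"Invalid write mode value {write_mode}")
        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
        raise exception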