snowpark-connect 0.31.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/column_name_handler.py +143 -105
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/dataframe_container.py +3 -2
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +4 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +5 -4
- snowflake/snowpark_connect/expression/map_expression.py +5 -4
- snowflake/snowpark_connect/expression/map_extension.py +12 -6
- snowflake/snowpark_connect/expression/map_sql_expression.py +50 -7
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +62 -25
- snowflake/snowpark_connect/expression/map_unresolved_function.py +924 -127
- snowflake/snowpark_connect/expression/map_unresolved_star.py +9 -7
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +1281 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +203 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +202 -0
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +4 -1
- snowflake/snowpark_connect/relation/map_aggregate.py +6 -5
- snowflake/snowpark_connect/relation/map_column_ops.py +9 -3
- snowflake/snowpark_connect/relation/map_extension.py +10 -9
- snowflake/snowpark_connect/relation/map_join.py +219 -144
- snowflake/snowpark_connect/relation/map_row_ops.py +136 -54
- snowflake/snowpark_connect/relation/map_sql.py +134 -16
- snowflake/snowpark_connect/relation/map_subquery_alias.py +4 -1
- snowflake/snowpark_connect/relation/read/map_read_json.py +87 -2
- snowflake/snowpark_connect/relation/read/map_read_table.py +6 -3
- snowflake/snowpark_connect/relation/utils.py +46 -0
- snowflake/snowpark_connect/relation/write/map_write.py +215 -289
- snowflake/snowpark_connect/resources_initializer.py +25 -13
- snowflake/snowpark_connect/server.py +10 -26
- snowflake/snowpark_connect/type_mapping.py +38 -3
- snowflake/snowpark_connect/typed_column.py +8 -6
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +27 -4
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +1 -1
- {snowpark_connect-0.31.0.dist-info → snowpark_connect-0.33.0.dist-info}/METADATA +7 -2
- {snowpark_connect-0.31.0.dist-info → snowpark_connect-0.33.0.dist-info}/RECORD +46 -105
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +0 -4
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +0 -4
- {snowpark_connect-0.31.0.data → snowpark_connect-0.33.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.31.0.data → snowpark_connect-0.33.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.31.0.data → snowpark_connect-0.33.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.31.0.dist-info → snowpark_connect-0.33.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.31.0.dist-info → snowpark_connect-0.33.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.31.0.dist-info → snowpark_connect-0.33.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.31.0.dist-info → snowpark_connect-0.33.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.31.0.dist-info → snowpark_connect-0.33.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py

@@ -0,0 +1,203 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Additional Spark functions used in pandas-on-Spark.
+"""
+from typing import Union
+
+from pyspark import SparkContext
+import pyspark.sql.functions as F
+from pyspark.sql.column import Column
+
+# For supporting Spark Connect
+from pyspark.sql.utils import is_remote
+
+
+def product(col: Column, dropna: bool) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_product",
+            col,  # type: ignore[arg-type]
+            lit(dropna),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasProduct(col._jc, dropna))
+
+
+def stddev(col: Column, ddof: int) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_stddev",
+            col,  # type: ignore[arg-type]
+            lit(ddof),
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasStddev(col._jc, ddof))
+
+
+def var(col: Column, ddof: int) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_var",
+            col,  # type: ignore[arg-type]
+            lit(ddof),
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasVariance(col._jc, ddof))
+
+
+def skew(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_skew",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasSkewness(col._jc))
+
+
+def kurt(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_kurt",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasKurtosis(col._jc))
+
+
+def mode(col: Column, dropna: bool) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_mode",
+            col,  # type: ignore[arg-type]
+            lit(dropna),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasMode(col._jc, dropna))
+
+
+def covar(col1: Column, col2: Column, ddof: int) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_covar",
+            col1,  # type: ignore[arg-type]
+            col2,  # type: ignore[arg-type]
+            lit(ddof),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasCovar(col1._jc, col2._jc, ddof))
+
+
+def repeat(col: Column, n: Union[int, Column]) -> Column:
+    """
+    Repeats a string column n times, and returns it as a new string column.
+    """
+    _n = F.lit(n) if isinstance(n, int) else n
+    return F.call_udf("repeat", col, _n)
+
+
+def ewm(col: Column, alpha: float, ignore_na: bool) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "ewm",
+            col,  # type: ignore[arg-type]
+            lit(alpha),
+            lit(ignore_na),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.ewm(col._jc, alpha, ignore_na))
+
+
+def last_non_null(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "last_non_null",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.lastNonNull(col._jc))
+
+
+def null_index(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "null_index",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.nullIndex(col._jc))
+
+
+def timestampdiff(unit: str, start: Column, end: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "timestampdiff",
+            lit(unit),
+            start,  # type: ignore[arg-type]
+            end,  # type: ignore[arg-type]
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.timestampDiff(unit, start._jc, end._jc))
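Every helper in the new functions.py follows the same dispatch pattern: under Spark Connect (is_remote()) it routes through _invoke_function_over_columns, otherwise it calls into the JVM via PythonSQLUtils. A minimal usage sketch, not part of the diff, assuming pyspark 3.5+ is installed, the vendored module above is importable as pyspark.pandas.spark.functions, and a DataFrame with a numeric column "x":

# Illustrative only -- not part of the diff.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.pandas.spark import functions as SF  # module added in this release

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (4.0,)], ["x"])

# ddof mirrors pandas semantics instead of Spark's fixed stddev_samp/stddev_pop split.
df.agg(
    SF.stddev(F.col("x"), ddof=1).alias("std"),
    SF.var(F.col("x"), ddof=1).alias("var"),
    SF.skew(F.col("x")).alias("skew"),
).show()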
snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py

@@ -0,0 +1,202 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Helpers and utilities to deal with PySpark instances
+"""
+from typing import overload
+
+from pyspark.sql.types import DecimalType, StructType, MapType, ArrayType, StructField, DataType
+
+
+@overload
+def as_nullable_spark_type(dt: StructType) -> StructType:
+    ...
+
+
+@overload
+def as_nullable_spark_type(dt: ArrayType) -> ArrayType:
+    ...
+
+
+@overload
+def as_nullable_spark_type(dt: MapType) -> MapType:
+    ...
+
+
+@overload
+def as_nullable_spark_type(dt: DataType) -> DataType:
+    ...
+
+
+def as_nullable_spark_type(dt: DataType) -> DataType:
+    """
+    Returns a nullable schema or data types.
+
+    Examples
+    --------
+    >>> from pyspark.sql.types import *
+    >>> as_nullable_spark_type(StructType([
+    ...     StructField("A", IntegerType(), True),
+    ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A', IntegerType(), True), StructField('B', FloatType(), True)])
+
+    >>> as_nullable_spark_type(StructType([
+    ...     StructField("A",
+    ...         StructType([
+    ...             StructField('a',
+    ...                 MapType(IntegerType(),
+    ...                 ArrayType(IntegerType(), False), False), False),
+    ...             StructField('b', StringType(), True)])),
+    ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A',
+        StructType([StructField('a',
+            MapType(IntegerType(),
+            ArrayType(IntegerType(), True), True), True),
+            StructField('b', StringType(), True)]), True),
+        StructField('B', FloatType(), True)])
+    """
+    if isinstance(dt, StructType):
+        new_fields = []
+        for field in dt.fields:
+            new_fields.append(
+                StructField(
+                    field.name,
+                    as_nullable_spark_type(field.dataType),
+                    nullable=True,
+                    metadata=field.metadata,
+                )
+            )
+        return StructType(new_fields)
+    elif isinstance(dt, ArrayType):
+        return ArrayType(as_nullable_spark_type(dt.elementType), containsNull=True)
+    elif isinstance(dt, MapType):
+        return MapType(
+            as_nullable_spark_type(dt.keyType),
+            as_nullable_spark_type(dt.valueType),
+            valueContainsNull=True,
+        )
+    else:
+        return dt
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: StructType, *, precision: int = ..., scale: int = ...
+) -> StructType:
+    ...
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: ArrayType, *, precision: int = ..., scale: int = ...
+) -> ArrayType:
+    ...
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: MapType, *, precision: int = ..., scale: int = ...
+) -> MapType:
+    ...
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: DataType, *, precision: int = ..., scale: int = ...
+) -> DataType:
+    ...
+
+
+def force_decimal_precision_scale(
+    dt: DataType, *, precision: int = 38, scale: int = 18
+) -> DataType:
+    """
+    Returns a data type with a fixed decimal type.
+
+    The precision and scale of the decimal type are fixed with the given values.
+
+    Examples
+    --------
+    >>> from pyspark.sql.types import *
+    >>> force_decimal_precision_scale(StructType([
+    ...     StructField("A", DecimalType(10, 0), True),
+    ...     StructField("B", DecimalType(14, 7), False)]))  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A', DecimalType(38,18), True),
+        StructField('B', DecimalType(38,18), False)])
+
+    >>> force_decimal_precision_scale(StructType([
+    ...     StructField("A",
+    ...         StructType([
+    ...             StructField('a',
+    ...                 MapType(DecimalType(5, 0),
+    ...                 ArrayType(DecimalType(20, 0), False), False), False),
+    ...             StructField('b', StringType(), True)])),
+    ...     StructField("B", DecimalType(30, 15), False)]),
+    ...     precision=30, scale=15)  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A',
+        StructType([StructField('a',
+            MapType(DecimalType(30,15),
+            ArrayType(DecimalType(30,15), False), False), False),
+            StructField('b', StringType(), True)]), True),
+        StructField('B', DecimalType(30,15), False)])
+    """
+    if isinstance(dt, StructType):
+        new_fields = []
+        for field in dt.fields:
+            new_fields.append(
+                StructField(
+                    field.name,
+                    force_decimal_precision_scale(field.dataType, precision=precision, scale=scale),
+                    nullable=field.nullable,
+                    metadata=field.metadata,
+                )
+            )
+        return StructType(new_fields)
+    elif isinstance(dt, ArrayType):
+        return ArrayType(
+            force_decimal_precision_scale(dt.elementType, precision=precision, scale=scale),
+            containsNull=dt.containsNull,
+        )
+    elif isinstance(dt, MapType):
+        return MapType(
+            force_decimal_precision_scale(dt.keyType, precision=precision, scale=scale),
+            force_decimal_precision_scale(dt.valueType, precision=precision, scale=scale),
+            valueContainsNull=dt.valueContainsNull,
+        )
+    elif isinstance(dt, DecimalType):
+        return DecimalType(precision=precision, scale=scale)
+    else:
+        return dt
+
+
+def _test() -> None:
+    import doctest
+    import sys
+    import pyspark.pandas.spark.utils
+
+    globs = pyspark.pandas.spark.utils.__dict__.copy()
+    (failure_count, test_count) = doctest.testmod(
+        pyspark.pandas.spark.utils,
+        globs=globs,
+        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+    )
+    if failure_count:
+        sys.exit(-1)
+
+
+if __name__ == "__main__":
+    _test()
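The two helpers above rewrite Spark schemas recursively: one makes every field and element nullable, the other pins all DecimalType fields to a single precision/scale. A quick illustration, not part of the diff, assuming only that pyspark is installed and the vendored module above is importable:

# Illustrative only -- not part of the diff.
from pyspark.sql.types import ArrayType, DecimalType, IntegerType, StructField, StructType
from pyspark.pandas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale

schema = StructType([
    StructField("id", IntegerType(), nullable=False),
    StructField("amounts", ArrayType(DecimalType(10, 2), containsNull=False), nullable=False),
])

# Every field and array element becomes nullable, recursively.
print(as_nullable_spark_type(schema))

# Every DecimalType is coerced to one precision/scale (defaults: 38, 18).
print(force_decimal_precision_scale(schema, precision=38, scale=18))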
snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py

@@ -19,6 +19,7 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
 )
 from snowflake.snowpark.functions import lit
 from snowflake.snowpark.types import BooleanType, StringType
+from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
 from snowflake.snowpark_connect.config import (
     auto_uppercase_non_column_identifiers,
     global_config,
@@ -743,7 +744,9 @@ class SnowflakeCatalog(AbstractSparkCatalog):
         sp_schema = proto_to_snowpark_type(schema)
         columns = [c.name for c in schema.struct.fields]
         table_name_parts = split_fully_qualified_spark_name(tableName)
-        qualifiers = [
+        qualifiers: list[set[ColumnQualifier]] = [
+            {ColumnQualifier(tuple(table_name_parts))} for _ in columns
+        ]
         column_types = [f.datatype for f in sp_schema.fields]
         return DataFrameContainer.create_with_column_mapping(
             dataframe=session.createDataFrame([], sp_schema),
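The common thread in this and the remaining hunks is the new ColumnQualifier type (added in snowflake/snowpark_connect/column_qualifier.py, which is not expanded in this diff): each tracked column now carries a set[ColumnQualifier] instead of an untyped list. Purely as a reading aid, a hypothetical minimal shape consistent with the call sites above; the real class may differ:

# Hypothetical sketch -- column_qualifier.py is not shown in this diff, so this only
# mirrors what the call sites imply: hashable, built from a tuple of name parts,
# stored one set per column.
from dataclasses import dataclass


@dataclass(frozen=True)
class ColumnQualifier:
    parts: tuple[str, ...]  # e.g. ("db", "schema", "table")


table_name_parts = ["db", "schema", "orders"]  # stand-in for split_fully_qualified_spark_name(...)
columns = ["id", "amount"]
qualifiers: list[set[ColumnQualifier]] = [
    {ColumnQualifier(tuple(table_name_parts))} for _ in columns
]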
snowflake/snowpark_connect/relation/map_aggregate.py

@@ -16,6 +16,7 @@ from snowflake.snowpark.types import DataType
 from snowflake.snowpark_connect.column_name_handler import (
     make_column_names_snowpark_compatible,
 )
+from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.expression.literal import get_literal_field_and_name
 from snowflake.snowpark_connect.expression.map_expression import (
@@ -200,7 +201,7 @@ def map_pivot_aggregate(
         dataframe=result.select(*column_selectors),
         spark_column_names=reordered_spark_names,
         snowpark_column_names=reordered_snowpark_names,
-        column_qualifiers=[
+        column_qualifiers=[set() for _ in reordered_spark_names],
         parent_column_name_map=input_container.column_map,
         snowpark_column_types=reordered_types,
     )
@@ -349,7 +350,7 @@ class _ColumnMetadata:
     spark_name: str
     snowpark_name: str
     data_type: DataType
-    qualifiers:
+    qualifiers: set[ColumnQualifier]


 @dataclass(frozen=True)
@@ -385,7 +386,7 @@ class _Columns:
             col.spark_name for col in self.grouping_columns + self.aggregation_columns
         ]

-    def get_qualifiers(self) -> list[
+    def get_qualifiers(self) -> list[set[ColumnQualifier]]:
         return [
             col.qualifiers for col in self.grouping_columns + self.aggregation_columns
         ]
@@ -429,7 +430,7 @@ def map_aggregate_helper(
                 new_name,
                 None if skip_alias else alias,
                 None if pivot else snowpark_column.typ,
-                snowpark_column.get_qualifiers(),
+                qualifiers=snowpark_column.get_qualifiers(),
             )
         )

@@ -469,7 +470,7 @@ def map_aggregate_helper(
                 new_name,
                 None if skip_alias else alias,
                 agg_col_typ,
-
+                qualifiers=set(),
             )
         )

snowflake/snowpark_connect/relation/map_column_ops.py

@@ -29,6 +29,7 @@ from snowflake.snowpark.column import Column
 from snowflake.snowpark.table_function import _ExplodeFunctionCall
 from snowflake.snowpark.types import DataType, StructField, StructType, _NumericType
 from snowflake.snowpark_connect.column_name_handler import (
+    ColumnQualifier,
     make_column_names_snowpark_compatible,
 )
 from snowflake.snowpark_connect.config import global_config
@@ -315,6 +316,11 @@ def map_project(
     final_snowpark_columns = make_column_names_snowpark_compatible(
         new_spark_columns, rel.common.plan_id
     )
+    # if there are duplicate snowpark column names, we need to disambiguate them by their index
+    if len(new_spark_columns) != len(set(new_spark_columns)):
+        result = result.select(
+            [f"${i}" for i in range(1, len(new_spark_columns) + 1)]
+        )
     result = result.toDF(*final_snowpark_columns)
     new_snowpark_columns = final_snowpark_columns

@@ -1014,7 +1020,7 @@ def map_unpivot(
     column_project = []
     column_reverse_project = []
     snowpark_columns = []
-    qualifiers = []
+    qualifiers: list[set[ColumnQualifier]] = []
     for c in input_container.column_map.get_snowpark_columns():
         c_name = snowpark_functions_col(c, input_container.column_map).get_name()
         if c_name in unpivot_col_names:
@@ -1042,7 +1048,7 @@ def map_unpivot(
             )
             snowpark_columns.append(c)
             qualifiers.append(
-                input_container.column_map.
+                input_container.column_map.get_qualifiers_for_spark_column(c)
             )

     # Without the case when postprocessing, the result Spark dataframe is:
@@ -1087,7 +1093,7 @@ def map_unpivot(
         snowpark_functions_col(snowpark_value_column_name, input_container.column_map)
     )
     snowpark_columns.append(snowpark_value_column_name)
-    qualifiers.extend([
+    qualifiers.extend([set() for _ in range(2)])

     result = (
         input_df.select(*column_project)
snowflake/snowpark_connect/relation/map_extension.py

@@ -15,6 +15,7 @@ from snowflake.snowpark_connect.column_name_handler import (
     ColumnNameMap,
     make_column_names_snowpark_compatible,
 )
+from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
 from snowflake.snowpark_connect.config import get_boolean_session_config_param
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.error.error_codes import ErrorCodes
@@ -178,7 +179,7 @@ def get_udtf_project(relation: relation_proto.Relation) -> bool:

 def handle_udtf_with_table_arguments(
     udtf_info: snowflake_proto.UDTFWithTableArguments,
-) ->
+) -> DataFrameContainer:
     """
     Handle UDTF with one or more table arguments using Snowpark's join_table_function.
     For multiple table arguments, this creates a Cartesian product of all input tables.
@@ -286,7 +287,7 @@ def handle_lateral_join_with_udtf(
     left_result: DataFrameContainer,
     udtf_relation: relation_proto.Relation,
     udtf_info: tuple[snowpark.udtf.UserDefinedTableFunction, list],
-) ->
+) -> DataFrameContainer:
     """
     Handle lateral join with UDTF on the right side using join_table_function.
     """
@@ -319,7 +320,7 @@ def handle_lateral_join_with_udtf(

 def map_aggregate(
     aggregate: snowflake_proto.Aggregate, plan_id: int
-) ->
+) -> DataFrameContainer:
     input_container = map_relation(aggregate.input)
     input_df: snowpark.DataFrame = input_container.dataframe

@@ -363,7 +364,7 @@ def map_aggregate(
         return new_names[0], snowpark_column

     raw_groupings: list[tuple[str, TypedColumn]] = []
-    raw_aggregations: list[tuple[str, TypedColumn,
+    raw_aggregations: list[tuple[str, TypedColumn, set[ColumnQualifier]]] = []

     if not is_group_by_all:
         raw_groupings = [_map_column(exp) for exp in aggregate.grouping_expressions]
@@ -401,11 +402,11 @@ def map_aggregate(
             col = _map_column(exp)
             if exp.WhichOneof("expr_type") == "unresolved_attribute":
                 spark_name = col[0]
-                qualifiers
-
-                )
+                qualifiers: set[
+                    ColumnQualifier
+                ] = input_container.column_map.get_qualifiers_for_spark_column(spark_name)
             else:
-                qualifiers =
+                qualifiers = set()

             raw_aggregations.append((col[0], col[1], qualifiers))

@@ -438,7 +439,7 @@ def map_aggregate(
     spark_columns: list[str] = []
     snowpark_columns: list[str] = []
     snowpark_column_types: list[snowpark_types.DataType] = []
-    all_qualifiers: list[
+    all_qualifiers: list[set[ColumnQualifier]] = []

     # Use grouping columns directly without aliases
     groupings = [col.col for _, col in raw_groupings]