snowpark-connect 0.23.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff shows the changes between these publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
Note: this release of snowpark-connect has been flagged as potentially problematic.
- snowflake/snowpark_connect/expression/function_defaults.py +207 -0
- snowflake/snowpark_connect/expression/literal.py +5 -0
- snowflake/snowpark_connect/expression/map_expression.py +10 -1
- snowflake/snowpark_connect/expression/map_extension.py +12 -2
- snowflake/snowpark_connect/expression/map_unresolved_function.py +11 -12
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +1281 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +203 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +202 -0
- snowflake/snowpark_connect/relation/map_column_ops.py +1 -32
- snowflake/snowpark_connect/relation/map_extension.py +7 -7
- snowflake/snowpark_connect/relation/map_row_ops.py +2 -29
- snowflake/snowpark_connect/relation/read/utils.py +6 -7
- snowflake/snowpark_connect/relation/utils.py +1 -170
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.23.0.dist-info → snowpark_connect-0.24.0.dist-info}/METADATA +1 -1
- {snowpark_connect-0.23.0.dist-info → snowpark_connect-0.24.0.dist-info}/RECORD +25 -20
- {snowpark_connect-0.23.0.data → snowpark_connect-0.24.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.23.0.data → snowpark_connect-0.24.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.23.0.data → snowpark_connect-0.24.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.23.0.dist-info → snowpark_connect-0.24.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.23.0.dist-info → snowpark_connect-0.24.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.23.0.dist-info → snowpark_connect-0.24.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.23.0.dist-info → snowpark_connect-0.24.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.23.0.dist-info → snowpark_connect-0.24.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py
@@ -0,0 +1,203 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Additional Spark functions used in pandas-on-Spark.
+"""
+from typing import Union
+
+from pyspark import SparkContext
+import pyspark.sql.functions as F
+from pyspark.sql.column import Column
+
+# For supporting Spark Connect
+from pyspark.sql.utils import is_remote
+
+
+def product(col: Column, dropna: bool) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_product",
+            col,  # type: ignore[arg-type]
+            lit(dropna),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasProduct(col._jc, dropna))
+
+
+def stddev(col: Column, ddof: int) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_stddev",
+            col,  # type: ignore[arg-type]
+            lit(ddof),
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasStddev(col._jc, ddof))
+
+
+def var(col: Column, ddof: int) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_var",
+            col,  # type: ignore[arg-type]
+            lit(ddof),
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasVariance(col._jc, ddof))
+
+
+def skew(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_skew",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasSkewness(col._jc))
+
+
+def kurt(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_kurt",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasKurtosis(col._jc))
+
+
+def mode(col: Column, dropna: bool) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_mode",
+            col,  # type: ignore[arg-type]
+            lit(dropna),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasMode(col._jc, dropna))
+
+
+def covar(col1: Column, col2: Column, ddof: int) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_covar",
+            col1,  # type: ignore[arg-type]
+            col2,  # type: ignore[arg-type]
+            lit(ddof),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasCovar(col1._jc, col2._jc, ddof))
+
+
+def repeat(col: Column, n: Union[int, Column]) -> Column:
+    """
+    Repeats a string column n times, and returns it as a new string column.
+    """
+    _n = F.lit(n) if isinstance(n, int) else n
+    return F.call_udf("repeat", col, _n)
+
+
+def ewm(col: Column, alpha: float, ignore_na: bool) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "ewm",
+            col,  # type: ignore[arg-type]
+            lit(alpha),
+            lit(ignore_na),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.ewm(col._jc, alpha, ignore_na))
+
+
+def last_non_null(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "last_non_null",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.lastNonNull(col._jc))
+
+
+def null_index(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "null_index",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.nullIndex(col._jc))
+
+
+def timestampdiff(unit: str, start: Column, end: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "timestampdiff",
+            lit(unit),
+            start,  # type: ignore[arg-type]
+            end,  # type: ignore[arg-type]
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.timestampDiff(unit, start._jc, end._jc))
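For context, a rough usage sketch (not part of the diff): these vendored helpers mirror upstream pyspark.pandas.spark.functions, which backs pandas-on-Spark reductions such as Series.prod(), Series.std() and Series.skew(). Assuming a working Spark (or Spark Connect) session:

import pyspark.pandas as ps

psser = ps.Series([1.0, 2.0, 3.0, None])
print(psser.prod())       # backed by product(col, dropna) above
print(psser.std(ddof=1))  # backed by stddev(col, ddof) above
print(psser.skew())       # backed by skew(col) above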
snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py
@@ -0,0 +1,202 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Helpers and utilities to deal with PySpark instances
+"""
+from typing import overload
+
+from pyspark.sql.types import DecimalType, StructType, MapType, ArrayType, StructField, DataType
+
+
+@overload
+def as_nullable_spark_type(dt: StructType) -> StructType:
+    ...
+
+
+@overload
+def as_nullable_spark_type(dt: ArrayType) -> ArrayType:
+    ...
+
+
+@overload
+def as_nullable_spark_type(dt: MapType) -> MapType:
+    ...
+
+
+@overload
+def as_nullable_spark_type(dt: DataType) -> DataType:
+    ...
+
+
+def as_nullable_spark_type(dt: DataType) -> DataType:
+    """
+    Returns a nullable schema or data types.
+
+    Examples
+    --------
+    >>> from pyspark.sql.types import *
+    >>> as_nullable_spark_type(StructType([
+    ...     StructField("A", IntegerType(), True),
+    ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A', IntegerType(), True), StructField('B', FloatType(), True)])
+
+    >>> as_nullable_spark_type(StructType([
+    ...     StructField("A",
+    ...         StructType([
+    ...             StructField('a',
+    ...                 MapType(IntegerType(),
+    ...                     ArrayType(IntegerType(), False), False), False),
+    ...             StructField('b', StringType(), True)])),
+    ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A',
+        StructType([StructField('a',
+            MapType(IntegerType(),
+                ArrayType(IntegerType(), True), True), True),
+            StructField('b', StringType(), True)]), True),
+        StructField('B', FloatType(), True)])
+    """
+    if isinstance(dt, StructType):
+        new_fields = []
+        for field in dt.fields:
+            new_fields.append(
+                StructField(
+                    field.name,
+                    as_nullable_spark_type(field.dataType),
+                    nullable=True,
+                    metadata=field.metadata,
+                )
+            )
+        return StructType(new_fields)
+    elif isinstance(dt, ArrayType):
+        return ArrayType(as_nullable_spark_type(dt.elementType), containsNull=True)
+    elif isinstance(dt, MapType):
+        return MapType(
+            as_nullable_spark_type(dt.keyType),
+            as_nullable_spark_type(dt.valueType),
+            valueContainsNull=True,
+        )
+    else:
+        return dt
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: StructType, *, precision: int = ..., scale: int = ...
+) -> StructType:
+    ...
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: ArrayType, *, precision: int = ..., scale: int = ...
+) -> ArrayType:
+    ...
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: MapType, *, precision: int = ..., scale: int = ...
+) -> MapType:
+    ...
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: DataType, *, precision: int = ..., scale: int = ...
+) -> DataType:
+    ...
+
+
+def force_decimal_precision_scale(
+    dt: DataType, *, precision: int = 38, scale: int = 18
+) -> DataType:
+    """
+    Returns a data type with a fixed decimal type.
+
+    The precision and scale of the decimal type are fixed with the given values.
+
+    Examples
+    --------
+    >>> from pyspark.sql.types import *
+    >>> force_decimal_precision_scale(StructType([
+    ...     StructField("A", DecimalType(10, 0), True),
+    ...     StructField("B", DecimalType(14, 7), False)]))  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A', DecimalType(38,18), True),
+        StructField('B', DecimalType(38,18), False)])
+
+    >>> force_decimal_precision_scale(StructType([
+    ...     StructField("A",
+    ...         StructType([
+    ...             StructField('a',
+    ...                 MapType(DecimalType(5, 0),
+    ...                     ArrayType(DecimalType(20, 0), False), False), False),
+    ...             StructField('b', StringType(), True)])),
+    ...     StructField("B", DecimalType(30, 15), False)]),
+    ...     precision=30, scale=15)  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A',
+        StructType([StructField('a',
+            MapType(DecimalType(30,15),
+                ArrayType(DecimalType(30,15), False), False), False),
+            StructField('b', StringType(), True)]), True),
+        StructField('B', DecimalType(30,15), False)])
+    """
+    if isinstance(dt, StructType):
+        new_fields = []
+        for field in dt.fields:
+            new_fields.append(
+                StructField(
+                    field.name,
+                    force_decimal_precision_scale(field.dataType, precision=precision, scale=scale),
+                    nullable=field.nullable,
+                    metadata=field.metadata,
+                )
+            )
+        return StructType(new_fields)
+    elif isinstance(dt, ArrayType):
+        return ArrayType(
+            force_decimal_precision_scale(dt.elementType, precision=precision, scale=scale),
+            containsNull=dt.containsNull,
+        )
+    elif isinstance(dt, MapType):
+        return MapType(
+            force_decimal_precision_scale(dt.keyType, precision=precision, scale=scale),
+            force_decimal_precision_scale(dt.valueType, precision=precision, scale=scale),
+            valueContainsNull=dt.valueContainsNull,
+        )
+    elif isinstance(dt, DecimalType):
+        return DecimalType(precision=precision, scale=scale)
+    else:
+        return dt
+
+
+def _test() -> None:
+    import doctest
+    import sys
+    import pyspark.pandas.spark.utils
+
+    globs = pyspark.pandas.spark.utils.__dict__.copy()
+    (failure_count, test_count) = doctest.testmod(
+        pyspark.pandas.spark.utils,
+        globs=globs,
+        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+    )
+    if failure_count:
+        sys.exit(-1)
+
+
+if __name__ == "__main__":
+    _test()
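For context, a small usage sketch (not part of the diff) based on the doctests above; the import path follows the vendored module layout:

from pyspark.pandas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale
from pyspark.sql.types import ArrayType, DecimalType, IntegerType, StructField, StructType

schema = StructType([
    StructField("a", DecimalType(10, 2), nullable=False),
    StructField("b", ArrayType(IntegerType(), containsNull=False), nullable=False),
])

# Every field and nested element/value becomes nullable.
print(as_nullable_spark_type(schema))

# Every DecimalType is coerced to the given precision/scale (defaults: 38, 18).
print(force_decimal_precision_scale(schema, precision=30, scale=15))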
snowflake/snowpark_connect/relation/map_column_ops.py
@@ -6,12 +6,10 @@ import ast
 import json
 import sys
 from collections import defaultdict
-from copy import copy
 
 import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 import pyspark.sql.connect.proto.types_pb2 as types_proto
-from pyspark.errors import PySparkValueError
 from pyspark.errors.exceptions.base import AnalysisException
 from pyspark.serializers import CloudPickleSerializer
 
@@ -46,7 +44,6 @@ from snowflake.snowpark_connect.expression.typer import ExpressionTyper
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 from snowflake.snowpark_connect.relation.utils import (
     TYPE_MAP_FOR_TO_SCHEMA,
-    can_sort_be_flattened,
     snowpark_functions_col,
 )
 from snowflake.snowpark_connect.type_mapping import (
@@ -346,12 +343,6 @@ def map_sort(
 
     sort_order = sort.order
 
-    if not sort_order:
-        raise PySparkValueError(
-            error_class="CANNOT_BE_EMPTY",
-            message="At least one column must be specified.",
-        )
-
     if len(sort_order) == 1:
         parsed_col_name = split_fully_qualified_spark_name(
             sort_order[0].child.unresolved_attribute.unparsed_identifier
@@ -433,29 +424,7 @@ def map_sort(
     if not order_specified:
         ascending = None
 
-
-    sort_expressions = [c._expression for c in cols]
-    if (
-        can_sort_be_flattened(select_statement, *sort_expressions)
-        and input_df._ops_after_agg is None
-    ):
-        # "flattened" order by that will allow using dropped columns
-        new = copy(select_statement)
-        new.from_ = select_statement.from_.to_subqueryable()
-        new.pre_actions = new.from_.pre_actions
-        new.post_actions = new.from_.post_actions
-        new.order_by = sort_expressions + (select_statement.order_by or [])
-        new.column_states = select_statement.column_states
-        new._merge_projection_complexity_with_subquery = False
-        new.df_ast_ids = (
-            select_statement.df_ast_ids.copy()
-            if select_statement.df_ast_ids is not None
-            else None
-        )
-        new.attributes = select_statement.attributes
-        result = input_df._with_plan(new)
-    else:
-        result = input_df.sort(cols, ascending=ascending)
+    result = input_df.sort(cols, ascending=ascending)
 
     return DataFrameContainer(
         result,
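Net effect of the hunks above: map_sort no longer rewrites the underlying SELECT for "flattenable" sorts and no longer raises PySparkValueError for an empty sort order here; every orderBy now goes through Snowpark's DataFrame.sort. An illustrative client-side sketch (assuming a Spark Connect session served by snowpark-connect; data is made up):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(2, "b"), (1, "a")], ["id", "name"])

# Both forms now follow the single `result = input_df.sort(cols, ascending=ascending)`
# path in map_sort; `ascending` stays None when no explicit direction is given.
df.orderBy("id").show()
df.orderBy(df.id.desc()).show()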
snowflake/snowpark_connect/relation/map_extension.py
@@ -347,6 +347,13 @@ def map_aggregate(
     raw_groupings: list[tuple[str, TypedColumn]] = []
     raw_aggregations: list[tuple[str, TypedColumn]] = []
 
+    if not is_group_by_all:
+        raw_groupings = [_map_column(exp) for exp in aggregate.grouping_expressions]
+
+        # Set the current grouping columns in context for grouping_id() function
+        grouping_spark_columns = [spark_name for spark_name, _ in raw_groupings]
+        set_current_grouping_columns(grouping_spark_columns)
+
     agg_count = get_sql_aggregate_function_count()
     for exp in aggregate.aggregate_expressions:
         col = _map_column(exp)
@@ -359,13 +366,6 @@ def map_aggregate(
         else:
             agg_count = new_agg_count
 
-    if not is_group_by_all:
-        raw_groupings = [_map_column(exp) for exp in aggregate.grouping_expressions]
-
-        # Set the current grouping columns in context for grouping_id() function
-        grouping_spark_columns = [spark_name for spark_name, _ in raw_groupings]
-        set_current_grouping_columns(grouping_spark_columns)
-
     # Now create column name lists and assign aliases.
     # In case of GROUP BY ALL, even though groupings are a subset of aggregations,
     # they will have their own aliases so we can drop them later.
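The change above is a reordering: grouping expressions are now mapped, and registered via set_current_grouping_columns, before the aggregate expressions are mapped, so a grouping_id() in the aggregate list can resolve against the current grouping columns. An illustrative PySpark query of that shape (made-up data):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("eng", "dev", 100), ("eng", "mgr", 150), ("hr", "mgr", 120)],
    ["dept", "role", "salary"],
)

# grouping_id() in the aggregate list depends on the grouping columns
# ("dept", "role") being known when the aggregate expressions are mapped.
df.cube("dept", "role").agg(
    F.grouping_id().alias("gid"),
    F.sum("salary").alias("total"),
).show()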
snowflake/snowpark_connect/relation/map_row_ops.py
@@ -1,7 +1,7 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
-
+
 
 import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
@@ -9,7 +9,6 @@ from pyspark.errors.exceptions.base import AnalysisException, IllegalArgumentExc
 
 import snowflake.snowpark_connect.relation.utils as utils
 from snowflake import snowpark
-from snowflake.snowpark._internal.analyzer.binary_expression import And
 from snowflake.snowpark.functions import col, expr as snowpark_expr
 from snowflake.snowpark.types import (
     BooleanType,
@@ -31,7 +30,6 @@ from snowflake.snowpark_connect.expression.map_expression import (
 )
 from snowflake.snowpark_connect.expression.typer import ExpressionTyper
 from snowflake.snowpark_connect.relation.map_relation import map_relation
-from snowflake.snowpark_connect.relation.utils import can_filter_be_flattened
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
@@ -555,32 +553,7 @@ def map_filter(
         rel.filter.condition, input_container.column_map, typer
     )
 
-
-    condition_exp = condition.col._expression
-    if (
-        can_filter_be_flattened(select_statement, condition_exp)
-        and input_df._ops_after_agg is None
-    ):
-        new = copy(select_statement)
-        new.from_ = select_statement.from_.to_subqueryable()
-        new.pre_actions = new.from_.pre_actions
-        new.post_actions = new.from_.post_actions
-        new.column_states = select_statement.column_states
-        new.where = (
-            And(select_statement.where, condition_exp)
-            if select_statement.where is not None
-            else condition_exp
-        )
-        new._merge_projection_complexity_with_subquery = False
-        new.df_ast_ids = (
-            select_statement.df_ast_ids.copy()
-            if select_statement.df_ast_ids is not None
-            else None
-        )
-        new.attributes = select_statement.attributes
-        result = input_df._with_plan(new)
-    else:
-        result = input_df.filter(condition.col)
+    result = input_df.filter(condition.col)
 
     return DataFrameContainer(
         result,
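Mirroring the map_sort change, map_filter drops the SELECT-flattening rewrite (including merging the condition into an existing WHERE via And) and always delegates to input_df.filter(condition.col). An illustrative client-side sketch (made-up data):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "name"])

# Both spellings are mapped through the same `input_df.filter(condition.col)` call.
df.filter(df.id > 1).show()
df.where("name = 'b'").show()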
snowflake/snowpark_connect/relation/read/utils.py
@@ -73,13 +73,12 @@ def rename_columns_as_snowflake_standard(
         return df, []
 
     new_columns = make_column_names_snowpark_compatible(df.columns, plan_id)
-
-
-
-
-
-
-    return (result, new_columns)
+    return (
+        df.select(
+            *(df.col(orig).alias(alias) for orig, alias in zip(df.columns, new_columns))
+        ),
+        new_columns,
+    )
 
 
 class Connection(Protocol):