snowpark-connect 0.22.1__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Potentially problematic release.
This version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/config.py +0 -11
- snowflake/snowpark_connect/error/error_utils.py +7 -0
- snowflake/snowpark_connect/error/exceptions.py +4 -0
- snowflake/snowpark_connect/expression/function_defaults.py +207 -0
- snowflake/snowpark_connect/expression/hybrid_column_map.py +192 -0
- snowflake/snowpark_connect/expression/literal.py +14 -12
- snowflake/snowpark_connect/expression/map_cast.py +20 -4
- snowflake/snowpark_connect/expression/map_expression.py +18 -2
- snowflake/snowpark_connect/expression/map_extension.py +12 -2
- snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +32 -5
- snowflake/snowpark_connect/expression/map_unresolved_function.py +69 -10
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/__init__.py +16 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/accessors.py +1281 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py +203 -0
- snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py +202 -0
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +8 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +4 -2
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +127 -21
- snowflake/snowpark_connect/relation/map_aggregate.py +57 -5
- snowflake/snowpark_connect/relation/map_column_ops.py +6 -5
- snowflake/snowpark_connect/relation/map_extension.py +65 -31
- snowflake/snowpark_connect/relation/map_local_relation.py +8 -1
- snowflake/snowpark_connect/relation/map_row_ops.py +2 -0
- snowflake/snowpark_connect/relation/map_sql.py +22 -5
- snowflake/snowpark_connect/relation/read/map_read.py +2 -1
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +8 -1
- snowflake/snowpark_connect/relation/read/reader_config.py +9 -0
- snowflake/snowpark_connect/relation/write/map_write.py +243 -68
- snowflake/snowpark_connect/server.py +25 -5
- snowflake/snowpark_connect/type_mapping.py +2 -2
- snowflake/snowpark_connect/utils/env_utils.py +55 -0
- snowflake/snowpark_connect/utils/session.py +21 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/spark_decoder.py +1 -1
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.24.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.24.0.dist-info}/RECORD +44 -39
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +0 -4
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +0 -4
- {snowpark_connect-0.22.1.data → snowpark_connect-0.24.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.22.1.data → snowpark_connect-0.24.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.22.1.data → snowpark_connect-0.24.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.24.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.24.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.24.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.24.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.24.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/functions.py
@@ -0,0 +1,203 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Additional Spark functions used in pandas-on-Spark.
+"""
+from typing import Union
+
+from pyspark import SparkContext
+import pyspark.sql.functions as F
+from pyspark.sql.column import Column
+
+# For supporting Spark Connect
+from pyspark.sql.utils import is_remote
+
+
+def product(col: Column, dropna: bool) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_product",
+            col,  # type: ignore[arg-type]
+            lit(dropna),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasProduct(col._jc, dropna))
+
+
+def stddev(col: Column, ddof: int) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_stddev",
+            col,  # type: ignore[arg-type]
+            lit(ddof),
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasStddev(col._jc, ddof))
+
+
+def var(col: Column, ddof: int) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_var",
+            col,  # type: ignore[arg-type]
+            lit(ddof),
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasVariance(col._jc, ddof))
+
+
+def skew(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_skew",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasSkewness(col._jc))
+
+
+def kurt(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_kurt",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasKurtosis(col._jc))
+
+
+def mode(col: Column, dropna: bool) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_mode",
+            col,  # type: ignore[arg-type]
+            lit(dropna),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasMode(col._jc, dropna))
+
+
+def covar(col1: Column, col2: Column, ddof: int) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "pandas_covar",
+            col1,  # type: ignore[arg-type]
+            col2,  # type: ignore[arg-type]
+            lit(ddof),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.pandasCovar(col1._jc, col2._jc, ddof))
+
+
+def repeat(col: Column, n: Union[int, Column]) -> Column:
+    """
+    Repeats a string column n times, and returns it as a new string column.
+    """
+    _n = F.lit(n) if isinstance(n, int) else n
+    return F.call_udf("repeat", col, _n)
+
+
+def ewm(col: Column, alpha: float, ignore_na: bool) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "ewm",
+            col,  # type: ignore[arg-type]
+            lit(alpha),
+            lit(ignore_na),
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.ewm(col._jc, alpha, ignore_na))
+
+
+def last_non_null(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "last_non_null",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.lastNonNull(col._jc))
+
+
+def null_index(col: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "null_index",
+            col,  # type: ignore[arg-type]
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.nullIndex(col._jc))
+
+
+def timestampdiff(unit: str, start: Column, end: Column) -> Column:
+    if is_remote():
+        from pyspark.sql.connect.functions import _invoke_function_over_columns, lit
+
+        return _invoke_function_over_columns(  # type: ignore[return-value]
+            "timestampdiff",
+            lit(unit),
+            start,  # type: ignore[arg-type]
+            end,  # type: ignore[arg-type]
+        )
+
+    else:
+        sc = SparkContext._active_spark_context
+        return Column(sc._jvm.PythonSQLUtils.timestampDiff(unit, start._jc, end._jc))
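Each helper above returns a `Column` and dispatches on `is_remote()`: under Spark Connect it resolves through `_invoke_function_over_columns` with a registered function name such as `"pandas_stddev"`, otherwise it calls the JVM `PythonSQLUtils` entry point. A minimal usage sketch, assuming an active Spark session (the DataFrame and column names here are illustrative, not taken from the diff):

```python
from pyspark.sql import SparkSession
from pyspark.pandas.spark import functions as SF  # the module shown above

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (4.0,)], ["x"])

# stddev()/var() take a pandas-style ddof; the wrapper picks the Connect or
# JVM code path automatically based on is_remote().
df.agg(
    SF.stddev(df["x"], ddof=1).alias("stddev_sample"),
    SF.var(df["x"], ddof=0).alias("var_population"),
).show()
```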
snowflake/snowpark_connect/includes/python/pyspark/pandas/spark/utils.py
@@ -0,0 +1,202 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Helpers and utilities to deal with PySpark instances
+"""
+from typing import overload
+
+from pyspark.sql.types import DecimalType, StructType, MapType, ArrayType, StructField, DataType
+
+
+@overload
+def as_nullable_spark_type(dt: StructType) -> StructType:
+    ...
+
+
+@overload
+def as_nullable_spark_type(dt: ArrayType) -> ArrayType:
+    ...
+
+
+@overload
+def as_nullable_spark_type(dt: MapType) -> MapType:
+    ...
+
+
+@overload
+def as_nullable_spark_type(dt: DataType) -> DataType:
+    ...
+
+
+def as_nullable_spark_type(dt: DataType) -> DataType:
+    """
+    Returns a nullable schema or data types.
+
+    Examples
+    --------
+    >>> from pyspark.sql.types import *
+    >>> as_nullable_spark_type(StructType([
+    ...     StructField("A", IntegerType(), True),
+    ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A', IntegerType(), True), StructField('B', FloatType(), True)])
+
+    >>> as_nullable_spark_type(StructType([
+    ...     StructField("A",
+    ...         StructType([
+    ...             StructField('a',
+    ...                 MapType(IntegerType(),
+    ...                 ArrayType(IntegerType(), False), False), False),
+    ...             StructField('b', StringType(), True)])),
+    ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A',
+    StructType([StructField('a',
+    MapType(IntegerType(),
+    ArrayType(IntegerType(), True), True), True),
+    StructField('b', StringType(), True)]), True),
+    StructField('B', FloatType(), True)])
+    """
+    if isinstance(dt, StructType):
+        new_fields = []
+        for field in dt.fields:
+            new_fields.append(
+                StructField(
+                    field.name,
+                    as_nullable_spark_type(field.dataType),
+                    nullable=True,
+                    metadata=field.metadata,
+                )
+            )
+        return StructType(new_fields)
+    elif isinstance(dt, ArrayType):
+        return ArrayType(as_nullable_spark_type(dt.elementType), containsNull=True)
+    elif isinstance(dt, MapType):
+        return MapType(
+            as_nullable_spark_type(dt.keyType),
+            as_nullable_spark_type(dt.valueType),
+            valueContainsNull=True,
+        )
+    else:
+        return dt
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: StructType, *, precision: int = ..., scale: int = ...
+) -> StructType:
+    ...
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: ArrayType, *, precision: int = ..., scale: int = ...
+) -> ArrayType:
+    ...
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: MapType, *, precision: int = ..., scale: int = ...
+) -> MapType:
+    ...
+
+
+@overload
+def force_decimal_precision_scale(
+    dt: DataType, *, precision: int = ..., scale: int = ...
+) -> DataType:
+    ...
+
+
+def force_decimal_precision_scale(
+    dt: DataType, *, precision: int = 38, scale: int = 18
+) -> DataType:
+    """
+    Returns a data type with a fixed decimal type.
+
+    The precision and scale of the decimal type are fixed with the given values.
+
+    Examples
+    --------
+    >>> from pyspark.sql.types import *
+    >>> force_decimal_precision_scale(StructType([
+    ...     StructField("A", DecimalType(10, 0), True),
+    ...     StructField("B", DecimalType(14, 7), False)]))  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A', DecimalType(38,18), True),
+    StructField('B', DecimalType(38,18), False)])
+
+    >>> force_decimal_precision_scale(StructType([
+    ...     StructField("A",
+    ...         StructType([
+    ...             StructField('a',
+    ...                 MapType(DecimalType(5, 0),
+    ...                 ArrayType(DecimalType(20, 0), False), False), False),
+    ...             StructField('b', StringType(), True)])),
+    ...     StructField("B", DecimalType(30, 15), False)]),
+    ...     precision=30, scale=15)  # doctest: +NORMALIZE_WHITESPACE
+    StructType([StructField('A',
+    StructType([StructField('a',
+    MapType(DecimalType(30,15),
+    ArrayType(DecimalType(30,15), False), False), False),
+    StructField('b', StringType(), True)]), True),
+    StructField('B', DecimalType(30,15), False)])
+    """
+    if isinstance(dt, StructType):
+        new_fields = []
+        for field in dt.fields:
+            new_fields.append(
+                StructField(
+                    field.name,
+                    force_decimal_precision_scale(field.dataType, precision=precision, scale=scale),
+                    nullable=field.nullable,
+                    metadata=field.metadata,
+                )
+            )
+        return StructType(new_fields)
+    elif isinstance(dt, ArrayType):
+        return ArrayType(
+            force_decimal_precision_scale(dt.elementType, precision=precision, scale=scale),
+            containsNull=dt.containsNull,
+        )
+    elif isinstance(dt, MapType):
+        return MapType(
+            force_decimal_precision_scale(dt.keyType, precision=precision, scale=scale),
+            force_decimal_precision_scale(dt.valueType, precision=precision, scale=scale),
+            valueContainsNull=dt.valueContainsNull,
+        )
+    elif isinstance(dt, DecimalType):
+        return DecimalType(precision=precision, scale=scale)
+    else:
+        return dt
+
+
+def _test() -> None:
+    import doctest
+    import sys
+    import pyspark.pandas.spark.utils
+
+    globs = pyspark.pandas.spark.utils.__dict__.copy()
+    (failure_count, test_count) = doctest.testmod(
+        pyspark.pandas.spark.utils,
+        globs=globs,
+        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
+    )
+    if failure_count:
+        sys.exit(-1)
+
+
+if __name__ == "__main__":
+    _test()
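The two schema helpers are typically composed: relax nullability first, then pin every `DecimalType` to one precision/scale so that schemas produced by different code paths compare equal. A small sketch using only the functions defined above (the example schema is illustrative):

```python
from pyspark.sql.types import DecimalType, LongType, StructField, StructType
from pyspark.pandas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale

schema = StructType([
    StructField("amount", DecimalType(10, 2), nullable=False),
    StructField("count", LongType(), nullable=False),
])

# Every field becomes nullable and every decimal becomes DecimalType(38, 18),
# the defaults used by force_decimal_precision_scale.
normalized = force_decimal_precision_scale(as_nullable_spark_type(schema))
print(normalized)
```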
snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py
@@ -16,7 +16,7 @@ from pyspark.sql.connect.proto import relations_pb2 as spark_dot_connect_dot_rel
 from pyspark.sql.connect.proto import expressions_pb2 as spark_dot_connect_dot_expressions__pb2
 
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1csnowflake_relation_ext.proto\x12\rsnowflake.ext\x1a\x1dspark/connect/relations.proto\x1a\x1fspark/connect/expressions.proto\"\xe3\x02\n\tExtension\x12(\n\x07rdd_map\x18\x01 \x01(\x0b\x32\x15.snowflake.ext.RddMapH\x00\x12.\n\nrdd_reduce\x18\x02 \x01(\x0b\x32\x18.snowflake.ext.RddReduceH\x00\x12G\n\x17subquery_column_aliases\x18\x03 \x01(\x0b\x32$.snowflake.ext.SubqueryColumnAliasesH\x00\x12\x32\n\x0clateral_join\x18\x04 \x01(\x0b\x32\x1a.snowflake.ext.LateralJoinH\x00\x12J\n\x19udtf_with_table_arguments\x18\x05 \x01(\x0b\x32%.snowflake.ext.UDTFWithTableArgumentsH\x00\x12-\n\taggregate\x18\x06 \x01(\x0b\x32\x18.snowflake.ext.AggregateH\x00\x42\x04\n\x02op\">\n\x06RddMap\x12&\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12\x0c\n\x04\x66unc\x18\x02 \x01(\x0c\"A\n\tRddReduce\x12&\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12\x0c\n\x04\x66unc\x18\x02 \x01(\x0c\"P\n\x15SubqueryColumnAliases\x12&\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12\x0f\n\x07\x61liases\x18\x02 \x03(\t\"\\\n\x0bLateralJoin\x12%\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12&\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.Relation\"\x98\x01\n\x16UDTFWithTableArguments\x12\x15\n\rfunction_name\x18\x01 \x01(\t\x12,\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.Expression\x12\x39\n\x0ftable_arguments\x18\x03 \x03(\x0b\x32 .snowflake.ext.TableArgumentInfo\"`\n\x11TableArgumentInfo\x12/\n\x0etable_argument\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12\x1a\n\x12table_argument_idx\x18\x02 \x01(\x05\"\
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x1csnowflake_relation_ext.proto\x12\rsnowflake.ext\x1a\x1dspark/connect/relations.proto\x1a\x1fspark/connect/expressions.proto\"\xe3\x02\n\tExtension\x12(\n\x07rdd_map\x18\x01 \x01(\x0b\x32\x15.snowflake.ext.RddMapH\x00\x12.\n\nrdd_reduce\x18\x02 \x01(\x0b\x32\x18.snowflake.ext.RddReduceH\x00\x12G\n\x17subquery_column_aliases\x18\x03 \x01(\x0b\x32$.snowflake.ext.SubqueryColumnAliasesH\x00\x12\x32\n\x0clateral_join\x18\x04 \x01(\x0b\x32\x1a.snowflake.ext.LateralJoinH\x00\x12J\n\x19udtf_with_table_arguments\x18\x05 \x01(\x0b\x32%.snowflake.ext.UDTFWithTableArgumentsH\x00\x12-\n\taggregate\x18\x06 \x01(\x0b\x32\x18.snowflake.ext.AggregateH\x00\x42\x04\n\x02op\">\n\x06RddMap\x12&\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12\x0c\n\x04\x66unc\x18\x02 \x01(\x0c\"A\n\tRddReduce\x12&\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12\x0c\n\x04\x66unc\x18\x02 \x01(\x0c\"P\n\x15SubqueryColumnAliases\x12&\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12\x0f\n\x07\x61liases\x18\x02 \x03(\t\"\\\n\x0bLateralJoin\x12%\n\x04left\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12&\n\x05right\x18\x02 \x01(\x0b\x32\x17.spark.connect.Relation\"\x98\x01\n\x16UDTFWithTableArguments\x12\x15\n\rfunction_name\x18\x01 \x01(\t\x12,\n\targuments\x18\x02 \x03(\x0b\x32\x19.spark.connect.Expression\x12\x39\n\x0ftable_arguments\x18\x03 \x03(\x0b\x32 .snowflake.ext.TableArgumentInfo\"`\n\x11TableArgumentInfo\x12/\n\x0etable_argument\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12\x1a\n\x12table_argument_idx\x18\x02 \x01(\x05\"\xc7\x05\n\tAggregate\x12&\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.Relation\x12\x36\n\ngroup_type\x18\x02 \x01(\x0e\x32\".snowflake.ext.Aggregate.GroupType\x12\x37\n\x14grouping_expressions\x18\x03 \x03(\x0b\x32\x19.spark.connect.Expression\x12\x38\n\x15\x61ggregate_expressions\x18\x04 \x03(\x0b\x32\x19.spark.connect.Expression\x12-\n\x05pivot\x18\x05 \x01(\x0b\x32\x1e.snowflake.ext.Aggregate.Pivot\x12<\n\rgrouping_sets\x18\x06 \x03(\x0b\x32%.snowflake.ext.Aggregate.GroupingSets\x12\x33\n\x10having_condition\x18\x07 \x01(\x0b\x32\x19.spark.connect.Expression\x1a\x62\n\x05Pivot\x12&\n\x03\x63ol\x18\x01 \x01(\x0b\x32\x19.spark.connect.Expression\x12\x31\n\x06values\x18\x02 \x03(\x0b\x32!.spark.connect.Expression.Literal\x1a?\n\x0cGroupingSets\x12/\n\x0cgrouping_set\x18\x01 \x03(\x0b\x32\x19.spark.connect.Expression\"\x9f\x01\n\tGroupType\x12\x1a\n\x16GROUP_TYPE_UNSPECIFIED\x10\x00\x12\x16\n\x12GROUP_TYPE_GROUPBY\x10\x01\x12\x15\n\x11GROUP_TYPE_ROLLUP\x10\x02\x12\x13\n\x0fGROUP_TYPE_CUBE\x10\x03\x12\x14\n\x10GROUP_TYPE_PIVOT\x10\x04\x12\x1c\n\x18GROUP_TYPE_GROUPING_SETS\x10\x05\x62\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
@@ -38,11 +38,11 @@ if _descriptor._USE_C_DESCRIPTORS == False:
   _globals['_TABLEARGUMENTINFO']._serialized_start=931
   _globals['_TABLEARGUMENTINFO']._serialized_end=1027
   _globals['_AGGREGATE']._serialized_start=1030
-  _globals['_AGGREGATE']._serialized_end=
-  _globals['_AGGREGATE_PIVOT']._serialized_start=
-  _globals['_AGGREGATE_PIVOT']._serialized_end=
-  _globals['_AGGREGATE_GROUPINGSETS']._serialized_start=
-  _globals['_AGGREGATE_GROUPINGSETS']._serialized_end=
-  _globals['_AGGREGATE_GROUPTYPE']._serialized_start=
-  _globals['_AGGREGATE_GROUPTYPE']._serialized_end=
+  _globals['_AGGREGATE']._serialized_end=1741
+  _globals['_AGGREGATE_PIVOT']._serialized_start=1416
+  _globals['_AGGREGATE_PIVOT']._serialized_end=1514
+  _globals['_AGGREGATE_GROUPINGSETS']._serialized_start=1516
+  _globals['_AGGREGATE_GROUPINGSETS']._serialized_end=1579
+  _globals['_AGGREGATE_GROUPTYPE']._serialized_start=1582
+  _globals['_AGGREGATE_GROUPTYPE']._serialized_end=1741
 # @@protoc_insertion_point(module_scope)
snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi
@@ -75,7 +75,7 @@ class TableArgumentInfo(_message.Message):
     def __init__(self, table_argument: _Optional[_Union[_relations_pb2.Relation, _Mapping]] = ..., table_argument_idx: _Optional[int] = ...) -> None: ...
 
 class Aggregate(_message.Message):
-    __slots__ = ("input", "group_type", "grouping_expressions", "aggregate_expressions", "pivot", "grouping_sets")
+    __slots__ = ("input", "group_type", "grouping_expressions", "aggregate_expressions", "pivot", "grouping_sets", "having_condition")
     class GroupType(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
         __slots__ = ()
         GROUP_TYPE_UNSPECIFIED: _ClassVar[Aggregate.GroupType]
@@ -108,10 +108,12 @@ class Aggregate(_message.Message):
     AGGREGATE_EXPRESSIONS_FIELD_NUMBER: _ClassVar[int]
     PIVOT_FIELD_NUMBER: _ClassVar[int]
     GROUPING_SETS_FIELD_NUMBER: _ClassVar[int]
+    HAVING_CONDITION_FIELD_NUMBER: _ClassVar[int]
     input: _relations_pb2.Relation
     group_type: Aggregate.GroupType
     grouping_expressions: _containers.RepeatedCompositeFieldContainer[_expressions_pb2.Expression]
     aggregate_expressions: _containers.RepeatedCompositeFieldContainer[_expressions_pb2.Expression]
     pivot: Aggregate.Pivot
     grouping_sets: _containers.RepeatedCompositeFieldContainer[Aggregate.GroupingSets]
-
+    having_condition: _expressions_pb2.Expression
+    def __init__(self, input: _Optional[_Union[_relations_pb2.Relation, _Mapping]] = ..., group_type: _Optional[_Union[Aggregate.GroupType, str]] = ..., grouping_expressions: _Optional[_Iterable[_Union[_expressions_pb2.Expression, _Mapping]]] = ..., aggregate_expressions: _Optional[_Iterable[_Union[_expressions_pb2.Expression, _Mapping]]] = ..., pivot: _Optional[_Union[Aggregate.Pivot, _Mapping]] = ..., grouping_sets: _Optional[_Iterable[_Union[Aggregate.GroupingSets, _Mapping]]] = ..., having_condition: _Optional[_Union[_expressions_pb2.Expression, _Mapping]] = ...) -> None: ...
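The net effect of the two stub changes above is a new optional `having_condition` field on the `snowflake.ext.Aggregate` extension message. A minimal construction sketch, assuming the generated modules are importable; the comparison expression built here is illustrative only and would normally carry arguments:

```python
from pyspark.sql.connect.proto import expressions_pb2 as expr_pb2
from snowflake.snowpark_connect.proto import snowflake_relation_ext_pb2 as ext_pb2

# Placeholder HAVING predicate; a real client would also populate the
# function's argument expressions.
having = expr_pb2.Expression()
having.unresolved_function.function_name = ">"

agg = ext_pb2.Aggregate(
    group_type=ext_pb2.Aggregate.GROUP_TYPE_GROUPBY,
    having_condition=having,
)
print(agg.HasField("having_condition"))  # True once the field is set
```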
snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py
@@ -8,7 +8,10 @@ import typing
 import pandas
 import pyspark.sql.connect.proto.common_pb2 as common_proto
 import pyspark.sql.connect.proto.types_pb2 as types_proto
-from
+from pyspark.sql.connect.client.core import Retrying
+from snowflake.core.exceptions import APIError, NotFoundError
+from snowflake.core.schema import Schema
+from snowflake.core.table import Table, TableColumn
 
 from snowflake.snowpark import functions
 from snowflake.snowpark._internal.analyzer.analyzer_utils import (
@@ -22,6 +25,7 @@ from snowflake.snowpark_connect.config import (
     global_config,
 )
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.exceptions import MaxRetryExceeded
 from snowflake.snowpark_connect.relation.catalogs.abstract_spark_catalog import (
     AbstractSparkCatalog,
     _get_current_snowflake_schema,
@@ -39,6 +43,37 @@ from snowflake.snowpark_connect.utils.telemetry import (
 from snowflake.snowpark_connect.utils.udf_cache import cached_udf
 
 
+def _is_retryable_api_error(e: Exception) -> bool:
+    """
+    Determine if an APIError should be retried.
+
+    Only retry on server errors, rate limiting, and transient network issues.
+    Don't retry on client errors like authentication, authorization, or validation failures.
+    """
+    if not isinstance(e, APIError):
+        return False
+
+    # Check if the error has a status_code attribute
+    if hasattr(e, "status_code"):
+        # Retry on server errors (5xx), rate limiting (429), and some client errors (400)
+        # 400 can be transient in some cases (like the original error trace shows)
+        return e.status_code in [400, 429, 500, 502, 503, 504]
+
+    # For APIErrors without explicit status codes, check the message
+    error_msg = str(e).lower()
+    retryable_patterns = [
+        "timeout",
+        "connection",
+        "network",
+        "unavailable",
+        "temporary",
+        "rate limit",
+        "throttle",
+    ]
+
+    return any(pattern in error_msg for pattern in retryable_patterns)
+
+
 def _normalize_identifier(identifier: str | None) -> str | None:
     if identifier is None:
         return None
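This predicate is paired with the `Retrying` helper imported from `pyspark.sql.connect.client.core` in every catalog call changed below: the loop yields attempt context managers, backs off between failures, and the caller raises `MaxRetryExceeded` if no attempt produced a result. A condensed sketch of the pattern as it appears in the hunks that follow; `some_catalog_call` is a placeholder for calls such as `sp_catalog.list_schemas(...)`:

```python
from pyspark.sql.connect.client.core import Retrying

result = None
for attempt in Retrying(
    max_retries=5,
    initial_backoff=100,  # 100 ms
    max_backoff=5000,     # 5 s
    backoff_multiplier=2.0,
    jitter=100,
    min_jitter_threshold=200,
    can_retry=_is_retryable_api_error,  # defined in the hunk above
):
    with attempt:
        result = some_catalog_call()  # placeholder
if result is None:
    raise MaxRetryExceeded("Failed after all retry attempts")
```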
@@ -73,10 +108,25 @@ class SnowflakeCatalog(AbstractSparkCatalog):
         )
         sp_catalog = get_or_create_snowpark_session().catalog
 
-        dbs =
-
-
-
+        dbs: list[Schema] | None = None
+        for attempt in Retrying(
+            max_retries=5,
+            initial_backoff=100,  # 100ms
+            max_backoff=5000,  # 5 s
+            backoff_multiplier=2.0,
+            jitter=100,
+            min_jitter_threshold=200,
+            can_retry=_is_retryable_api_error,
+        ):
+            with attempt:
+                dbs = sp_catalog.list_schemas(
+                    database=sf_quote(sf_database),
+                    pattern=_normalize_identifier(sf_schema),
+                )
+        if dbs is None:
+            raise MaxRetryExceeded(
+                f"Failed to fetch databases {f'with pattern {pattern} ' if pattern is not None else ''}after all retry attempts"
+            )
         names: list[str] = list()
         catalogs: list[str] = list()
         descriptions: list[str | None] = list()
@@ -112,9 +162,24 @@ class SnowflakeCatalog(AbstractSparkCatalog):
         )
         sp_catalog = get_or_create_snowpark_session().catalog
 
-        db =
-
-
+        db: Schema | None = None
+        for attempt in Retrying(
+            max_retries=5,
+            initial_backoff=100,  # 100ms
+            max_backoff=5000,  # 5 s
+            backoff_multiplier=2.0,
+            jitter=100,
+            min_jitter_threshold=200,
+            can_retry=_is_retryable_api_error,
+        ):
+            with attempt:
+                db = sp_catalog.get_schema(
+                    schema=sf_quote(sf_schema), database=sf_quote(sf_database)
+                )
+        if db is None:
+            raise MaxRetryExceeded(
+                f"Failed to fetch database {spark_dbName} after all retry attempts"
+            )
 
         name = unquote_if_quoted(db.name)
         return pandas.DataFrame(
@@ -241,11 +306,27 @@ class SnowflakeCatalog(AbstractSparkCatalog):
                 "Calling into another catalog is not currently supported"
             )
 
-        table =
-
-
-
-
+        table: Table | None = None
+        for attempt in Retrying(
+            max_retries=5,
+            initial_backoff=100,  # 100ms
+            max_backoff=5000,  # 5 s
+            backoff_multiplier=2.0,
+            jitter=100,
+            min_jitter_threshold=200,
+            can_retry=_is_retryable_api_error,
+        ):
+            with attempt:
+                table = sp_catalog.get_table(
+                    database=sf_quote(sf_database),
+                    schema=sf_quote(sf_schema),
+                    table_name=sf_quote(table_name),
+                )
+
+        if table is None:
+            raise MaxRetryExceeded(
+                f"Failed to fetch table {spark_tableName} after all retry attempts"
+            )
 
         return pandas.DataFrame(
             {
@@ -286,6 +367,7 @@ class SnowflakeCatalog(AbstractSparkCatalog):
     ) -> pandas.DataFrame:
         """List all columns in a table/view, optionally database name filter can be provided."""
         sp_catalog = get_or_create_snowpark_session().catalog
+        columns: list[TableColumn] | None = None
     if spark_dbName is None:
         catalog, sf_database, sf_schema, sf_table = _process_multi_layer_identifier(
             spark_tableName
@@ -294,15 +376,39 @@ class SnowflakeCatalog(AbstractSparkCatalog):
             raise SnowparkConnectNotImplementedError(
                 "Calling into another catalog is not currently supported"
             )
-
-
-
-
-
+            for attempt in Retrying(
+                max_retries=5,
+                initial_backoff=100,  # 100ms
+                max_backoff=5000,  # 5 s
+                backoff_multiplier=2.0,
+                jitter=100,
+                min_jitter_threshold=200,
+                can_retry=_is_retryable_api_error,
+            ):
+                with attempt:
+                    columns = sp_catalog.list_columns(
+                        database=sf_quote(sf_database),
+                        schema=sf_quote(sf_schema),
+                        table_name=sf_quote(sf_table),
+                    )
         else:
-
-
-
+            for attempt in Retrying(
+                max_retries=5,
+                initial_backoff=100,  # 100ms
+                max_backoff=5000,  # 5 s
+                backoff_multiplier=2.0,
+                jitter=100,
+                min_jitter_threshold=200,
+                can_retry=_is_retryable_api_error,
+            ):
+                with attempt:
+                    columns = sp_catalog.list_columns(
+                        schema=sf_quote(spark_dbName),
+                        table_name=sf_quote(spark_tableName),
+                    )
+        if columns is None:
+            raise MaxRetryExceeded(
+                f"Failed to fetch columns of {spark_tableName} after all retry attempts"
             )
         names: list[str] = list()
         descriptions: list[str | None] = list()