snowpark-connect 0.28.0__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff compares the contents of publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Note: the diff service flags this version of snowpark-connect as potentially problematic.

Files changed (36)
  1. snowflake/snowpark_connect/config.py +12 -3
  2. snowflake/snowpark_connect/execute_plan/map_execution_command.py +31 -68
  3. snowflake/snowpark_connect/expression/map_unresolved_function.py +172 -210
  4. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +207 -20
  5. snowflake/snowpark_connect/relation/io_utils.py +21 -1
  6. snowflake/snowpark_connect/relation/map_extension.py +21 -4
  7. snowflake/snowpark_connect/relation/map_map_partitions.py +7 -8
  8. snowflake/snowpark_connect/relation/map_relation.py +1 -3
  9. snowflake/snowpark_connect/relation/map_sql.py +112 -53
  10. snowflake/snowpark_connect/relation/read/map_read.py +22 -3
  11. snowflake/snowpark_connect/relation/read/map_read_csv.py +105 -26
  12. snowflake/snowpark_connect/relation/read/map_read_json.py +45 -34
  13. snowflake/snowpark_connect/relation/read/map_read_table.py +58 -0
  14. snowflake/snowpark_connect/relation/read/map_read_text.py +6 -1
  15. snowflake/snowpark_connect/relation/stage_locator.py +85 -53
  16. snowflake/snowpark_connect/relation/write/map_write.py +95 -14
  17. snowflake/snowpark_connect/server.py +18 -13
  18. snowflake/snowpark_connect/utils/context.py +21 -14
  19. snowflake/snowpark_connect/utils/identifiers.py +8 -2
  20. snowflake/snowpark_connect/utils/io_utils.py +36 -0
  21. snowflake/snowpark_connect/utils/session.py +3 -0
  22. snowflake/snowpark_connect/utils/temporary_view_cache.py +61 -0
  23. snowflake/snowpark_connect/utils/udf_cache.py +37 -7
  24. snowflake/snowpark_connect/utils/udf_utils.py +9 -8
  25. snowflake/snowpark_connect/utils/udtf_utils.py +3 -2
  26. snowflake/snowpark_connect/version.py +1 -1
  27. {snowpark_connect-0.28.0.dist-info → snowpark_connect-0.29.0.dist-info}/METADATA +3 -2
  28. {snowpark_connect-0.28.0.dist-info → snowpark_connect-0.29.0.dist-info}/RECORD +36 -35
  29. {snowpark_connect-0.28.0.data → snowpark_connect-0.29.0.data}/scripts/snowpark-connect +0 -0
  30. {snowpark_connect-0.28.0.data → snowpark_connect-0.29.0.data}/scripts/snowpark-session +0 -0
  31. {snowpark_connect-0.28.0.data → snowpark_connect-0.29.0.data}/scripts/snowpark-submit +0 -0
  32. {snowpark_connect-0.28.0.dist-info → snowpark_connect-0.29.0.dist-info}/WHEEL +0 -0
  33. {snowpark_connect-0.28.0.dist-info → snowpark_connect-0.29.0.dist-info}/licenses/LICENSE-binary +0 -0
  34. {snowpark_connect-0.28.0.dist-info → snowpark_connect-0.29.0.dist-info}/licenses/LICENSE.txt +0 -0
  35. {snowpark_connect-0.28.0.dist-info → snowpark_connect-0.29.0.dist-info}/licenses/NOTICE-binary +0 -0
  36. {snowpark_connect-0.28.0.dist-info → snowpark_connect-0.29.0.dist-info}/top_level.txt +0 -0
@@ -13,7 +13,6 @@ from snowflake.core.exceptions import APIError, NotFoundError
  from snowflake.core.schema import Schema
  from snowflake.core.table import Table, TableColumn

- from snowflake.snowpark import functions
  from snowflake.snowpark._internal.analyzer.analyzer_utils import (
      quote_name_without_upper_casing,
      unquote_if_quoted,
@@ -34,12 +33,19 @@ from snowflake.snowpark_connect.relation.catalogs.abstract_spark_catalog import
  )
  from snowflake.snowpark_connect.type_mapping import proto_to_snowpark_type
  from snowflake.snowpark_connect.utils.identifiers import (
+     FQN,
+     spark_to_sf_single_id_with_unquoting,
      split_fully_qualified_spark_name,
  )
  from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
  from snowflake.snowpark_connect.utils.telemetry import (
      SnowparkConnectNotImplementedError,
  )
+ from snowflake.snowpark_connect.utils.temporary_view_cache import (
+     get_temp_view,
+     get_temp_view_normalized_names,
+     unregister_temp_view,
+ )
  from snowflake.snowpark_connect.utils.udf_cache import cached_udf


@@ -203,6 +209,93 @@ class SnowflakeCatalog(AbstractSparkCatalog):
          exists = False
          return pandas.DataFrame({"exists": [exists]})

+     def _get_temp_view_prefixes(self, spark_dbName: str | None) -> list[str]:
+         if spark_dbName is None:
+             return []
+         return [
+             quote_name_without_upper_casing(part)
+             for part in split_fully_qualified_spark_name(spark_dbName)
+         ]
+
+     def _list_temp_views(
+         self,
+         spark_dbName: str | None = None,
+         pattern: str | None = None,
+     ) -> typing.Tuple[
+         list[str | None],
+         list[list[str | None]],
+         list[str],
+         list[str | None],
+         list[str | None],
+         list[bool],
+     ]:
+         catalogs: list[str | None] = list()
+         namespaces: list[list[str | None]] = list()
+         names: list[str] = list()
+         descriptions: list[str | None] = list()
+         table_types: list[str | None] = list()
+         is_temporaries: list[bool] = list()
+
+         temp_views_prefix = ".".join(self._get_temp_view_prefixes(spark_dbName))
+         normalized_spark_dbName = (
+             temp_views_prefix.lower()
+             if global_config.spark_sql_caseSensitive
+             else temp_views_prefix
+         )
+         normalized_global_temp_database_name = (
+             quote_name_without_upper_casing(
+                 global_config.spark_sql_globalTempDatabase.lower()
+             )
+             if global_config.spark_sql_caseSensitive
+             else quote_name_without_upper_casing(
+                 global_config.spark_sql_globalTempDatabase
+             )
+         )
+
+         temp_views = get_temp_view_normalized_names()
+         null_safe_pattern = pattern if pattern is not None else ""
+
+         for temp_view in temp_views:
+             normalized_temp_view = (
+                 temp_view.lower()
+                 if global_config.spark_sql_caseSensitive
+                 else temp_view
+             )
+             fqn = FQN.from_string(temp_view)
+             normalized_schema = (
+                 fqn.schema.lower()
+                 if fqn.schema is not None and global_config.spark_sql_caseSensitive
+                 else fqn.schema
+             )
+
+             is_global_view = normalized_global_temp_database_name == normalized_schema
+             is_local_temp_view = fqn.schema is None
+             # Temporary views are always shown if they match the pattern
+             matches_prefix = (
+                 normalized_spark_dbName == normalized_schema or is_local_temp_view
+             )
+             if matches_prefix and bool(
+                 re.match(null_safe_pattern, normalized_temp_view)
+             ):
+                 names.append(unquote_if_quoted(fqn.name))
+                 catalogs.append(None)
+                 namespaces.append(
+                     [global_config.spark_sql_globalTempDatabase]
+                     if is_global_view
+                     else []
+                 )
+                 descriptions.append(None)
+                 table_types.append("TEMPORARY")
+                 is_temporaries.append(True)
+         return (
+             catalogs,
+             namespaces,
+             names,
+             descriptions,
+             table_types,
+             is_temporaries,
+         )
+
      def listTables(
          self,
          spark_dbName: str | None = None,
@@ -232,8 +325,7 @@ class SnowflakeCatalog(AbstractSparkCatalog):
              schema=sf_quote(sf_schema),
              pattern=_normalize_identifier(pattern),
          )
-         names: list[str] = list()
-         catalogs: list[str] = list()
+         catalogs: list[str | None] = list()
          namespaces: list[list[str | None]] = list()
          names: list[str] = list()
          descriptions: list[str | None] = list()
@@ -253,6 +345,22 @@ class SnowflakeCatalog(AbstractSparkCatalog):
              descriptions.append(o[6] if o[6] else None)
              table_types.append("PERMANENT")
              is_temporaries.append(False)
+
+         (
+             non_materialized_catalogs,
+             non_materialized_namespaces,
+             non_materialized_names,
+             non_materialized_descriptions,
+             non_materialized_table_types,
+             non_materialized_is_temporaries,
+         ) = self._list_temp_views(spark_dbName, pattern)
+         catalogs.extend(non_materialized_catalogs)
+         namespaces.extend(non_materialized_namespaces)
+         names.extend(non_materialized_names)
+         descriptions.extend(non_materialized_descriptions)
+         table_types.extend(non_materialized_table_types)
+         is_temporaries.extend(non_materialized_is_temporaries)
+
          return pandas.DataFrame(
              {
                  "name": names,
@@ -297,6 +405,36 @@ class SnowflakeCatalog(AbstractSparkCatalog):
          spark_tableName: str,
      ) -> pandas.DataFrame:
          """Listing a single table/view with provided name that's accessible in Snowflake."""
+
+         def _get_temp_view():
+             spark_table_name_parts = [
+                 quote_name_without_upper_casing(part)
+                 for part in split_fully_qualified_spark_name(spark_tableName)
+             ]
+             spark_view_name = ".".join(spark_table_name_parts)
+             temp_view = get_temp_view(spark_view_name)
+             if temp_view:
+                 return pandas.DataFrame(
+                     {
+                         "name": [unquote_if_quoted(spark_table_name_parts[-1])],
+                         "catalog": [None],
+                         "namespace": [
+                             [unquote_if_quoted(spark_table_name_parts[-2])]
+                             if len(spark_table_name_parts) > 1
+                             else []
+                         ],
+                         "description": [None],
+                         "tableType": ["TEMPORARY"],
+                         "isTemporary": [True],
+                     }
+                 )
+             return None
+
+         # Attempt to get the view from the non materialized views first
+         temp_view = _get_temp_view()
+         if temp_view is not None:
+             return temp_view
+
          sp_catalog = get_or_create_snowpark_session().catalog
          catalog, sf_database, sf_schema, table_name = _process_multi_layer_identifier(
              spark_tableName
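
The hunks above add an in-memory temporary-view cache to the catalog: _list_temp_views feeds listTables, and the nested _get_temp_view short-circuits the single-table lookup before the Snowflake catalog is queried. A hedged client-side sketch of the intended behaviour (assumes `spark` is a Spark Connect session backed by snowpark-connect; the view names are illustrative only):

spark.range(3).createOrReplaceTempView("my_temp_view")
spark.range(3).createOrReplaceGlobalTempView("my_global_view")

# listTables() should now report the cached views with tableType "TEMPORARY"
# alongside permanent Snowflake tables.
for table in spark.catalog.listTables():
    print(table.name, table.tableType, table.isTemporary)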
@@ -360,12 +498,64 @@ class SnowflakeCatalog(AbstractSparkCatalog):
          exists = False
          return pandas.DataFrame({"exists": [exists]})

+     def _list_temp_view_columns(
+         self,
+         spark_tableName: str,
+         spark_dbName: typing.Optional[str] = None,
+     ):
+         spark_view_name_parts = [
+             quote_name_without_upper_casing(part)
+             for part in split_fully_qualified_spark_name(spark_tableName)
+         ]
+         spark_view_name_parts = (
+             self._get_temp_view_prefixes(spark_dbName) + spark_view_name_parts
+         )
+         spark_view_name = ".".join(spark_view_name_parts)
+         temp_view = get_temp_view(spark_view_name)
+
+         if not temp_view:
+             return None
+
+         names: list[str] = list()
+         descriptions: list[str | None] = list()
+         data_types: list[str] = list()
+         nullables: list[bool] = list()
+         is_partitions: list[bool] = list()
+         is_buckets: list[bool] = list()
+
+         for field, spark_column in zip(
+             temp_view.dataframe.schema.fields,
+             temp_view.column_map.get_spark_columns(),
+         ):
+             names.append(spark_column)
+             descriptions.append(None)
+             data_types.append(field.datatype.simpleString())
+             nullables.append(field.nullable)
+             is_partitions.append(False)
+             is_buckets.append(False)
+
+         return pandas.DataFrame(
+             {
+                 "name": names,
+                 "description": descriptions,
+                 "dataType": data_types,
+                 "nullable": nullables,
+                 "isPartition": is_partitions,
+                 "isBucket": is_buckets,
+             }
+         )
+
      def listColumns(
          self,
          spark_tableName: str,
          spark_dbName: typing.Optional[str] = None,
      ) -> pandas.DataFrame:
          """List all columns in a table/view, optionally database name filter can be provided."""
+
+         temp_view_columns = self._list_temp_view_columns(spark_tableName, spark_dbName)
+         if temp_view_columns is not None:
+             return temp_view_columns
+
          sp_catalog = get_or_create_snowpark_session().catalog
          columns: list[TableColumn] | None = None
          if spark_dbName is None:
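
With _list_temp_view_columns in place, listColumns answers from the temp-view cache (the view's schema fields paired with its recorded Spark column names) before falling back to the Snowflake catalog. A hedged sketch, again assuming a snowpark-connect-backed `spark` session:

spark.createDataFrame([(1, "a")], "id int, name string").createOrReplaceTempView("people")

# Expected to be served from the cached view rather than a catalog query.
for column in spark.catalog.listColumns("people"):
    print(column.name, column.dataType, column.nullable)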
@@ -455,17 +645,15 @@ class SnowflakeCatalog(AbstractSparkCatalog):
          spark_view_name: str,
      ) -> DataFrameContainer:
          session = get_or_create_snowpark_session()
-         schema = global_config.spark_sql_globalTempDatabase
-         result_df = session.sql(
-             "drop view if exists identifier(?)",
-             params=[f"{sf_quote(schema)}.{sf_quote(spark_view_name)}"],
-         )
-         result_df = result_df.select(
-             functions.contains('"status"', functions.lit("successfully dropped")).alias(
-                 "value"
+         if not spark_view_name == "":
+             schema = global_config.spark_sql_globalTempDatabase
+             result = unregister_temp_view(
+                 f"{spark_to_sf_single_id_with_unquoting(schema)}.{spark_to_sf_single_id_with_unquoting(spark_view_name)}"
              )
-         )
+         else:
+             result = False
          columns = ["value"]
+         result_df = session.createDataFrame([result], schema=columns)
          return DataFrameContainer.create_with_column_mapping(
              dataframe=result_df,
              spark_column_names=columns,
@@ -479,15 +667,14 @@ class SnowflakeCatalog(AbstractSparkCatalog):
      ) -> DataFrameContainer:
          """Drop the current temporary view."""
          session = get_or_create_snowpark_session()
-         result = session.sql(
-             "drop view if exists identifier(?)",
-             params=[sf_quote(spark_view_name)],
-         ).collect()
-         view_was_dropped = (
-             len(result) == 1 and "successfully dropped" in result[0]["status"]
-         )
-         result_df = session.createDataFrame([(view_was_dropped,)], schema=["value"])
          columns = ["value"]
+         if spark_view_name:
+             result = unregister_temp_view(
+                 spark_to_sf_single_id_with_unquoting(spark_view_name)
+             )
+         else:
+             result = False
+         result_df = session.createDataFrame([result], schema=columns)
          return DataFrameContainer.create_with_column_mapping(
              dataframe=result_df,
              spark_column_names=columns,
@@ -7,8 +7,27 @@ from urllib.parse import urlparse
  CLOUD_PREFIX_TO_CLOUD = {
      "abfss": "azure",
      "wasbs": "azure",
+     "gcs": "gcp",
+     "gs": "gcp",
  }

+ SUPPORTED_COMPRESSION_PER_FORMAT = {
+     "csv": {"AUTO", "GZIP", "BZ2", "BROTLI", "ZSTD", "DEFLATE", "RAW_DEFLATE", "NONE"},
+     "json": {"AUTO", "GZIP", "BZ2", "BROTLI", "ZSTD", "DEFLATE", "RAW_DEFLATE", "NONE"},
+     "parquet": {"AUTO", "LZO", "SNAPPY", "NONE"},
+     "text": {"NONE"},
+ }
+
+
+ def supported_compressions_for_format(format: str) -> set[str]:
+     return SUPPORTED_COMPRESSION_PER_FORMAT.get(format, set())
+
+
+ def is_supported_compression(format: str, compression: str | None) -> bool:
+     if compression is None:
+         return True
+     return compression in supported_compressions_for_format(format)
+

  def get_cloud_from_url(
      url: str,
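
The new compression table and helpers make the per-format validation explicit. A hedged usage sketch, assuming this hunk lands in snowflake/snowpark_connect/relation/io_utils.py as the file list suggests:

from snowflake.snowpark_connect.relation.io_utils import (
    is_supported_compression,
    supported_compressions_for_format,
)

print(supported_compressions_for_format("parquet"))  # AUTO, LZO, SNAPPY, NONE (set order may vary)
print(is_supported_compression("csv", "GZIP"))        # True
print(is_supported_compression("text", "GZIP"))       # False: text only allows NONE
print(is_supported_compression("json", None))         # True: no compression requested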
@@ -66,7 +85,8 @@ def is_cloud_path(path: str) -> bool:
          or path.startswith("azure://")
          or path.startswith("abfss://")
          or path.startswith("wasbs://")  # Azure
-         or path.startswith("gcs://")  # GCP
+         or path.startswith("gcs://")
+         or path.startswith("gs://")  # GCP
      )


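Together with the CLOUD_PREFIX_TO_CLOUD entries added above, this makes gs:// URLs recognised as GCP cloud paths in addition to gcs://. A hedged sketch under the same module-path assumption:

from snowflake.snowpark_connect.relation.io_utils import CLOUD_PREFIX_TO_CLOUD, is_cloud_path

print(is_cloud_path("gs://my-bucket/data/part-0.parquet"))   # True as of 0.29.0
print(is_cloud_path("gcs://my-bucket/data/part-0.parquet"))  # True, unchanged
print(CLOUD_PREFIX_TO_CLOUD["gs"])                           # "gcp"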
@@ -345,7 +345,7 @@ def map_aggregate(
          return new_names[0], snowpark_column

      raw_groupings: list[tuple[str, TypedColumn]] = []
-     raw_aggregations: list[tuple[str, TypedColumn]] = []
+     raw_aggregations: list[tuple[str, TypedColumn, list[str]]] = []

      if not is_group_by_all:
          raw_groupings = [_map_column(exp) for exp in aggregate.grouping_expressions]
@@ -375,10 +375,21 @@ def map_aggregate(
      # Note: We don't clear the map here to preserve any parent context aliases
      from snowflake.snowpark_connect.utils.context import register_lca_alias

+     # If it's an unresolved attribute when its in aggregate.aggregate_expressions, we know it came from the parent map straight away
+     # in this case, we should see if the parent map has a qualifier for it and propagate that here, in case the order by references it in
+     # a qualified way later.
      agg_count = get_sql_aggregate_function_count()
      for exp in aggregate.aggregate_expressions:
          col = _map_column(exp)
-         raw_aggregations.append(col)
+         if exp.WhichOneof("expr_type") == "unresolved_attribute":
+             spark_name = col[0]
+             qualifiers = input_container.column_map.get_qualifier_for_spark_column(
+                 spark_name
+             )
+         else:
+             qualifiers = []
+
+         raw_aggregations.append((col[0], col[1], qualifiers))

          # If this is an alias, register it in the LCA map for subsequent expressions
          if (
@@ -409,18 +420,20 @@ def map_aggregate(
      spark_columns: list[str] = []
      snowpark_columns: list[str] = []
      snowpark_column_types: list[snowpark_types.DataType] = []
+     all_qualifiers: list[list[str]] = []

      # Use grouping columns directly without aliases
      groupings = [col.col for _, col in raw_groupings]

      # Create aliases only for aggregation columns
      aggregations = []
-     for i, (spark_name, snowpark_column) in enumerate(raw_aggregations):
+     for i, (spark_name, snowpark_column, qualifiers) in enumerate(raw_aggregations):
          alias = make_column_names_snowpark_compatible([spark_name], plan_id, i)[0]

          spark_columns.append(spark_name)
          snowpark_columns.append(alias)
          snowpark_column_types.append(snowpark_column.typ)
+         all_qualifiers.append(qualifiers)

          aggregations.append(snowpark_column.col.alias(alias))

@@ -483,6 +496,7 @@ def map_aggregate(
          spark_column_names=spark_columns,
          snowpark_column_names=snowpark_columns,
          snowpark_column_types=snowpark_column_types,
+         column_qualifiers=all_qualifiers,
      ).column_map

      # Create hybrid column map that can resolve both input and aggregate contexts
@@ -494,7 +508,9 @@ def map_aggregate(
          aggregate_expressions=list(aggregate.aggregate_expressions),
          grouping_expressions=list(aggregate.grouping_expressions),
          spark_columns=spark_columns,
-         raw_aggregations=raw_aggregations,
+         raw_aggregations=[
+             (spark_name, col) for spark_name, col, _ in raw_aggregations
+         ],
      )

      # Map the HAVING condition using hybrid resolution
@@ -515,4 +531,5 @@ def map_aggregate(
          snowpark_column_names=snowpark_columns,
          snowpark_column_types=snowpark_column_types,
          parent_column_name_map=input_df._column_map,
+         column_qualifiers=all_qualifiers,
      )
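
The map_aggregate changes carry a qualifier list through the aggregation so that a later clause can still reference a column by its qualified name. A hedged Spark SQL sketch of the pattern this targets (assumes a snowpark-connect-backed `spark` session; the table and column names are illustrative):

spark.createDataFrame([(1, "a"), (2, "a"), (3, "b")], "v int, k string") \
    .createOrReplaceTempView("t")

# The ORDER BY refers to the grouping column through its qualifier "t".
spark.sql("SELECT t.k, sum(t.v) AS total FROM t GROUP BY t.k ORDER BY t.k").show()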
@@ -12,7 +12,6 @@ from snowflake.snowpark_connect.constants import MAP_IN_ARROW_EVAL_TYPE
  from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
  from snowflake.snowpark_connect.relation.map_relation import map_relation
  from snowflake.snowpark_connect.type_mapping import proto_to_snowpark_type
- from snowflake.snowpark_connect.utils.context import map_partitions_depth
  from snowflake.snowpark_connect.utils.pandas_udtf_utils import (
      create_pandas_udtf,
      create_pandas_udtf_with_arrow,
@@ -53,18 +52,18 @@ def _call_udtf(
          ).cast("int"),
      )

-     udtf_columns = input_df.columns + [snowpark_fn.col("_DUMMY_PARTITION_KEY")]
+     udtf_columns = [f"snowflake_jtf_{column}" for column in input_df.columns] + [
+         "_DUMMY_PARTITION_KEY"
+     ]

      tfc = snowpark_fn.call_table_function(udtf_name, *udtf_columns).over(
          partition_by=[snowpark_fn.col("_DUMMY_PARTITION_KEY")]
      )

-     # Use map_partitions_depth only when mapping non nested map_partitions
-     # When mapping chained functions additional column casting is necessary
-     if map_partitions_depth() == 1:
-         result_df_with_dummy = input_df_with_dummy.join_table_function(tfc)
-     else:
-         result_df_with_dummy = input_df_with_dummy.select(tfc)
+     # Overwrite the input_df columns to prevent name conflicts with UDTF output columns
+     result_df_with_dummy = input_df_with_dummy.to_df(udtf_columns).join_table_function(
+         tfc
+     )


      output_cols = [field.name for field in return_type.fields]
@@ -16,7 +16,6 @@ from snowflake.snowpark_connect.utils.context import (
      get_plan_id_map,
      get_session_id,
      not_resolving_fun_args,
-     push_map_partitions,
      push_operation_scope,
      set_is_aggregate_function,
      set_plan_id_map,
@@ -185,8 +184,7 @@ def map_relation(
              )
              return cached_df
          case "map_partitions":
-             with push_map_partitions():
-                 result = map_map_partitions.map_map_partitions(rel)
+             result = map_map_partitions.map_map_partitions(rel)
          case "offset":
              result = map_row_ops.map_offset(rel)
          case "project":
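
The map_partitions changes drop the nesting-depth tracking (push_map_partitions / map_partitions_depth) and instead rename the UDTF input columns with a snowflake_jtf_ prefix to avoid clashes with the UDTF output columns. This relation appears to back DataFrame.mapInPandas / mapInArrow; a hedged sketch of such a workload (assumes a snowpark-connect-backed `spark` session):

def add_one(batches):
    # Each element of `batches` is a pandas DataFrame holding one batch of rows.
    for pdf in batches:
        yield pdf.assign(id=pdf["id"] + 1)

spark.range(4).mapInPandas(add_one, schema="id long").show()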