snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +717 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +309 -26
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/error_utils.py +28 -0
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +224 -15
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +86 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
- snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
- snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +171 -48
- snowflake/snowpark_connect/server.py +528 -473
- snowflake/snowpark_connect/server_common/__init__.py +503 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/type_support.py +130 -0
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +195 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +192 -40
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,7 @@ import ast
import numpy as np
import pandas
import pyspark.sql.connect.proto.relations_pb2 as relation_proto
- from pyspark.errors.exceptions.base import AnalysisException
+ from pyspark.errors.exceptions.base import AnalysisException, IllegalArgumentException

import snowflake.snowpark.functions as fn
import snowflake.snowpark.types as snowpark_types
@@ -15,6 +15,9 @@ from snowflake import snowpark
from snowflake.snowpark.exceptions import SnowparkSQLException
from snowflake.snowpark_connect.config import get_boolean_session_config_param
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
+ from snowflake.snowpark_connect.includes.python.pyspark.sql.types import StructField
from snowflake.snowpark_connect.relation.map_relation import map_relation
from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
@@ -52,7 +55,7 @@ def map_cov(
"""
Find the covariance of two columns in the input DataFrame.

- Returns a pandas DataFrame because the
+ Returns a pandas DataFrame because the covariance of two columns produces
a scalar value.
"""
input_container = map_relation(rel.cov.input)
@@ -64,6 +67,16 @@ def map_cov(
col2 = input_container.column_map.get_snowpark_column_name_from_spark_column_name(
rel.cov.col2
)
+
+ col1_type = next(
+ field.datatype for field in input_df.schema.fields if field.name == col1
+ )
+ col2_type = next(
+ field.datatype for field in input_df.schema.fields if field.name == col2
+ )
+ _check_numeric_column(col_name=rel.cov.col1, col_type=col1_type)
+ _check_numeric_column(col_name=rel.cov.col2, col_type=col2_type)
+
result: float = input_df.cov(col1, col2)
return pandas.DataFrame({"cov": [result]})
@@ -81,7 +94,7 @@ def map_approx_quantile(
input_df = input_container.dataframe

snowflake_compatible = get_boolean_session_config_param(
- "enable_snowflake_extension_behavior"
+ "snowpark.connect.enable_snowflake_extension_behavior"
)

if not snowflake_compatible:
@@ -99,9 +112,11 @@ def map_approx_quantile(
else ""
)

-
+ exception = AnalysisException(
f"[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `{col_name}` cannot be resolved.{suggestion_text}"
)
+ attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+ raise exception

cols = input_container.column_map.get_snowpark_column_names_from_spark_column_names(
list(rel.approx_quantile.cols)
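The hunk above shows the error-handling refactor that recurs throughout this release: instead of raising an exception inline, mapper code now constructs it, tags it with attach_custom_error_code and an ErrorCodes member, and then raises it. The real helper and enum live in snowflake/snowpark_connect/error/error_utils.py and error/error_codes.py, whose bodies are not part of this excerpt; the sketch below uses illustrative stand-ins for both, only to show the call pattern.

from enum import Enum, auto


class ErrorCodes(Enum):
    # Illustrative subset of the codes referenced in the hunks; the real
    # enum in error/error_codes.py defines its own names and values.
    COLUMN_NOT_FOUND = auto()
    TYPE_MISMATCH = auto()
    INVALID_INPUT = auto()
    UNSUPPORTED_TYPE = auto()
    UNSUPPORTED_OPERATION = auto()
    INSUFFICIENT_INPUT = auto()
    INTERNAL_ERROR = auto()


def attach_custom_error_code(exception: Exception, code: ErrorCodes) -> None:
    # Stand-in: tag the exception so a server/telemetry layer can report the code.
    # The attribute name here is hypothetical.
    setattr(exception, "_custom_error_code", code)


def resolve_column(name: str, known_columns: list[str]) -> str:
    # Build, tag, then raise, mirroring the pattern in the diff above.
    if name not in known_columns:
        exception = ValueError(f"A column with name `{name}` cannot be resolved.")
        attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
        raise exception
    return name

Constructing the exception in a local variable first simply gives the code a handle to attach the error code to before raising.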
@@ -309,9 +324,28 @@ def map_freq_items(rel: relation_proto.Relation) -> DataFrameContainer:
cols = input_container.column_map.get_snowpark_column_names_from_spark_column_names(
list(rel.freq_items.cols)
)
+
+ # handle empty DataFrame case
+ row_count = input_df.count()
+
+ for sp_col_name in cols:
+ spark_col_names.append(
+ f"{input_container.column_map.get_spark_column_name_from_snowpark_column_name(sp_col_name)}_freqItems"
+ )
+
+ if row_count == 0:
+ # If DataFrame is empty, return empty arrays for each column
+ empty_values = [[] for _ in cols]
+ approx_top_k_df = session.createDataFrame([empty_values], spark_col_names)
+ return DataFrameContainer.create_with_column_mapping(
+ dataframe=approx_top_k_df,
+ spark_column_names=spark_col_names,
+ snowpark_column_names=spark_col_names,
+ )
+
approx_top_k_df = input_df.select(
*[
- fn.function("approx_top_k")(fn.col(col), round(
+ fn.function("approx_top_k")(fn.col(col), round(row_count / support))
for col in cols
]
)
@@ -330,10 +364,6 @@ def map_freq_items(rel: relation_proto.Relation) -> DataFrameContainer:
for value in approx_top_k_values
]

- for sp_col_name in cols:
- spark_col_names.append(
- f"{input_container.column_map.get_spark_column_name_from_snowpark_column_name(sp_col_name)}_freqItems"
- )
approx_top_k_df = session.createDataFrame([filtered_values], spark_col_names)

return DataFrameContainer.create_with_column_mapping(
@@ -371,3 +401,12 @@ def _build_column_map_helper_container(
spark_column_names=spark_col_names,
snowpark_column_names=desc_df.columns,
)
+
+
+ def _check_numeric_column(col_name: str, col_type: StructField) -> None:
+ """Checks if a column type is a Snowpark NumericType and raises an exception if not."""
+ if not isinstance(col_type, snowpark_types._NumericType):
+ raise IllegalArgumentException(
+ f"Column '{col_name}' must be of numeric type for covariance calculation, "
+ f"but got {col_type}"
+ )
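With the _check_numeric_column helper above, covariance requests routed through map_cov (DataFrame.cov / df.stat.cov) should now fail fast with an IllegalArgumentException when either column is non-numeric, rather than surfacing a backend SQL error. A minimal standalone check of the same isinstance logic against Snowpark types, assuming pyspark and snowflake-snowpark-python are installed (the DataFrame plumbing is omitted):

from pyspark.errors.exceptions.base import IllegalArgumentException
from snowflake.snowpark.types import DoubleType, StringType, _NumericType


def check_numeric_column(col_name, col_type):
    # Same shape as _check_numeric_column above: reject non-numeric Snowpark types.
    if not isinstance(col_type, _NumericType):
        raise IllegalArgumentException(
            f"Column '{col_name}' must be of numeric type for covariance calculation, "
            f"but got {col_type}"
        )


check_numeric_column("salary", DoubleType())  # numeric: passes silently

try:
    check_numeric_column("name", StringType())  # non-numeric: raises
except IllegalArgumentException as exc:
    print(exc)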
@@ -4,8 +4,12 @@

import pyspark.sql.connect.proto.relations_pb2 as relation_proto

+ from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
from snowflake.snowpark_connect.relation.map_relation import map_relation
+ from snowflake.snowpark_connect.relation.read.metadata_utils import (
+ without_internal_columns,
+ )


def map_alias(
@@ -17,8 +21,12 @@ def map_alias(
alias: str = rel.subquery_alias.alias
# we set reuse_parsed_plan=False because we need new expr_id for the attributes (output columns) in aliased snowpark dataframe
# reuse_parsed_plan will lead to ambiguous column name for operations like joining two dataframes that are aliased from the same dataframe
- input_container =
-
+ input_container = without_internal_columns(
+ map_relation(rel.subquery_alias.input, reuse_parsed_plan=False)
+ )
+ qualifiers = [
+ {ColumnQualifier((alias,))} for _ in input_container.column_map.columns
+ ]

return DataFrameContainer.create_with_column_mapping(
dataframe=input_container.dataframe,
@@ -28,4 +36,5 @@ def map_alias(
column_qualifiers=qualifiers,
parent_column_name_map=input_container.column_map.get_parent_column_name_map(),
alias=alias,
+ equivalent_snowpark_names=input_container.column_map.get_equivalent_snowpark_names(),
)
@@ -22,6 +22,8 @@ from snowflake.snowpark_connect.config import (
global_config,
)
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
from snowflake.snowpark_connect.expression.map_expression import (
map_single_column_expression,
)
@@ -163,17 +165,21 @@ def process_return_type(
else:
parsed_return = return_type
except ValueError as e:
-
+ exception = PythonException(
f"[UDTF_ARROW_TYPE_CAST_ERROR] Error parsing UDTF return type DDL: {e}"
)
+ attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+ raise exception
original_output_schema = proto_to_snowpark_type(parsed_return)
output_schema = proto_to_snowpark_type(parsed_return)
# Snowflake UDTF does not support MapType, so we convert it to VariantType.
output_schema = convert_maptype_to_variant(output_schema)
if not isinstance(output_schema, StructType):
-
+ exception = PySparkTypeError(
f"Invalid Python user-defined table function return type. Expect a struct type, but got {parsed_return}"
)
+ attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+ raise exception

expected_types = None
if is_arrow_enabled_in_udtf() or is_spark_compatible_udtf_mode_enabled():
@@ -276,12 +282,16 @@ def map_common_inline_user_defined_table_function(
if require_creating_udtf_in_sproc(udtf_proto):
snowpark_udtf_or_error = create_udtf_in_sproc(**kwargs)
if isinstance(snowpark_udtf_or_error, str):
-
+ exception = PythonException(snowpark_udtf_or_error)
+ attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+ raise exception
snowpark_udtf = snowpark_udtf_or_error
else:
udtf_or_error = create_udtf(**kwargs)
if isinstance(udtf_or_error, str):
-
+ exception = PythonException(udtf_or_error)
+ attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+ raise exception
udtf = udtf_or_error
snowpark_udtf = SnowparkUDTF(
name=udtf.name,
@@ -38,6 +38,8 @@ from snowflake.snowpark.types import (
TimeType,
_NumericType,
)
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
from snowflake.snowpark_connect.relation.read.utils import (
DATA_SOURCE_SQL_COMMENT,
Connection,
@@ -147,9 +149,11 @@ class JdbcDataFrameReader(DataFrameReader):
or upper_bound is not None
or num_partitions is not None
):
-
+ exception = ValueError(
"when column is not specified, lower_bound, upper_bound, num_partitions are expected to be None"
)
+ attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+ raise exception
if table is not None:
partitioned_queries = []
table_query = f"SELECT * FROM {table}"
@@ -160,24 +164,32 @@ class JdbcDataFrameReader(DataFrameReader):
elif query is not None:
partitioned_queries = [query]
else:
-
+ exception = ValueError("table or query is not specified")
+ attach_custom_error_code(exception, ErrorCodes.INSUFFICIENT_INPUT)
+ raise exception
else:
if lower_bound is None or upper_bound is None or num_partitions is None:
-
+ exception = ValueError(
"when column is specified, lower_bound, upper_bound, num_partitions must be specified"
)
+ attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+ raise exception

column_type = None
for field in struct_schema.fields:
if field.name.lower() == column.lower():
column_type = field.datatype
if column_type is None:
-
+ exception = ValueError("Column does not exist")
+ attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+ raise exception

if not isinstance(column_type, _NumericType) and not isinstance(
column_type, DateType
):
-
+ exception = ValueError(f"unsupported type {column_type}")
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_TYPE)
+ raise exception
spark_column_name = f'"{column}"'
partitioned_queries = self._generate_partition(
table,
|
|
|
240
252
|
)
|
|
241
253
|
query_thread_executor.shutdown(wait=False)
|
|
242
254
|
upload_thread_executor.shutdown(wait=False)
|
|
243
|
-
|
|
255
|
+
exception = future.result()
|
|
256
|
+
attach_custom_error_code(
|
|
257
|
+
exception, ErrorCodes.INTERNAL_ERROR
|
|
258
|
+
)
|
|
259
|
+
raise exception
|
|
244
260
|
else:
|
|
245
261
|
path = future.result()
|
|
246
262
|
if not path:
|
|
@@ -266,7 +282,11 @@ class JdbcDataFrameReader(DataFrameReader):
|
|
|
266
282
|
)
|
|
267
283
|
query_thread_executor.shutdown(wait=False)
|
|
268
284
|
upload_thread_executor.shutdown(wait=False)
|
|
269
|
-
|
|
285
|
+
exception = f.result()
|
|
286
|
+
attach_custom_error_code(
|
|
287
|
+
exception, ErrorCodes.INTERNAL_ERROR
|
|
288
|
+
)
|
|
289
|
+
raise exception
|
|
270
290
|
finally:
|
|
271
291
|
close_connection(conn)
|
|
272
292
|
|
|
@@ -281,9 +301,14 @@ class JdbcDataFrameReader(DataFrameReader):
if table is not None:
sql = f"SELECT * FROM {table} WHERE 1=0"
elif query is not None:
-
+ # We need "jdbc_query" subquery alias as other datasources such as SQL Server and PostgreSQL
+ # do not work without an alias.
+ # Snowflake works with or without subquery alias.
+ sql = f"SELECT jdbc_query.* FROM ({query}) as jdbc_query WHERE 1=0"
else:
-
+ exception = ValueError("table or query is not specified")
+ attach_custom_error_code(exception, ErrorCodes.INSUFFICIENT_INPUT)
+ raise exception

cursor = conn.cursor()
cursor.execute(sql)
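The schema-probe change above wraps a user-supplied query in a jdbc_query subquery alias because, per the new comment, SQL Server and PostgreSQL reject derived tables that have no alias, while Snowflake accepts either form. A trivial sketch of the probe statement being built (the query text is hypothetical):

# Hypothetical user-provided query passed via the JDBC "query" option.
query = "SELECT id, amount FROM orders WHERE amount > 100"

# WHERE 1=0 returns no rows, so only the result-set metadata (the schema) comes back.
sql = f"SELECT jdbc_query.* FROM ({query}) as jdbc_query WHERE 1=0"
print(sql)
# SELECT jdbc_query.* FROM (SELECT id, amount FROM orders WHERE amount > 100) as jdbc_query WHERE 1=0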
@@ -301,7 +326,11 @@ class JdbcDataFrameReader(DataFrameReader):
dt = parser.parse(value)
return int(dt.replace(tzinfo=pytz.UTC).timestamp())
else:
-
+ exception = TypeError(
+ f"unsupported column type for partition: {column_type}"
+ )
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_TYPE)
+ raise exception

# this function is only used in data source API for SQL server
def _to_external_value(self, value: Union[int, str, float], column_type: DataType):
@@ -311,7 +340,11 @@ class JdbcDataFrameReader(DataFrameReader):
# TODO: SNOW-1909315: support timezone
return datetime.datetime.fromtimestamp(value, tz=pytz.UTC)
else:
-
+ exception = TypeError(
+ f"unsupported column type for partition: {column_type}"
+ )
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_TYPE)
+ raise exception

def _to_snowpark_type(self, schema: Tuple[tuple]) -> StructType:
fields = []
@@ -339,7 +372,9 @@ class JdbcDataFrameReader(DataFrameReader):
case jaydebeapi.BINARY:
field = StructField(name, BinaryType(), is_nullable)
case _:
-
+ exception = ValueError(f"unsupported type: {dbapi_type}")
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_TYPE)
+ raise exception

fields.append(field)
return StructType(fields)
@@ -359,7 +394,9 @@ class JdbcDataFrameReader(DataFrameReader):
processed_lower_bound = self._to_internal_value(lower_bound, column_type)
processed_upper_bound = self._to_internal_value(upper_bound, column_type)
if processed_lower_bound > processed_upper_bound:
-
+ exception = ValueError("lower_bound cannot be greater than upper_bound")
+ attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+ raise exception

if processed_lower_bound == processed_upper_bound or num_partitions <= 1:
return [select_query]
@@ -665,4 +702,6 @@ def get_jdbc_dialect(url: str) -> JdbcDialect:
for jdbc_dialect in jdbc_dialects:
if jdbc_dialect.can_handle(url):
return jdbc_dialect
-
+ exception = ValueError(f"Unsupported JDBC datasource: {url}")
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+ raise exception
@@ -10,13 +10,18 @@ import re
from pathlib import Path

import pyspark.sql.connect.proto.relations_pb2 as relation_proto
+ from pyspark.errors.exceptions.base import AnalysisException

from snowflake import snowpark
+ from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
from snowflake.snowpark.types import StructType
from snowflake.snowpark_connect.config import global_config
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
from snowflake.snowpark_connect.relation.io_utils import (
convert_file_prefix_path,
+ get_compression_for_source_and_options,
is_cloud_path,
)
from snowflake.snowpark_connect.relation.read.map_read_table import map_read_table
@@ -26,9 +31,12 @@ from snowflake.snowpark_connect.relation.read.reader_config import (
ParquetReaderConfig,
)
from snowflake.snowpark_connect.relation.stage_locator import get_paths_from_stage
- from snowflake.snowpark_connect.type_mapping import
+ from snowflake.snowpark_connect.type_mapping import (
+ _parse_ddl_with_spark_scala,
+ map_json_schema_to_snowpark,
+ )
from snowflake.snowpark_connect.utils.cache import df_cache_map_put_if_absent
- from snowflake.snowpark_connect.utils.context import
+ from snowflake.snowpark_connect.utils.context import get_spark_session_id
from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
from snowflake.snowpark_connect.utils.telemetry import (
SnowparkConnectNotImplementedError,
@@ -46,6 +54,7 @@ def map_read(

Currently, the supported read formats are `csv`, `json` and `parquet`.
"""
+
match rel.read.WhichOneof("read_type"):
case "named_table":
return map_read_table_or_file(rel)
@@ -74,28 +83,26 @@ def map_read(
try:
parsed_schema = json.loads(rel.read.data_source.schema)
except json.JSONDecodeError:
- #
-
-
-
-
-
- ), f"Schema's definition {name_and_type} is invalid"
- parsed_schema["fields"].append(
- {
- "name": name_and_type[0],
- "nullable": True,
- "type": name_and_type[1],
- }
- )
+ # Scala clients send DDL-formatted strings like
+ # "billing_account_id STRING, cost STRING" or "struct<id:bigint>"
+ spark_datatype = _parse_ddl_with_spark_scala(
+ rel.read.data_source.schema
+ )
+ parsed_schema = json.loads(spark_datatype.json())
schema = map_json_schema_to_snowpark(parsed_schema)
options = dict(rel.read.data_source.options)
telemetry.report_io_read(read_format)
session: snowpark.Session = get_or_create_snowpark_session()
if len(rel.read.data_source.paths) > 0:
+ if options.get("path"):
+ raise AnalysisException(
+ "There is a 'path' or 'paths' option set and load() is called with path parameters. "
+ "Either remove the path option if it's the same as the path parameter, "
+ "or add it to the load() parameter if you do want to read multiple paths."
+ )
# Normalize paths to ensure consistent behavior
clean_source_paths = [
- path
+ path if is_cloud_path(path) else str(Path(path))
for path in rel.read.data_source.paths
]
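In the hunk above, a schema string that is not valid JSON is now treated as a Spark DDL string (what Scala clients send) and parsed via _parse_ddl_with_spark_scala; the resulting Spark DataType is serialized back to JSON so the existing map_json_schema_to_snowpark path can consume it. For reference, a DDL string like "billing_account_id STRING, cost STRING" round-trips to roughly the standard Spark StructType.json() shape (the dict below is an approximation, not output captured from this code):

import json

# Approximate .json() output for the StructType parsed from
# "billing_account_id STRING, cost STRING".
ddl_as_json = json.dumps(
    {
        "type": "struct",
        "fields": [
            {"name": "billing_account_id", "type": "string", "nullable": True, "metadata": {}},
            {"name": "cost", "type": "string", "nullable": True, "metadata": {}},
        ],
    }
)
parsed_schema = json.loads(ddl_as_json)  # roughly what map_json_schema_to_snowpark receives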
@@ -121,23 +128,67 @@ def map_read(
options = {k.lower(): v for k, v in options.items()}
QUERY_OPTION = "query"
DBTABLE_OPTION = "dbtable"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+ def _identifiers_match(
+ desired: str, current: str | None
+ ) -> bool:
+ if current is None:
+ return False
+
+ desired_unquoted = unquote_if_quoted(desired)
+ current_unquoted = unquote_if_quoted(current)
+ desired_was_quoted = desired != desired_unquoted
+
+ # If both are quoted, exact match required. session.get* always returns quoted identifier
+ # name.
+ if desired_was_quoted:
+ return desired == current
+
+ return desired_unquoted.upper() == current_unquoted
+
+ if "sfrole" in options:
+ desired_role = options["sfrole"]
+ current_role = session.get_current_role()
+ if not _identifiers_match(desired_role, current_role):
+ logger.warning(
+ f"Changing Role from {current_role} to {desired_role} via "
+ "options. This will change the role for the entire session."
+ )
+ session.use_role(desired_role)
+
+ if "sfwarehouse" in options:
+ desired_warehouse = options["sfwarehouse"]
+ current_warehouse = session.get_current_warehouse()
+ if not _identifiers_match(
+ desired_warehouse, current_warehouse
+ ):
+ logger.warning(
+ f"Changing Warehouse from {current_warehouse} to {desired_warehouse} via "
+ "options. This will change the warehouse for the entire session."
+ )
+ session.use_warehouse(desired_warehouse)
+
+ if "sfdatabase" in options:
+ desired_database = options["sfdatabase"]
+ current_database = session.get_current_database()
+ if not _identifiers_match(
+ desired_database, current_database
+ ):
+ logger.warning(
+ f"Changing Database from {current_database} to {desired_database} via "
+ "options. This will change the database for the entire session."
+ )
+ session.use_database(desired_database)
+
+ if "sfschema" in options:
+ desired_schema = options["sfschema"]
+ current_schema = session.get_current_schema()
+ if not _identifiers_match(desired_schema, current_schema):
+ logger.warning(
+ f"Changing Schema from {current_schema} to {desired_schema} via "
+ "options. This will change the schema for the entire session."
+ )
+ session.use_schema(desired_schema)
if QUERY_OPTION in options.keys():
from .map_read_table import get_table_from_query
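The new option handling above lets the sfrole / sfwarehouse / sfdatabase / sfschema reader options switch the session context, warning whenever the requested value actually differs from the current one. The comparison in _identifiers_match is the subtle part: a double-quoted identifier must match exactly, while an unquoted one is upper-cased and compared against the quoted name Snowpark returns. A self-contained sketch of that rule, with a simplified stand-in for Snowpark's unquote_if_quoted (the real helper lives in snowflake.snowpark._internal.analyzer.analyzer_utils):

def unquote_if_quoted(ident: str) -> str:
    # Simplified stand-in: strip one pair of surrounding double quotes if present.
    if len(ident) >= 2 and ident[0] == '"' and ident[-1] == '"':
        return ident[1:-1]
    return ident


def identifiers_match(desired: str, current: str | None) -> bool:
    if current is None:
        return False
    desired_unquoted = unquote_if_quoted(desired)
    current_unquoted = unquote_if_quoted(current)
    desired_was_quoted = desired != desired_unquoted
    # Quoted identifiers are case-sensitive: require an exact match.
    if desired_was_quoted:
        return desired == current
    # Unquoted identifiers fold to upper case in Snowflake.
    return desired_unquoted.upper() == current_unquoted


print(identifiers_match("analyst", '"ANALYST"'))    # True: unquoted name folds to upper case
print(identifiers_match('"Analyst"', '"ANALYST"'))  # False: quoted, so exact match required
print(identifiers_match('"ANALYST"', '"ANALYST"'))  # True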
@@ -151,19 +202,27 @@ def map_read(
options[DBTABLE_OPTION], session, rel.common.plan_id
)
case other:
-
+ exception = SnowparkConnectNotImplementedError(
f"UNSUPPORTED FORMAT {other} WITH NO PATH"
)
+ attach_custom_error_code(
+ exception, ErrorCodes.UNSUPPORTED_OPERATION
+ )
+ raise exception
case other:
# TODO: Empty data source
-
+ exception = SnowparkConnectNotImplementedError(
+ f"Unsupported read type: {other}"
+ )
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+ raise exception

return df_cache_map_put_if_absent(
- (
+ (get_spark_session_id(), rel.common.plan_id), lambda: result
)


- def map_read_table_or_file(rel):
+ def map_read_table_or_file(rel) -> DataFrameContainer:
read_named_table_from_file = (
rel.read.named_table.unparsed_identifier
and _get_supported_read_file_format(rel.read.named_table.unparsed_identifier)
|
-
def map_read_table_or_file(rel):
|
|
225
|
+
def map_read_table_or_file(rel) -> DataFrameContainer:
|
|
167
226
|
read_named_table_from_file = (
|
|
168
227
|
rel.read.named_table.unparsed_identifier
|
|
169
228
|
and _get_supported_read_file_format(rel.read.named_table.unparsed_identifier)
|
|
@@ -205,6 +264,23 @@ def _get_supported_read_file_format(unparsed_identifier: str) -> str | None:
|
|
|
205
264
|
return None
|
|
206
265
|
|
|
207
266
|
|
|
267
|
+
# TODO: [SNOW-2465948] Remove this once Snowpark fixes the issue with stage paths.
|
|
268
|
+
class StagePathStr(str):
|
|
269
|
+
def partition(self, __sep):
|
|
270
|
+
if str(self)[0] == "'":
|
|
271
|
+
return str(self)[1:].partition(__sep)
|
|
272
|
+
return str(self).partition(__sep)
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _quote_stage_path(stage_path: str) -> str:
|
|
276
|
+
"""
|
|
277
|
+
Quote stage paths to escape any special characters.
|
|
278
|
+
"""
|
|
279
|
+
if stage_path.startswith("@"):
|
|
280
|
+
return StagePathStr(f"'{stage_path}'")
|
|
281
|
+
return stage_path
|
|
282
|
+
|
|
283
|
+
|
|
208
284
|
def _read_file(
|
|
209
285
|
clean_source_paths: list[str],
|
|
210
286
|
options: dict,
|
|
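The StagePathStr / _quote_stage_path workaround above single-quotes stage paths so special characters (for example spaces) survive in generated SQL, and it overrides str.partition, presumably so code that later splits the path on a separator does not trip over the leading quote. Both pieces are plain Python, so the behaviour can be checked in isolation; the snippet below copies them from the hunk and adds example calls (the stage name is made up):

# TODO: [SNOW-2465948] Remove this once Snowpark fixes the issue with stage paths.
class StagePathStr(str):
    def partition(self, __sep):
        # Skip the leading single quote added by _quote_stage_path when splitting.
        if str(self)[0] == "'":
            return str(self)[1:].partition(__sep)
        return str(self).partition(__sep)


def _quote_stage_path(stage_path: str) -> str:
    """Quote stage paths to escape any special characters."""
    if stage_path.startswith("@"):
        return StagePathStr(f"'{stage_path}'")
    return stage_path


p = _quote_stage_path("@my_stage/dir with spaces/data.csv")
print(p)                 # '@my_stage/dir with spaces/data.csv'  (quotes added)
print(p.partition("/"))  # ('@my_stage', '/', "dir with spaces/data.csv'")  (leading quote skipped)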
@@ -212,12 +288,21 @@ def _read_file(
rel: relation_proto.Relation,
schema: StructType | None,
session: snowpark.Session,
- ) ->
+ ) -> DataFrameContainer:
paths = get_paths_from_stage(
clean_source_paths,
session,
)
upload_files_if_needed(paths, clean_source_paths, session, read_format)
+ paths = [_quote_stage_path(path) for path in paths]
+
+ if read_format in ("csv", "text", "json", "parquet"):
+ compression = get_compression_for_source_and_options(
+ read_format, options, from_read=True
+ )
+ if compression is not None:
+ options["compression"] = compression
+
match read_format:
case "csv":
from snowflake.snowpark_connect.relation.read.map_read_csv import (
@@ -230,7 +315,11 @@ def _read_file(
map_read_json,
)

-
+ # JSON already materializes the table internally
+ return map_read_json(
+ rel, schema, session, paths, JsonReaderConfig(options)
+ ).without_materialization()
+
case "parquet":
from snowflake.snowpark_connect.relation.read.map_read_parquet import (
map_read_parquet,
@@ -246,9 +335,11 @@ def _read_file(

return map_read_text(rel, schema, session, paths)
case _:
-
+ exception = SnowparkConnectNotImplementedError(
f"Unsupported format: {read_format}"
)
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+ raise exception


def _skip_upload(path: str, read_format: str):
@@ -285,8 +376,8 @@ def upload_files_if_needed(

def _upload_dir(target: str, source: str) -> None:
# overwrite=True will not remove all stale files in the target prefix
-
- remove_command = f"REMOVE {target}/"
+ # Quote the target path to allow special characters.
+ remove_command = f"REMOVE '{target}/'"
assert (
"//" not in remove_command
), f"Remove command {remove_command} contains double slash"