snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +680 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +237 -23
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +123 -5
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +85 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
- snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
- snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +110 -48
- snowflake/snowpark_connect/server.py +546 -456
- snowflake/snowpark_connect/server_common/__init__.py +500 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +187 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +163 -22
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/read/map_read_csv.py (+255 -45)

@@ -3,19 +3,30 @@
 #
 
 import copy
+from typing import Any
 
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
+from pyspark.errors.exceptions.base import AnalysisException
 
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
 from snowflake.snowpark.dataframe_reader import DataFrameReader
 from snowflake.snowpark.types import StringType, StructField, StructType
+from snowflake.snowpark_connect.config import global_config, str_to_bool
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.map_read import CsvReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+    get_non_metadata_fields,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
+from snowflake.snowpark_connect.utils.io_utils import cached_file_format
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
@@ -37,33 +48,68 @@ def map_read_csv(
 
     if rel.read.is_streaming is True:
         # TODO: Structured streaming implementation.
-
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for CSV files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
     else:
-
+        converted_snowpark_options = options.convert_to_snowpark_args()
+        parse_header = converted_snowpark_options.get("PARSE_HEADER", False)
+        file_format_options = _parse_csv_snowpark_options(converted_snowpark_options)
+        file_format = cached_file_format(session, "csv", file_format_options)
+
+        snowpark_reader_options = dict()
+        snowpark_reader_options["FORMAT_NAME"] = file_format
+        snowpark_reader_options["ENFORCE_EXISTING_FILE_FORMAT"] = True
+        snowpark_reader_options["INFER_SCHEMA"] = converted_snowpark_options.get(
+            "INFER_SCHEMA", False
+        )
+        snowpark_reader_options[
+            "INFER_SCHEMA_OPTIONS"
+        ] = converted_snowpark_options.get("INFER_SCHEMA_OPTIONS", {})
+
+        # Use Try_cast to avoid schema inference errors
+        if snowpark_reader_options.get("INFER_SCHEMA", False):
+            snowpark_reader_options["TRY_CAST"] = True
+
+        apply_metadata_exclusion_pattern(converted_snowpark_options)
+        snowpark_reader_options["PATTERN"] = converted_snowpark_options.get(
+            "PATTERN", None
+        )
+
         raw_options = rel.read.data_source.options
+
         if schema is None or (
-
-            and raw_options.get("enforceSchema", "True").lower() == "false"
+            parse_header
+            and str(raw_options.get("enforceSchema", "True")).lower() == "false"
         ):  # Schema has to equals to header's format
-            reader =
+            reader = add_filename_metadata_to_reader(
+                session.read.options(snowpark_reader_options), raw_options
+            )
         else:
-            reader =
+            reader = add_filename_metadata_to_reader(
+                session.read.options(snowpark_reader_options).schema(schema),
+                raw_options,
+            )
         df = read_data(
             reader,
             schema,
             session,
             paths[0],
-
+            file_format_options,
+            snowpark_reader_options,
             raw_options,
+            parse_header,
         )
         if len(paths) > 1:
             # TODO: figure out if this is what Spark does.
             for p in paths[1:]:
                 df = df.union_all(reader.csv(p))
 
-        if schema is None
+        if schema is None and not str_to_bool(
+            str(raw_options.get("inferSchema", "false"))
+        ):
             df = df.select(
                 [snowpark_fn.col(c).cast("STRING").alias(c) for c in df.schema.names]
             )
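The hunk above replaces the old ad-hoc reader setup with an explicit option-translation step: Spark-style options are converted via `options.convert_to_snowpark_args()`, filtered into a cached CSV file format, and reassembled as Snowpark reader options (`FORMAT_NAME`, `INFER_SCHEMA`, `TRY_CAST`, `PATTERN`). As a rough sketch of the client-side read that would exercise this branch (the connect URL and path are placeholders, and the exact Spark-to-Snowflake option mapping is inferred from the code above, not from package docs):

```python
# Hypothetical Spark Connect client call; "sc://localhost:15002" and the path are assumptions.
# Options such as "header", "inferSchema", and "enforceSchema" arrive as raw_options on the
# server side, where the branch above turns them into Snowflake CSV reader options.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

df = (
    spark.read.option("header", "true")
    .option("inferSchema", "false")
    .option("enforceSchema", "false")
    .csv("/tmp/events/")
)
df.show()
```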
@@ -81,62 +127,226 @@ def map_read_csv(
     )
 
 
+_csv_file_format_allowed_options = {
+    "COMPRESSION",
+    "RECORD_DELIMITER",
+    "FIELD_DELIMITER",
+    "MULTI_LINE",
+    "FILE_EXTENSION",
+    "PARSE_HEADER",
+    "SKIP_HEADER",
+    "SKIP_BLANK_LINES",
+    "DATE_FORMAT",
+    "TIME_FORMAT",
+    "TIMESTAMP_FORMAT",
+    "BINARY_FORMAT",
+    "ESCAPE",
+    "ESCAPE_UNENCLOSED_FIELD",
+    "TRIM_SPACE",
+    "FIELD_OPTIONALLY_ENCLOSED_BY",
+    "NULL_IF",
+    "ERROR_ON_COLUMN_COUNT_MISMATCH",
+    "REPLACE_INVALID_CHARACTERS",
+    "EMPTY_FIELD_AS_NULL",
+    "SKIP_BYTE_ORDER_MARK",
+    "ENCODING",
+}
+
+
+def _parse_csv_snowpark_options(snowpark_options: dict[str, Any]) -> dict[str, Any]:
+    file_format_options = dict()
+    for key, value in snowpark_options.items():
+        upper_key = key.upper()
+        if upper_key in _csv_file_format_allowed_options:
+            file_format_options[upper_key] = value
+
+    # This option has to be removed, because we cannot use at the same time predefined file format and parse_header option
+    # Such combination causes snowpark to raise SQL compilation error: Invalid file format "PARSE_HEADER" is only allowed for CSV INFER_SCHEMA and MATCH_BY_COLUMN_NAME
+    parse_header = file_format_options.get("PARSE_HEADER", False)
+    if parse_header:
+        file_format_options["SKIP_HEADER"] = 1
+        del file_format_options["PARSE_HEADER"]
+
+    return file_format_options
+
+
+def _deduplicate_column_names_pyspark_style(
+    column_names: list[str], case_sensitive: bool
+) -> list[str]:
+    """
+    Deduplicate column names following PySpark's behavior in CSVUtils.scala::makeSafeHeader by appending
+    global position index to all occurrences of duplicated names.
+
+    Examples with case_sensitive=False:
+        ['ab', 'AB'] -> ['ab0', 'AB1']
+        ['ab', 'ab'] -> ['ab0', 'ab1']
+        ['a', 'b', 'A', 'c', 'B'] -> ['a0', 'b1', 'A2', 'c', 'B4'] (positions: a=0,2; b=1,4; c=3)
+
+    Examples with case_sensitive=True:
+        ['ab', 'AB'] -> ['ab', 'AB'] (no duplicates, different case)
+        ['ab', 'ab'] -> ['ab0', 'ab1'] (exact duplicates at positions 0, 1)
+        ['a', 'b', 'A', 'c', 'B'] -> ['a', 'b', 'A', 'c', 'B'] (no duplicates)
+
+    Edge cases:
+        ['a0', 'a0'] -> ['a00', 'a01'] (appends position even if name already has digits)
+        ['a', '', 'b'] -> ['a', '_c1', 'b'] (empty names become _c<position>)
+    """
+    seen = set()
+    duplicates = set()
+
+    for name in column_names:
+        # filter out nulls and apply case transformation
+        if not name:
+            continue
+        key = name if case_sensitive else name.lower()
+        if key in seen:
+            duplicates.add(key)
+        else:
+            seen.add(key)
+
+    result = []
+    for index, value in enumerate(column_names):
+        # Empty/null, append _c<index>
+        if value is None or value == "":
+            result.append(f"_c{index}")
+        # Case-insensitive duplicate, append index
+        elif not case_sensitive and value.lower() in duplicates:
+            result.append(f"{value}{index}")
+        # Case-sensitive duplicate, append index
+        elif case_sensitive and value in duplicates:
+            result.append(f"{value}{index}")
+        else:
+            result.append(value)
+
+    return result
+
+
 def get_header_names(
     session: snowpark.Session,
     path: list[str],
-
+    file_format_options: dict,
+    snowpark_read_options: dict,
+    raw_options: dict,
+    parse_header: bool,
 ) -> list[str]:
-
-
-
-
-
-
-
-
-
+    no_header_file_format_options = copy.copy(file_format_options)
+    no_header_file_format_options["PARSE_HEADER"] = False
+    no_header_file_format_options.pop("SKIP_HEADER", None)
+
+    file_format = cached_file_format(session, "csv", no_header_file_format_options)
+    no_header_snowpark_read_options = copy.copy(snowpark_read_options)
+    no_header_snowpark_read_options["FORMAT_NAME"] = file_format
+    no_header_snowpark_read_options.pop("INFER_SCHEMA", None)
+
+    # If we don't set this, snowpark will try to infer the schema for all rows in the csv file.
+    # Since there's no easy way to just read the header from the csv, we use this approach where we force the df reader to infer the schema for 10 rows and
+    # and we are only interested in the first row to get the header names and discard the inferred schema.
+    no_header_snowpark_read_options["INFER_SCHEMA_OPTIONS"] = {
+        "MAX_RECORDS_PER_FILE": 1,
+    }
+
+    header_df = session.read.options(no_header_snowpark_read_options).csv(path).limit(1)
+    collected_data = header_df.collect()
+
+    if len(collected_data) == 0:
+        error_msg = f"Path does not exist or contains no data: {path}"
+        user_pattern = raw_options.get("pathGlobFilter", None)
+        if user_pattern:
+            error_msg += f" (with pathGlobFilter: {user_pattern})"
+
+        exception = AnalysisException(error_msg)
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
+
+    header_data = collected_data[0]
+    num_columns = len(header_df.schema.fields)
+
+    if not parse_header:
+        # parse_header=False, use default _c0, _c1, _c2... naming for columns
+        return [f'"_c{i}"' for i in range(num_columns)]
+
+    # parse_header=True: Read first row as column names and deduplicate
+    raw_column_names = [
+        header_data[i] if header_data[i] is not None else "" for i in range(num_columns)
     ]
 
+    case_sensitive = global_config.spark_sql_caseSensitive
+    deduplicated_names = _deduplicate_column_names_pyspark_style(
+        raw_column_names, case_sensitive
+    )
+
+    return [f'"{name}"' for name in deduplicated_names]
+
 
 def read_data(
     reader: DataFrameReader,
     schema: snowpark.types.StructType | None,
     session: snowpark.Session,
     path: list[str],
-
+    file_format_options: dict,
+    snowpark_read_options: dict,
     raw_options: dict,
+    parse_header: bool,
 ) -> snowpark.DataFrame:
-    df = reader.csv(path)
     filename = path.strip("/").split("/")[-1]
+
     if schema is not None:
-
-
-        if
+        df = reader.csv(path)
+        non_metadata_fields = get_non_metadata_fields(df.schema.fields)
+        if len(schema.fields) != len(non_metadata_fields):
+            exception = Exception(f"csv load from {filename} failed.")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_CAST)
+            raise exception
+        if str(raw_options.get("enforceSchema", "True")).lower() == "false":
             for i in range(len(schema.fields)):
                 if (
-                    schema.fields[i].name !=
-                    and f'"{schema.fields[i].name}"' !=
+                    schema.fields[i].name != non_metadata_fields[i].name
+                    and f'"{schema.fields[i].name}"' != non_metadata_fields[i].name
                 ):
-
+                    exception = Exception("CSV header does not conform to the schema")
+                    attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+                    raise exception
         return df
 
-    headers = get_header_names(
-
-
-
-
-
-
-
-
-
-
-
-    .
-
-
-
-
-
+    headers = get_header_names(
+        session,
+        path,
+        file_format_options,
+        snowpark_read_options,
+        raw_options,
+        parse_header,
+    )
+
+    # Create schema with the column names and read CSV
+    if len(headers) > 0:
+        if (
+            not str_to_bool(str(raw_options.get("inferSchema", "false")))
+            and schema is None
+        ):
+            inferred_schema = StructType(
+                [StructField(h, StringType(), True) for h in headers]
+            )
+            df = reader.schema(inferred_schema).csv(path)
+        else:
+            df = reader.csv(path)
+        non_metadata_fields = get_non_metadata_fields(df.schema.fields)
+        if len(non_metadata_fields) != len(headers):
+            exception = Exception(
+                f"CSV header: {headers} does not conform to the schema"
+            )
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
+        if any(
+            non_metadata_fields[i].name != headers[i]
+            for i in range(len(non_metadata_fields))
+        ):
+            df = df.select(
+                [
+                    snowpark_fn.col(non_metadata_fields[i].name).alias(headers[i])
+                    for i in range(len(non_metadata_fields))
+                ]
+            )
+        return df
 
-
+    # Fallback: no headers, shouldn't reach here
+    return reader.csv(path)
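The two new helpers above are pure functions, so their behavior is easiest to see with literal inputs. A short illustrative sketch follows; the expected values for `_deduplicate_column_names_pyspark_style` come straight from its docstring, and the `_parse_csv_snowpark_options` result follows from the whitelist and the PARSE_HEADER-to-SKIP_HEADER rewrite shown above:

```python
# Illustrative only: exercising the new module-private helpers directly.
from snowflake.snowpark_connect.relation.read.map_read_csv import (
    _deduplicate_column_names_pyspark_style,
    _parse_csv_snowpark_options,
)

# Duplicated names get their global position appended; empty names become _c<position>.
assert _deduplicate_column_names_pyspark_style(["ab", "AB"], case_sensitive=False) == ["ab0", "AB1"]
assert _deduplicate_column_names_pyspark_style(["a", "", "b"], case_sensitive=True) == ["a", "_c1", "b"]

# Keys are upper-cased, only whitelisted CSV file-format options are kept, and
# PARSE_HEADER is rewritten to SKIP_HEADER=1 so it can coexist with a predefined file format.
assert _parse_csv_snowpark_options({"parse_header": True, "field_delimiter": "|", "foo": 1}) == {
    "FIELD_DELIMITER": "|",
    "SKIP_HEADER": 1,
}
```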
snowflake/snowpark_connect/relation/read/map_read_jdbc.py (+17 -5)

@@ -9,6 +9,8 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.jdbc_read_dbapi import JdbcDataFrameReader
 from snowflake.snowpark_connect.relation.read.utils import (
     Connection,
@@ -28,7 +30,9 @@ def create_connection(jdbc_options: dict[str, str]) -> Connection:
         return jaydebeapi.connect(driver, url, jdbc_options)
     except Exception as e:
         jpype.detachThreadFromJVM()
-
+        exception = Exception(f"Error connecting JDBC datasource: {e}")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
 
 def close_connection(conn: Connection) -> None:
@@ -70,17 +74,23 @@ def map_read_jdbc(
         dbtable = None
 
     if not dbtable and not query:
-
+        exception = ValueError("Include dbtable or query is required option")
+        attach_custom_error_code(exception, ErrorCodes.INSUFFICIENT_INPUT)
+        raise exception
 
     if query is not None and dbtable is not None:
-
+        exception = ValueError(
             "Not allowed to specify dbtable and query options at the same time"
         )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
 
     if query is not None and partition_column is not None:
-
+        exception = ValueError(
             "Not allowed to specify partitionColumn and query options at the same time"
        )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
 
     try:
         df = JdbcDataFrameReader(session, jdbc_options).jdbc_read_dbapi(
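These checks match Spark's JDBC source contract (`dbtable` or `query`, never both, and `partitionColumn` not combined with `query`); the change is that violations now carry an error code. A hedged sketch of reads that would pass or trip them, with the connect URL, JDBC URL, and driver as assumptions:

```python
# Hypothetical client-side reads; the endpoint and connection details are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

def jdbc_reader():
    # Fresh reader per read so options from one example do not leak into the next.
    return (
        spark.read.format("jdbc")
        .option("url", "jdbc:postgresql://host:5432/db")
        .option("driver", "org.postgresql.Driver")
    )

ok_table = jdbc_reader().option("dbtable", "public.orders").load()               # allowed
ok_query = jdbc_reader().option("query", "SELECT id FROM public.orders").load()  # allowed

# "dbtable" together with "query", or "query" together with "partitionColumn", now raises
# ValueError with INVALID_INPUT attached; omitting both raises with INSUFFICIENT_INPUT.
```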
@@ -105,4 +115,6 @@ def map_read_jdbc(
             snowpark_column_types=[f.datatype for f in df.schema.fields],
         )
     except Exception as e:
-
+        exception = Exception(f"Error accessing JDBC datasource for read: {e}")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
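Taken together, the recurring change across the readers shown here is a three-step error pattern: construct the exception, attach a machine-readable code, then raise. A minimal sketch of that pattern, with the actual connection work replaced by a placeholder callable:

```python
# Minimal sketch of the error-code pattern this release introduces; `do_connect` is a
# stand-in for the real JDBC work, not an API from the package.
from snowflake.snowpark_connect.error.error_codes import ErrorCodes
from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code


def connect_with_error_code(do_connect):
    try:
        return do_connect()
    except Exception as e:
        # Same three steps as the hunks above: build, attach a code, raise.
        exception = Exception(f"Error connecting JDBC datasource: {e}")
        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
        raise exception
```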