snowpark-connect 0.32.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/column_name_handler.py +91 -40
- snowflake/snowpark_connect/column_qualifier.py +0 -4
- snowflake/snowpark_connect/config.py +9 -0
- snowflake/snowpark_connect/expression/hybrid_column_map.py +5 -4
- snowflake/snowpark_connect/expression/literal.py +12 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +18 -4
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +150 -29
- snowflake/snowpark_connect/expression/map_unresolved_function.py +93 -55
- snowflake/snowpark_connect/relation/map_aggregate.py +156 -257
- snowflake/snowpark_connect/relation/map_column_ops.py +19 -0
- snowflake/snowpark_connect/relation/map_join.py +454 -252
- snowflake/snowpark_connect/relation/map_row_ops.py +136 -54
- snowflake/snowpark_connect/relation/map_sql.py +335 -90
- snowflake/snowpark_connect/relation/read/map_read.py +9 -1
- snowflake/snowpark_connect/relation/read/map_read_csv.py +19 -2
- snowflake/snowpark_connect/relation/read/map_read_json.py +90 -2
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +3 -0
- snowflake/snowpark_connect/relation/read/map_read_text.py +4 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
- snowflake/snowpark_connect/relation/read/utils.py +41 -0
- snowflake/snowpark_connect/relation/utils.py +50 -2
- snowflake/snowpark_connect/relation/write/map_write.py +251 -292
- snowflake/snowpark_connect/resources_initializer.py +25 -13
- snowflake/snowpark_connect/server.py +9 -24
- snowflake/snowpark_connect/type_mapping.py +2 -0
- snowflake/snowpark_connect/typed_column.py +2 -2
- snowflake/snowpark_connect/utils/context.py +0 -14
- snowflake/snowpark_connect/utils/expression_transformer.py +163 -0
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +4 -1
- snowflake/snowpark_connect/utils/udf_helper.py +1 -0
- snowflake/snowpark_connect/utils/udtf_helper.py +3 -0
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/METADATA +4 -2
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/RECORD +43 -104
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/top_level.txt +0 -0
@@ -35,6 +35,7 @@ from snowflake.snowpark_connect.relation.read.metadata_utils import (
     add_filename_metadata_to_reader,
 )
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
@@ -80,6 +81,8 @@ def map_read_json(
     dropFieldIfAllNull = snowpark_options.pop("dropfieldifallnull", False)
     batch_size = snowpark_options.pop("batchsize", 1000)

+    apply_metadata_exclusion_pattern(snowpark_options)
+
     reader = add_filename_metadata_to_reader(
         session.read.options(snowpark_options), raw_options
     )
@@ -117,6 +120,10 @@ def map_read_json(
         if unquote_if_quoted(sf.name) in columns_with_valid_contents
     ]

+    new_schema, fields_changed = validate_and_update_schema(schema)
+    if fields_changed:
+        schema = new_schema
+
     df = construct_dataframe_by_schema(
         schema, df.to_local_iterator(), session, snowpark_options, batch_size
     )
@@ -134,6 +141,84 @@ def map_read_json(
     )


+def should_drop_field(field: StructField) -> bool:
+    if isinstance(field.datatype, StructType):
+        # "a" : {} => drop the field
+        if len(field.datatype.fields) == 0:
+            return True
+    elif (
+        isinstance(field.datatype, ArrayType)
+        and field.datatype.element_type is not None
+        and isinstance(field.datatype.element_type, StructType)
+    ):
+        if len(field.datatype.element_type.fields) == 0:
+            # "a" : [{}] => drop the field
+            return True
+    return False
+
+
+# Validate the schema to ensure it is valid for Snowflake
+# Handles these cases:
+# 1. Drops StructField([])
+# 2. Drops ArrayType(StructType([]))
+# 3. ArrayType() -> ArrayType(StringType())
+def validate_and_update_schema(schema: StructType | None) -> (StructType | None, bool):
+    if not isinstance(schema, StructType):
+        return schema, False
+    new_fields = []
+    fields_changed = False
+    for sf in schema.fields:
+        if should_drop_field(sf):
+            fields_changed = True
+            continue
+        if isinstance(sf.datatype, StructType):
+            # If the schema is a struct, validate the child schema
+            if len(sf.datatype.fields) == 0:
+                # No fields in the struct, drop the field
+                fields_changed = True
+                continue
+            child_field = StructField(sf.name, sf.datatype, sf.nullable)
+            # Recursively validate the child schema
+            child_field.datatype, child_field_changes = validate_and_update_schema(
+                sf.datatype
+            )
+            if should_drop_field(child_field):
+                fields_changed = True
+                continue
+            new_fields.append(child_field)
+            fields_changed = fields_changed or child_field_changes
+        elif isinstance(sf.datatype, ArrayType):
+            # If the schema is an array, validate the element schema
+            if sf.datatype.element_type is not None and isinstance(
+                sf.datatype.element_type, StructType
+            ):
+                # If the element schema is a struct, validate the element schema
+                if len(sf.datatype.element_type.fields) == 0:
+                    # No fields in the struct, drop the field
+                    fields_changed = True
+                    continue
+                else:
+                    # Recursively validate the element schema
+                    element_schema, element_field_changes = validate_and_update_schema(
+                        sf.datatype.element_type
+                    )
+                    if element_field_changes:
+                        sf.datatype.element_type = element_schema
+                        fields_changed = True
+                    if should_drop_field(sf):
+                        fields_changed = True
+                        continue
+            elif sf.datatype.element_type is None:
+                fields_changed = True
+                sf.datatype.element_type = StringType()
+            new_fields.append(sf)
+        else:
+            new_fields.append(sf)
+    if fields_changed:
+        schema.fields = new_fields
+    return schema, fields_changed
+
+
 def merge_json_schema(
     content: typing.Any,
     schema: StructType | None,
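
For reference, a minimal sketch of the schema cleanup above. It assumes validate_and_update_schema can be imported from snowflake.snowpark_connect.relation.read.map_read_json (the module changed in this hunk); the field names are made up for illustration.

    from snowflake.snowpark.types import ArrayType, StringType, StructField, StructType
    from snowflake.snowpark_connect.relation.read.map_read_json import (
        validate_and_update_schema,  # assumed import path, see the hunk above
    )

    schema = StructType(
        [
            StructField("id", StringType()),                      # kept as-is
            StructField("empty_obj", StructType([])),             # "a": {}   -> dropped
            StructField("empty_arr", ArrayType(StructType([]))),  # "a": [{}] -> dropped
            StructField("untyped_arr", ArrayType()),              # ArrayType() -> ArrayType(StringType())
        ]
    )

    cleaned, changed = validate_and_update_schema(schema)
    print(changed)              # True
    print(len(cleaned.fields))  # 2 -- only "id" and "untyped_arr" survive, the latter now ArrayType(StringType())
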
@@ -378,8 +463,11 @@ def construct_row_by_schema(
         inner_schema = schema.element_type
         if isinstance(content, str):
             content = json.loads(content)
-
-
+        if inner_schema is not None:
+            for ele in content:
+                result.append(
+                    construct_row_by_schema(ele, inner_schema, snowpark_options)
+                )
         return result
     elif isinstance(schema, DateType):
         return cast_to_match_snowpark_type(
@@ -29,6 +29,7 @@ from snowflake.snowpark_connect.relation.read.metadata_utils import (
 )
 from snowflake.snowpark_connect.relation.read.reader_config import ReaderWriterConfig
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     rename_columns_as_snowflake_standard,
 )
 from snowflake.snowpark_connect.utils.telemetry import (
@@ -57,6 +58,8 @@ def map_read_parquet(
     assert schema is None, "Read PARQUET does not support user schema"
     assert len(paths) > 0, "Read PARQUET expects at least one path"

+    apply_metadata_exclusion_pattern(snowpark_options)
+
     reader = add_filename_metadata_to_reader(
         session.read.options(snowpark_options), raw_options
     )
@@ -26,6 +26,10 @@ def get_file_paths_from_stage(
 ) -> typing.List[str]:
     files_paths = []
     for listed_path_row in session.sql(f"LIST {path}").collect():
+        # Skip _SUCCESS marker files
+        if listed_path_row[0].endswith("_SUCCESS"):
+            continue
+
         listed_path = listed_path_row[0].split("/")
         if listed_path_row[0].startswith("s3://") or listed_path_row[0].startswith(
             "s3a://"
@@ -126,6 +126,7 @@ CSV_READ_SUPPORTED_OPTIONS = lowercase_set(
         "compression",
         # "escapeQuotes",
         # "quoteAll",
+        "rowsToInferSchema",  # Snowflake specific option, number of rows to infer schema
     }
 )

@@ -201,6 +202,15 @@ def csv_convert_to_snowpark_args(snowpark_config: dict[str, Any]) -> dict[str, A
     if snowpark_config["escape"] and snowpark_config["escape"] == "\\":
         snowpark_config["escape"] = "\\\\"

+    # Snowflake specific option, number of rows to infer schema for CSV files
+    if "rowstoinferschema" in snowpark_config:
+        rows_to_infer_schema = snowpark_config["rowstoinferschema"]
+        del snowpark_config["rowstoinferschema"]
+        snowpark_config["INFER_SCHEMA_OPTIONS"] = {
+            "MAX_RECORDS_PER_FILE": int(rows_to_infer_schema),
+            "USE_RELAXED_TYPES": True,
+        }
+
     # Rename the keys to match the Snowpark configuration.
     for spark_arg, snowpark_arg in renamed_args.items():
         if spark_arg not in snowpark_config:
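
A hypothetical usage sketch for the new rowsToInferSchema CSV option; the Spark Connect URL and stage path below are placeholders, and the option key is handled case-insensitively on the Snowpark Connect side (it is compared as rowstoinferschema above).

    from pyspark.sql import SparkSession

    # Placeholder remote URL; in practice this points at a running Snowpark Connect endpoint.
    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

    df = (
        spark.read
        .option("header", True)
        # Translated to INFER_SCHEMA_OPTIONS = {MAX_RECORDS_PER_FILE: 1000, USE_RELAXED_TYPES: True}
        .option("rowsToInferSchema", 1000)
        .csv("@my_stage/csv_data/")  # placeholder stage path
    )
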
@@ -40,6 +40,47 @@ DATA_SOURCE_SQL_COMMENT = (
 INDEXED_COLUMN_NAME_PATTERN = re.compile(r"(^\"c)(\d+)(\"$)")


+def apply_metadata_exclusion_pattern(options: dict) -> None:
+    """
+    Exclude metadata and hidden files from reads, matching Spark's behavior.
+
+    Automatically filters out internal metadata files that should never be read as data:
+    - _SUCCESS, _metadata, _common_metadata (Spark/Parquet metadata)
+    - .crc (Hadoop checksum files)
+    - .DS_Store (macOS system files)
+    - Any file starting with _ or .
+
+    Pattern used: ".*/[^_.][^/]*$|^[^_.][^/]*$"
+    - Matches files whose filename does NOT start with _ or .
+    - Works at any directory depth (flat or partitioned data)
+    - Allows files with or without extensions
+
+    Examples of excluded files:
+    ❌ _SUCCESS, _metadata, _common_metadata (Spark/Parquet metadata)
+    ❌ .crc, .DS_Store, .hidden (system/hidden files)
+    ❌ year=2024/_SUCCESS (metadata in partitioned directories)
+
+    Examples of allowed files:
+    ✅ part-00000.parquet, data.csv, output.json (data files)
+    ✅ success, myfile (files without extensions that don't start with _ or .)
+    ✅ year=2024/month=01/part-00000.parquet (partitioned data)
+
+    User pattern handling:
+    - No pattern, "*", or ".*" → apply the metadata exclusion pattern
+    - Custom patterns → keep the user-provided pattern
+
+    Leak cases (the user explicitly requests metadata files; these are intentional):
+    ⚠️ "_*" → matches _SUCCESS, _metadata (explicit underscore prefix)
+    ⚠️ "*SUCCESS*" → matches _SUCCESS (broad wildcard side effect)
+    ⚠️ "[_.].*" → matches _SUCCESS, .crc (character class includes _)
+
+    Args:
+        options: Dictionary of Snowpark read options (modified in place)
+    """
+    if "PATTERN" not in options or options["PATTERN"] in ("*", ".*"):
+        options["PATTERN"] = ".*/[^_.][^/]*$|^[^_.][^/]*$"
+
+
 def subtract_one(match: re.Match[str]) -> str:
     """Spark column names are 0 indexed, Snowpark is 1 indexed."""
     return f"_c{str(int(match.group(2)) - 1)}"
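
A standalone sketch of how the default exclusion regex behaves, using Python's re locally as a stand-in for Snowflake's server-side PATTERN matching on staged file paths:

    import re

    # Default pattern set by apply_metadata_exclusion_pattern when the user supplies no PATTERN.
    METADATA_EXCLUSION = re.compile(r".*/[^_.][^/]*$|^[^_.][^/]*$")

    for path in [
        "part-00000.parquet",                     # kept
        "year=2024/month=01/part-00000.parquet",  # kept (partitioned data)
        "success",                                # kept (no extension, no leading _ or .)
        "_SUCCESS",                               # excluded (leading underscore)
        "year=2024/_SUCCESS",                     # excluded
        ".DS_Store",                              # excluded (leading dot)
    ]:
        print(path, "kept" if METADATA_EXCLUSION.fullmatch(path) else "excluded")
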
@@ -174,6 +174,7 @@ def generate_spark_compatible_filename(
     attempt_number: int = 0,
     compression: str = None,
     format_ext: str = "parquet",
+    shared_uuid: str = None,
 ) -> str:
     """Generate a Spark-compatible filename following the convention:
     part-<task-id>-<uuid>-c<attempt-number>.<compression>.<format>
@@ -183,12 +184,13 @@ def generate_spark_compatible_filename(
         attempt_number: Attempt number (usually 0)
         compression: Compression type (e.g., 'snappy', 'gzip', 'none')
         format_ext: File format extension (e.g., 'parquet', 'csv', 'json')
+        shared_uuid: Shared UUID for the file

     Returns:
         A filename string following Spark's naming convention
     """
-    #
-    file_uuid = str(uuid.uuid4())
+    # Use the shared UUID if provided, otherwise generate a new one for uniqueness
+    file_uuid = shared_uuid or str(uuid.uuid4())

     # Format task ID with leading zeros (5 digits)
     formatted_task_id = f"{task_id:05d}"
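
An illustrative sketch of the filename convention documented above and of why a shared UUID is useful: every part file of one write can carry the same UUID instead of an unrelated one per file. The helper below is rebuilt from the documented convention rather than taken from the internal implementation, and the three-digit attempt formatting is an assumption.

    import uuid

    def spark_style_name(task_id: int, file_uuid: str, attempt: int = 0,
                         compression: str = "snappy", fmt: str = "parquet") -> str:
        # part-<task-id>-<uuid>-c<attempt-number>.<compression>.<format>
        return f"part-{task_id:05d}-{file_uuid}-c{attempt:03d}.{compression}.{fmt}"

    shared = str(uuid.uuid4())  # one UUID shared across all parts of a single write
    for task_id in range(3):
        print(spark_style_name(task_id, shared))
    # part-00000-<shared-uuid>-c000.snappy.parquet
    # part-00001-<shared-uuid>-c000.snappy.parquet
    # part-00002-<shared-uuid>-c000.snappy.parquet
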
@@ -284,3 +286,49 @@ def snowpark_functions_col(name: str, column_map: ColumnNameMap) -> snowpark.Col
     """
     is_qualified_name = name not in column_map.get_snowpark_columns()
     return snowpark_fn.col(name, _is_qualified_name=is_qualified_name)
+
+
+def is_aggregate_function(func_name: str) -> bool:
+    """
+    Check if a function name is an aggregate function.
+
+    Uses a hybrid approach:
+    1. First checks PySpark's docstring convention (docstrings starting with "Aggregate function:")
+    2. Falls back to a hardcoded list for functions with missing/incorrect docstrings
+
+    This ensures comprehensive coverage while automatically supporting new PySpark aggregate functions.
+
+    Args:
+        func_name: The function name to check (case-insensitive)
+
+    Returns:
+        True if the function is an aggregate function, False otherwise
+    """
+    try:
+        import pyspark.sql.functions as pyspark_functions
+
+        # TODO:
+        """
+        Check whether we can leverage Scala classes to determine aggregate functions:
+        https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala#L207
+        """
+
+        # Try the PySpark docstring approach first (covers most aggregate functions)
+        pyspark_func = getattr(pyspark_functions, func_name.lower(), None)
+        if pyspark_func and pyspark_func.__doc__:
+            if pyspark_func.__doc__.lstrip().startswith("Aggregate function:"):
+                return True
+
+        # Fallback list for aggregate functions with missing/incorrect docstrings
+        # These are known aggregate functions that don't have proper docstring markers
+        fallback_aggregates = {
+            "percentile_cont",
+            "percentile_disc",
+            "any_value",
+            "grouping",
+            "grouping_id",
+        }
+        return func_name.lower() in fallback_aggregates
+
+    except Exception:
+        return False
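
A standalone sketch of the docstring heuristic used by is_aggregate_function, reimplemented locally rather than imported from the package; the expected outputs assume recent PySpark versions, where aggregate functions document themselves with an "Aggregate function:" prefix.

    import pyspark.sql.functions as F

    def looks_like_aggregate(func_name: str) -> bool:
        # Mirrors the docstring check above; missing functions or docstrings yield False.
        fn = getattr(F, func_name.lower(), None)
        doc = getattr(fn, "__doc__", None) or ""
        return doc.lstrip().startswith("Aggregate function:")

    print(looks_like_aggregate("sum"))    # True: "Aggregate function: returns the sum ..."
    print(looks_like_aggregate("upper"))  # False: a string function
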