snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +717 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +309 -26
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/error_utils.py +28 -0
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +224 -15
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +86 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
- snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
- snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +171 -48
- snowflake/snowpark_connect/server.py +528 -473
- snowflake/snowpark_connect/server_common/__init__.py +503 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/type_support.py +130 -0
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +195 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +192 -40
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,170 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+"""
+Utilities for handling internal metadata columns in file-based DataFrames.
+"""
+
+import os
+
+import pandas
+from pyspark.errors.exceptions.base import AnalysisException
+
+from snowflake import snowpark
+from snowflake.snowpark.column import METADATA_FILENAME
+from snowflake.snowpark.functions import col
+from snowflake.snowpark.types import StructField, StructType
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
+
+# Constant for the metadata filename column name
+METADATA_FILENAME_COLUMN = "METADATA$FILENAME"
+
+
+def add_filename_metadata_to_reader(
+    reader: snowpark.DataFrameReader,
+    options: dict | None = None,
+) -> snowpark.DataFrameReader:
+    """
+    Add filename metadata to a DataFrameReader based on configuration.
+
+    Args:
+        reader: Snowpark DataFrameReader instance
+        options: Dictionary of options to check for metadata configuration
+
+    Returns:
+        DataFrameReader with filename metadata enabled if configured, otherwise unchanged
+    """
+    # NOTE: SNOWPARK_POPULATE_FILE_METADATA_DEFAULT is an internal environment variable
+    # used only for CI testing to verify no metadata columns leak in regular file operations.
+    # This environment variable should NOT be exposed to end users. Users should only use snowpark.populateFileMetadata
+    # to enable metadata population.
+    metadata_default = os.environ.get(
+        "SNOWPARK_POPULATE_FILE_METADATA_DEFAULT", "false"
+    )
+
+    populate_metadata = (
+        options.get("snowpark.populateFileMetadata", metadata_default)
+        if options
+        else metadata_default
+    ).lower() == "true"
+
+    if populate_metadata:
+        return reader.with_metadata(METADATA_FILENAME)
+    else:
+        return reader
+
+
+def get_non_metadata_fields(schema_fields: list[StructField]) -> list[StructField]:
+    """
+    Filter out METADATA$FILENAME fields from a list of schema fields.
+
+    Args:
+        schema_fields: List of StructField objects from a DataFrame schema
+
+    Returns:
+        List of StructField objects excluding METADATA$FILENAME
+    """
+    return [field for field in schema_fields if field.name != METADATA_FILENAME_COLUMN]
+
+
+def get_non_metadata_column_names(schema_fields: list[StructField]) -> list[str]:
+    """
+    Get column names from schema fields, excluding METADATA$FILENAME.
+
+    Args:
+        schema_fields: List of StructField objects from a DataFrame schema
+
+    Returns:
+        List of column names (strings) excluding METADATA$FILENAME
+    """
+    return [
+        field.name for field in schema_fields if field.name != METADATA_FILENAME_COLUMN
+    ]
+
+
+def filter_metadata_column_name(column_names: list[str]) -> list[str]:
+    """
+    Get column names from column_names, excluding METADATA$FILENAME.
+
+    Returns:
+        List of column names (strings) excluding METADATA$FILENAME
+    """
+    return [
+        col_name for col_name in column_names if col_name != METADATA_FILENAME_COLUMN
+    ]
+
+
+def without_internal_columns(
+    result_container: DataFrameContainer | pandas.DataFrame | None,
+) -> DataFrameContainer | pandas.DataFrame | None:
+    """
+    Filters internal columns like:
+    * METADATA$FILENAME from DataFrame container for execution and write operations
+    * hidden columns needed for outer joins implementation
+
+    Args:
+        result_container: DataFrameContainer or pandas DataFrame to filter
+
+    Returns:
+        Filtered container (callers can access dataframe via container.dataframe)
+    """
+    # Handle pandas DataFrame case - return as-is
+    if isinstance(result_container, pandas.DataFrame):
+        return result_container
+
+    if result_container is None:
+        return None
+
+    result_container = result_container.without_hidden_columns()
+    result_df = result_container.dataframe
+    if not isinstance(result_df, snowpark.DataFrame):
+        return result_container
+
+    df_columns = result_container.column_map.get_snowpark_columns()
+    has_metadata_filename = any(name == METADATA_FILENAME_COLUMN for name in df_columns)
+
+    if not has_metadata_filename:
+        return result_container
+
+    non_metadata_columns = filter_metadata_column_name(df_columns)
+
+    if len(non_metadata_columns) == 0:
+        # DataFrame contains only metadata columns (METADATA$FILENAME), no actual data columns remaining.
+        # We don't have a way to return an empty dataframe.
+        exception = AnalysisException(
+            "[DATAFRAME_MISSING_DATA_COLUMNS] Cannot perform operation on DataFrame that contains no data columns."
+        )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+        raise exception
+
+    filtered_df = result_df.select([col(name) for name in non_metadata_columns])
+
+    original_spark_columns = result_container.column_map.get_spark_columns()
+    original_snowpark_columns = result_container.column_map.get_snowpark_columns()
+
+    filtered_spark_columns = []
+    filtered_snowpark_columns = []
+
+    for i, colname in enumerate(df_columns):
+        if colname != METADATA_FILENAME_COLUMN:
+            filtered_spark_columns.append(original_spark_columns[i])
+            filtered_snowpark_columns.append(original_snowpark_columns[i])
+
+    new_container = DataFrameContainer.create_with_column_mapping(
+        dataframe=filtered_df,
+        spark_column_names=filtered_spark_columns,
+        snowpark_column_names=filtered_snowpark_columns,
+        column_metadata=result_container.column_map.column_metadata,
+        table_name=result_container.table_name,
+        alias=result_container.alias,
+        partition_hint=result_container.partition_hint,
+        # we don't want to evaluate `filtered_df` schema since it will always trigger a describe query
+        cached_schema_getter=lambda: StructType(
+            [f for f in result_df.schema if f.name != METADATA_FILENAME_COLUMN]
+        ),
+    )
+
+    return new_container
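The new module above wires Snowflake's METADATA$FILENAME column into file reads behind an opt-in flag, and strips it (plus join-related hidden columns) again before results leave the server. Below is a minimal sketch of the reader-side helper; it assumes a Snowpark session with connection parameters already configured, and that the hunk above is the relation/read/metadata_utils.py file from the file list (both are assumptions for illustration).

```python
# Illustrative sketch only. The import path is inferred from the file list above;
# the session setup assumes Snowflake connection parameters are already configured.
from snowflake import snowpark
from snowflake.snowpark_connect.relation.read.metadata_utils import (
    add_filename_metadata_to_reader,
)

session = snowpark.Session.builder.getOrCreate()

# Default: the reader is returned unchanged and no metadata column is projected.
plain_reader = add_filename_metadata_to_reader(session.read, options={})

# Opt-in: snowpark.populateFileMetadata=true configures the reader to also
# project METADATA$FILENAME alongside the data columns.
metadata_reader = add_filename_metadata_to_reader(
    session.read, options={"snowpark.populateFileMetadata": "true"}
)
```

On the way out, `without_internal_columns` drops the same column (and hidden join columns) so execution results and writes never surface it.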
@@ -5,7 +5,7 @@
 from dataclasses import dataclass
 from typing import Any
 
-from snowflake.snowpark_connect.config import str_to_bool
+from snowflake.snowpark_connect.config import global_config, str_to_bool
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 
 
@@ -126,6 +126,8 @@ CSV_READ_SUPPORTED_OPTIONS = lowercase_set(
         "compression",
         # "escapeQuotes",
         # "quoteAll",
+        "rowsToInferSchema",  # Snowflake specific option, number of rows to infer schema
+        "relaxTypesToInferSchema",  # Snowflake specific option, whether to relax types to infer schema
     }
 )
 
@@ -201,6 +203,21 @@ def csv_convert_to_snowpark_args(snowpark_config: dict[str, Any]) -> dict[str, A
     if snowpark_config["escape"] and snowpark_config["escape"] == "\\":
         snowpark_config["escape"] = "\\\\"
 
+    # Snowflake specific option, number of rows to infer schema for CSV files
+    if "rowstoinferschema" in snowpark_config:
+        rows_to_infer_schema = snowpark_config["rowstoinferschema"]
+        del snowpark_config["rowstoinferschema"]
+        relax_types_to_infer_schema = True
+        if "relaxtypestoinferschema" in snowpark_config:
+            relax_types_to_infer_schema = str_to_bool(
+                str(snowpark_config["relaxtypestoinferschema"])
+            )
+            del snowpark_config["relaxtypestoinferschema"]
+        snowpark_config["INFER_SCHEMA_OPTIONS"] = {
+            "MAX_RECORDS_PER_FILE": int(rows_to_infer_schema),
+            "USE_RELAXED_TYPES": relax_types_to_infer_schema,
+        }
+
     # Rename the keys to match the Snowpark configuration.
     for spark_arg, snowpark_arg in renamed_args.items():
         if spark_arg not in snowpark_config:
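The two new CSV options are Snowflake-specific: once lower-cased by the option plumbing, `rowsToInferSchema` and `relaxTypesToInferSchema` are folded into Snowflake's `INFER_SCHEMA_OPTIONS`. Here is a standalone sketch of that conversion; it mirrors the logic in the hunk above rather than importing the real module, and approximates `str_to_bool` with a plain string comparison.

```python
# Standalone sketch mirroring the conversion above (not the real module).
def convert_infer_schema_options(snowpark_config: dict) -> dict:
    if "rowstoinferschema" in snowpark_config:
        rows = snowpark_config.pop("rowstoinferschema")
        relax = str(snowpark_config.pop("relaxtypestoinferschema", "true")).lower() == "true"
        snowpark_config["INFER_SCHEMA_OPTIONS"] = {
            "MAX_RECORDS_PER_FILE": int(rows),
            "USE_RELAXED_TYPES": relax,
        }
    return snowpark_config


print(convert_infer_schema_options({"rowstoinferschema": "100", "relaxtypestoinferschema": "false"}))
# {'INFER_SCHEMA_OPTIONS': {'MAX_RECORDS_PER_FILE': 100, 'USE_RELAXED_TYPES': False}}
```

On the Spark side these would presumably be passed as reader options (e.g. `.option("rowsToInferSchema", 100)`), since both names were added to CSV_READ_SUPPORTED_OPTIONS above.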
@@ -339,7 +356,7 @@ class JsonReaderConfig(ReaderWriterConfig):
             "dropFieldIfAllNull",
             "encoding",
             # "locale",
-
+            "pathGlobFilter",
             # "recursiveFileLookup",
             # "modifiedBefore",
             # "modifiedAfter",
@@ -366,6 +383,7 @@ class JsonReaderConfig(ReaderWriterConfig):
             "dateFormat": "DATE_FORMAT",
             "timestampFormat": "TIMESTAMP_FORMAT",
             "multiLine": "STRIP_OUTER_ARRAY",
+            "pathGlobFilter": "PATTERN",
         }
         renamed_args = lowercase_dict_keys(renamed_args)
         snowpark_config = super().convert_to_snowpark_args()
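With the `pathGlobFilter` → `PATTERN` mapping in place, Spark's standard file-filter option now reaches Snowflake's file-pattern filter for JSON reads (and, in the next hunks, Parquet). A hedged usage sketch follows; the endpoint and path are placeholders, and since the value is forwarded as-is, Snowflake will interpret it as the PATTERN regular expression rather than a shell glob.

```python
from pyspark.sql import SparkSession

# Hypothetical Spark Connect endpoint for a Snowpark Connect server.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# pathGlobFilter is forwarded verbatim as Snowflake's PATTERN option (a regex),
# so a regex-style value is used here rather than a shell glob.
df = (
    spark.read
    .option("pathGlobFilter", ".*[.]json")
    .json("s3://my-bucket/landing/")  # hypothetical location
)
```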
@@ -385,7 +403,7 @@ class ParquetReaderConfig(ReaderWriterConfig):
             default_config={},
             supported_options={
                 # "mergeSchema",
-
+                "pathGlobFilter",
                 # "recursiveFileLookup",
                 # "modifiedBefore",
                 # "modifiedAfter",
@@ -402,10 +420,31 @@ class ParquetReaderConfig(ReaderWriterConfig):
         )
 
     def convert_to_snowpark_args(self) -> dict[str, Any]:
+        renamed_args = {
+            "pathGlobFilter": "PATTERN",
+        }
+        renamed_args = lowercase_dict_keys(renamed_args)
         snowpark_args = super().convert_to_snowpark_args()
 
+        for spark_arg, snowpark_arg in renamed_args.items():
+            if spark_arg not in snowpark_args:
+                continue
+            snowpark_args[snowpark_arg] = snowpark_args[spark_arg]
+            del snowpark_args[spark_arg]
+
         # Should be determined by spark.sql.parquet.binaryAsString, but currently Snowpark Connect only supports
         # the default value (false). TODO: Add support for spark.sql.parquet.binaryAsString equal to "true".
         snowpark_args["BINARY_AS_TEXT"] = False
 
+        # Set USE_VECTORIZED_SCANNER from global config. This will become the default in a future BCR.
+        snowpark_args["USE_VECTORIZED_SCANNER"] = global_config._get_config_setting(
+            "snowpark.connect.parquet.useVectorizedScanner"
+        )
+
+        # Set USE_LOGICAL_TYPE from global config to properly handle Parquet logical types like TIMESTAMP.
+        # Without this, Parquet TIMESTAMP (INT64 physical) is incorrectly read as NUMBER(38,0).
+        snowpark_args["USE_LOGICAL_TYPE"] = global_config._get_config_setting(
+            "snowpark.connect.parquet.useLogicalType"
+        )
+
         return snowpark_args
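The two Parquet copy options are now driven by Snowpark Connect configuration keys (`snowpark.connect.parquet.useVectorizedScanner` and `snowpark.connect.parquet.useLogicalType`). The diff only shows the server reading them through `global_config._get_config_setting`; how a client supplies them is not shown, so the sketch below assumes they can be set as ordinary session configs, with `spark` being the session from the previous sketch.

```python
# Assumption: these keys are settable as session configs from the Spark Connect
# client; only the key names are confirmed by the diff.
spark.conf.set("snowpark.connect.parquet.useVectorizedScanner", "true")
spark.conf.set("snowpark.connect.parquet.useLogicalType", "true")

# Subsequent Parquet reads then carry USE_VECTORIZED_SCANNER / USE_LOGICAL_TYPE,
# so e.g. Parquet TIMESTAMP (INT64 physical) columns come back as timestamps
# rather than NUMBER(38,0), per the comment in the hunk above.
df = spark.read.parquet("s3://my-bucket/events/")  # hypothetical location
```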
@@ -40,6 +40,47 @@ DATA_SOURCE_SQL_COMMENT = (
 INDEXED_COLUMN_NAME_PATTERN = re.compile(r"(^\"c)(\d+)(\"$)")
 
 
+def apply_metadata_exclusion_pattern(options: dict) -> None:
+    """
+    Exclude metadata and hidden files from reads, matching Spark's behavior.
+
+    Automatically filters out internal metadata files that should never be read as data:
+    - _SUCCESS, _metadata, _common_metadata (Spark/Parquet metadata)
+    - .crc (Hadoop checksum files)
+    - .DS_Store (macOS system files)
+    - Any file starting with _ or .
+
+    Pattern used: ".*/[^_.][^/]*$|^[^_.][^/]*$"
+    - Matches files where filename does NOT start with _ or .
+    - Works at any directory depth (flat or partitioned data)
+    - Allows files with or without extensions
+
+    Examples of excluded files:
+    ❌ _SUCCESS, _metadata, _common_metadata (Spark/Parquet metadata)
+    ❌ .crc, .DS_Store, .hidden (system/hidden files)
+    ❌ year=2024/_SUCCESS (metadata in partitioned directories)
+
+    Examples of allowed files:
+    ✅ part-00000.parquet, data.csv, output.json (data files)
+    ✅ success, myfile (files without extensions, don't start with _ or .)
+    ✅ year=2024/month=01/part-00000.parquet (partitioned data)
+
+    User pattern handling:
+    - No pattern or "*" or ".*" → Apply metadata exclusion
+    - Custom patterns → Default to user provided pattern.
+
+    Leak cases (user explicitly requests metadata files and are intentional):
+    ⚠️ "_*" → Matches _SUCCESS, _metadata (explicit underscore prefix)
+    ⚠️ "*SUCCESS*" → Matches _SUCCESS (broad wildcard side effect)
+    ⚠️ "[_.].*" → Matches _SUCCESS, .crc (character class includes _)
+
+    Args:
+        options: Dictionary of Snowpark read options (modified in place)
+    """
+    if "PATTERN" not in options or options["PATTERN"] in ("*", ".*"):
+        options["PATTERN"] = ".*/[^_.][^/]*$|^[^_.][^/]*$"
+
+
 def subtract_one(match: re.Match[str]) -> str:
     """Spark column names are 0 indexed, Snowpark is 1 indexed."""
     return f"_c{str(int(match.group(2)) - 1)}"
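The docstring above spells out the exclusion rule in prose, and the pattern itself is easy to sanity-check standalone. Snowflake applies PATTERN as a regular expression over the staged file path, which `re.fullmatch` approximates here:

```python
import re

# Exclusion pattern from apply_metadata_exclusion_pattern above.
PATTERN = r".*/[^_.][^/]*$|^[^_.][^/]*$"

for path in [
    "part-00000.parquet",                     # data file -> kept
    "year=2024/month=01/part-00000.parquet",  # partitioned data -> kept
    "_SUCCESS",                               # Spark marker -> excluded
    "year=2024/_SUCCESS",                     # marker inside a partition -> excluded
    ".DS_Store",                              # hidden file -> excluded
]:
    kept = re.fullmatch(PATTERN, path) is not None
    print(f"{path!r:45} {'kept' if kept else 'excluded'}")
```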
@@ -73,13 +114,17 @@ def rename_columns_as_snowflake_standard(
         return df, []
 
     new_columns = make_column_names_snowpark_compatible(df.columns, plan_id)
-
-    df.
-        *(df.col(orig).alias(alias) for orig, alias in zip(df.columns, new_columns))
-    ),
-    new_columns,
+    result_df = df.select(
+        *(df.col(orig).alias(alias) for orig, alias in zip(df.columns, new_columns))
     )
 
+    # do not flatten initial rename when reading table
+    # TODO: remove once SNOW-2203826 is done
+    if result_df._select_statement is not None:
+        result_df._select_statement.flatten_disabled = True
+
+    return result_df, new_columns
+
 
 class Connection(Protocol):
     """External datasource connection created from user-input create_connection function."""
@@ -5,17 +5,20 @@
 import os
 
 from fsspec.core import url_to_fs
+from pyspark.errors.exceptions.base import AnalysisException
 from s3fs.core import S3FileSystem
 
 from snowflake import snowpark
 from snowflake.snowpark.session import Session
 from snowflake.snowpark_connect.config import sessions_config
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.io_utils import (
     get_cloud_from_url,
     parse_azure_url,
 )
 from snowflake.snowpark_connect.relation.utils import random_string
-from snowflake.snowpark_connect.utils.context import
+from snowflake.snowpark_connect.utils.context import get_spark_session_id
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 
 
@@ -33,37 +36,44 @@ def get_paths_from_stage(
 
     # TODO : What if GCP?
     # TODO: What if already stage path?
-
-
-
-            _, bucket_name, path = parse_azure_url(p)
-            rewrite_paths.append(f"{stage_name}/{path}")
-        paths = rewrite_paths
-    else:
-        filesystem, parsed_path = url_to_fs(paths[0])
-        if isinstance(filesystem, S3FileSystem):  # aws
-            # Remove bucket name from the path since the stage name will replace
-            # the bucket name in the path.
-            paths = [
-                f"{stage_name}/{'/'.join(url_to_fs(p)[1].split('/')[1:])}"
-                for p in paths
-            ]
-        else:  # local
-            # For local files, we need to preserve directory structure for partitioned data
-            # Instead of just using basename, we'll use the last few path components
-            new_paths = []
+    match get_cloud_from_url(paths[0]):
+        case "azure":
+            rewrite_paths = []
             for p in paths:
-
-
-
-
-
-
-
-
-
-
-
+                _, bucket_name, path = parse_azure_url(p)
+                rewrite_paths.append(f"{stage_name}/{path}")
+            paths = rewrite_paths
+        case "gcp":
+            exception = AnalysisException(
+                "You must configure an integration for Google Cloud Storage to perform I/O operations rather than accessing the URL directly. Reference: https://docs.snowflake.com/en/user-guide/data-load-gcs-config"
+            )
+            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+            raise exception
+        case _:
+            filesystem, parsed_path = url_to_fs(paths[0])
+            if isinstance(filesystem, S3FileSystem):  # aws
+                # Remove bucket name from the path since the stage name will replace
+                # the bucket name in the path.
+                paths = [
+                    f"{stage_name}/{'/'.join(url_to_fs(p)[1].split('/')[1:])}"
+                    for p in paths
+                ]
+            else:  # local
+                # For local files, we need to preserve directory structure for partitioned data
+                # Instead of just using basename, we'll use the last few path components
+                new_paths = []
+                for p in paths:
+                    # Split the path and take the last 2-3 components to preserve structure
+                    # but avoid very long paths
+                    path_parts = p.split(os.sep)
+                    if len(path_parts) >= 2:
+                        # Take last 2 components (e.g., "base_case/x=abc")
+                        relative_path = "/".join(path_parts[-2:])
+                    else:
+                        # Single component, use basename
+                        relative_path = os.path.basename(p)
+                    new_paths.append(f"{stage_name}/{relative_path}")
+                paths = new_paths
 
     return paths
 
|
|
|
89
99
|
self,
|
|
90
100
|
url: str = "/",
|
|
91
101
|
) -> str:
|
|
92
|
-
spark_session_id =
|
|
102
|
+
spark_session_id = get_spark_session_id()
|
|
93
103
|
|
|
94
104
|
match get_cloud_from_url(url):
|
|
95
105
|
case "azure":
|
|
@@ -102,15 +112,21 @@ class StageLocator:
                 sql_query = f"CREATE OR REPLACE TEMP STAGE {stage_name[1:]} URL='azure://{account}.blob.core.windows.net/{bucket_name}'"
 
                 credential_session_key = (
-                    f"fs.azure.sas.
+                    f"fs.azure.sas.fixed.token.{account}.dfs.core.windows.net",
+                    f"fs.azure.sas.{bucket_name}.{account}.blob.core.windows.net",
                 )
                 credential = sessions_config.get(spark_session_id, None)
-
-
-
-
-
-
+                sas_token = None
+                for session_key in credential_session_key:
+                    if (
+                        credential is not None
+                        and credential.get(session_key) is not None
+                        and credential.get(session_key).strip() != ""
+                    ):
+                        sas_token = credential.get(session_key)
+                        break
+                if sas_token is not None:
+                    sql_query += f" CREDENTIALS = (AZURE_SAS_TOKEN = '{sas_token}')"
 
                 logger.info(self.session.sql(sql_query).collect())
                 self.stages_for_azure[bucket_name] = stage_name
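For Azure, the temp stage now picks up a SAS token from the per-session config, checking the account-wide `fs.azure.sas.fixed.token.<account>.dfs.core.windows.net` key and the container-scoped `fs.azure.sas.<container>.<account>.blob.core.windows.net` key. The key shapes come from the diff; how they reach `sessions_config` (e.g. via client-side `spark.conf.set`) is an assumption, and every value below is a placeholder.

```python
# Assumption: session configs set on the Spark Connect client end up in
# sessions_config on the server. Account, container, and token are placeholders.
spark.conf.set(
    "fs.azure.sas.fixed.token.myaccount.dfs.core.windows.net", "<sas-token>"
)
# Alternatively, scoped to one container:
spark.conf.set(
    "fs.azure.sas.mycontainer.myaccount.blob.core.windows.net", "<sas-token>"
)
```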
@@ -128,24 +144,44 @@ class StageLocator:
                 # but the rest of the time it's used, it does. We just drop it here.
                 sql_query = f"CREATE OR REPLACE TEMP STAGE {stage_name[1:]} URL='s3://{parsed_path.split('/')[0]}'"
                 credential = sessions_config.get(spark_session_id, None)
-                if
-
-
-
-
-
-
-
-
-
-
-
-
+                if credential is not None:
+                    if (  # USE AWS KEYS to connect
+                        credential.get("spark.hadoop.fs.s3a.access.key") is not None
+                        and credential.get("spark.hadoop.fs.s3a.secret.key")
+                        is not None
+                        and credential.get("spark.hadoop.fs.s3a.access.key").strip()
+                        != ""
+                        and credential.get("spark.hadoop.fs.s3a.secret.key").strip()
+                        != ""
+                    ):
+                        aws_keys = f" AWS_KEY_ID = '{credential.get('spark.hadoop.fs.s3a.access.key')}'"
+                        aws_keys += f" AWS_SECRET_KEY = '{credential.get('spark.hadoop.fs.s3a.secret.key')}'"
+                        if (
+                            credential.get("spark.hadoop.fs.s3a.session.token")
+                            is not None
+                        ):
+                            aws_keys += f" AWS_TOKEN = '{credential.get('spark.hadoop.fs.s3a.session.token')}'"
+                        sql_query += f" CREDENTIALS = ({aws_keys})"
+                        sql_query += " ENCRYPTION = ( TYPE = 'AWS_SSE_S3' )"
+                    elif (  # USE AWS ROLE and KMS KEY to connect
+                        credential.get(
+                            "spark.hadoop.fs.s3a.server-side-encryption.key"
+                        )
+                        is not None
+                        and credential.get(
+                            "spark.hadoop.fs.s3a.server-side-encryption.key"
+                        ).strip()
+                        != ""
+                        and credential.get("spark.hadoop.fs.s3a.assumed.role.arn")
                         is not None
+                        and credential.get(
+                            "spark.hadoop.fs.s3a.assumed.role.arn"
+                        ).strip()
+                        != ""
                     ):
-
-
-
+                        aws_role = f" AWS_ROLE = '{credential.get('spark.hadoop.fs.s3a.assumed.role.arn')}'"
+                        sql_query += f" CREDENTIALS = ({aws_role})"
+                        sql_query += f" ENCRYPTION = ( TYPE='AWS_SSE_KMS' KMS_KEY_ID = '{credential.get('spark.hadoop.fs.s3a.server-side-encryption.key')}' )"
 
                 logger.info(self.session.sql(sql_query).collect())
                 self.stages_for_aws[bucket_name] = stage_name
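The S3 branch builds the stage's CREDENTIALS and ENCRYPTION clauses from the same Hadoop-style keys Spark users already set: access key and secret (optionally a session token) for SSE-S3, or an assumed-role ARN plus a KMS key for SSE-KMS. As above, treating them as client-set session configs is an assumption; only the key names are confirmed by the diff, and all values are placeholders.

```python
# Placeholders throughout; pick one of the two credential styles the code checks.

# Key-based access (maps to AWS_KEY_ID / AWS_SECRET_KEY / AWS_TOKEN + SSE-S3):
spark.conf.set("spark.hadoop.fs.s3a.access.key", "<access-key-id>")
spark.conf.set("spark.hadoop.fs.s3a.secret.key", "<secret-access-key>")
spark.conf.set("spark.hadoop.fs.s3a.session.token", "<session-token>")  # optional

# Or role-based access (maps to AWS_ROLE + SSE-KMS with the given key):
# spark.conf.set("spark.hadoop.fs.s3a.assumed.role.arn", "<role-arn>")
# spark.conf.set("spark.hadoop.fs.s3a.server-side-encryption.key", "<kms-key-id>")
```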