snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +717 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +309 -26
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/error_utils.py +28 -0
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +224 -15
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +86 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
- snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
- snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +171 -48
- snowflake/snowpark_connect/server.py +528 -473
- snowflake/snowpark_connect/server_common/__init__.py +503 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/type_support.py +130 -0
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +195 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +192 -40
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/expression/map_unresolved_attribute.py

@@ -3,21 +3,27 @@
 #
 
 import re
+from typing import Any, Optional
 
 import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto
 from pyspark.errors.exceptions.connect import AnalysisException
 
-
+from snowflake.snowpark import Column, functions as snowpark_fn
 from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     quote_name_without_upper_casing,
 )
 from snowflake.snowpark.exceptions import SnowparkSQLException
-from snowflake.snowpark.types import ArrayType, LongType, MapType, StructType
-from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
+from snowflake.snowpark.types import ArrayType, DataType, LongType, MapType, StructType
+from snowflake.snowpark_connect.column_name_handler import ColumnNameMap, ColumnNames
 from snowflake.snowpark_connect.config import global_config
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
+from snowflake.snowpark_connect.expression.map_sql_expression import NILARY_FUNCTIONS
 from snowflake.snowpark_connect.expression.typer import ExpressionTyper
 from snowflake.snowpark_connect.typed_column import TypedColumn
 from snowflake.snowpark_connect.utils.context import (
+    capture_attribute_name,
     get_current_grouping_columns,
     get_is_evaluating_sql,
     get_outer_dataframes,
@@ -67,6 +73,382 @@ def _get_catalog_database_from_column_map(
     return catalog_database_info
 
 
+def _resolve_struct_field(
+    path: list[str], col: Column, typer: ExpressionTyper
+) -> Column:
+    try:
+        col_type = typer.type(col)[0]
+    except SnowparkSQLException as e:
+        if e.raw_message is not None and "invalid identifier" in e.raw_message:
+            exception = AnalysisException(
+                f'[COLUMN_NOT_FOUND] The column "{path[0]}" does not exist in the target dataframe.'
+            )
+            attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+            raise exception
+        else:
+            raise
+
+    field_path = path[1:]
+    if not global_config.spark_sql_caseSensitive:
+        field_path = _match_path_to_struct(field_path, col_type)
+
+    for field_name in field_path:
+        col = col.getItem(field_name)
+
+    return col
+
+
+def _try_resolve_column_in_scopes(
+    column_name: str,
+    column_mapping: ColumnNameMap,
+    original_snowpark_name: Optional[str] = None,
+) -> tuple[str | None, ColumnNameMap | None, ExpressionTyper | None]:
+    """
+    Try to resolve a column name in current and outer scopes.
+
+    Args:
+        column_name: The column name to resolve
+        column_mapping: The column mapping for the current scope
+        original_snowpark_name: target df snowpark name when we resolve a specific plan id
+
+    Returns:
+        Tuple of (snowpark_name, found_column_map, found_typer) or (None, None, None) if not found
+    """
+    # Try current scope
+    snowpark_name = column_mapping.get_snowpark_column_name_from_spark_column_name(
+        column_name,
+        allow_non_exists=True,
+        original_snowpark_name=original_snowpark_name,
+    )
+    if snowpark_name is not None:
+        return snowpark_name, column_mapping, None
+
+    # Try outer scopes
+    for outer_df in get_outer_dataframes():
+        snowpark_name = (
+            outer_df.column_map.get_snowpark_column_name_from_spark_column_name(
+                column_name,
+                allow_non_exists=True,
+                original_snowpark_name=original_snowpark_name,
+            )
+        )
+        if snowpark_name is not None:
+            return (
+                snowpark_name,
+                outer_df.column_map,
+                ExpressionTyper(outer_df.dataframe),
+            )
+
+    return None, None, None
+
+
+def _find_column_with_qualifier_match(
+    name_parts: list[str],
+    column_mapping: ColumnNameMap,
+) -> tuple[int, str | None, Any]:
+    """
+    Find the column position in name_parts where the prefix matches a qualifier.
+
+    In Spark, table qualifiers have at most 3 parts:
+    - 1 part: table only (e.g., 't1') → ColumnQualifier(('t1',))
+    - 2 parts: database.table (e.g., 'mydb.t5') → ColumnQualifier(('mydb', 't5'))
+    - 3 parts: catalog.database.table (e.g., 'cat.mydb.t5') → ColumnQualifier(('cat', 'mydb', 't5'))
+
+    Examples of how this works (suffix matching):
+    1) Input: "mydb1.t5.t5.i1" with qualifier ('mydb1', 't5')
+       - At i=2: prefix=['mydb1','t5'], matches qualifier suffix ('mydb1', 't5') → Column found!
+       - Remaining ['i1'] is treated as field access
+
+    2) Input: "t5.t5.i1" with qualifier ('mydb1', 't5')
+       - At i=1: prefix=['t5'], matches qualifier suffix ('t5',) → Column found!
+       - Remaining ['i1'] is treated as field access
+
+    3) Input: "cat.mydb.t5.t5.i1" with qualifier ('cat', 'mydb', 't5')
+       - At i=3: prefix=['cat','mydb','t5'], matches qualifier suffix → Column found!
+       - Remaining ['i1'] is treated as field access
+
+    The key insight: if the prefix before a candidate matches the END (suffix) of a qualifier,
+    then that position is the column reference. This allows partial qualification (e.g., just table
+    name instead of full database.table)
+
+    Args:
+        name_parts: The parts of the qualified name (e.g., ['mydb1', 't5', 't5', 'i1'])
+        column_mapping: The column mapping to resolve columns against
+
+    Returns:
+        Tuple of (column_part_index, snowpark_name, found_column_map)
+        Returns (0, None, None) if no valid column found
+
+    Raises:
+        AnalysisException: If a column is found but with invalid qualifier (scope violation)
+    """
+    # Track if we found a column but with wrong qualifier (scope violation)
+    scope_violation = None
+
+    for i in range(len(name_parts)):
+        candidate_column = name_parts[i]
+        snowpark_name, found_column_map, _ = _try_resolve_column_in_scopes(
+            candidate_column, column_mapping
+        )
+
+        if snowpark_name is not None:
+            candidate_qualifiers = found_column_map.get_qualifiers_for_snowpark_column(
+                snowpark_name
+            )
+            prefix_parts = name_parts[:i]
+
+            # Check if this is a valid column reference position
+            # A valid position is where the prefix exactly matches one of the qualifiers
+            is_valid_reference = False
+
+            if i == 0:
+                # No prefix (unqualified access)
+                # Always valid - Spark allows unqualified access to any column
+                # The remaining parts (name_parts[1:]) will be treated as
+                # struct/map/array field access (e.g., "person.address.city" where
+                # person is the column and address.city is the field path)
+                is_valid_reference = True
+            else:
+                # Has prefix - check if it matches the end (suffix) of any qualifier
+                # Spark allows partial qualification, so for qualifier ('mydb1', 't5'):
+                # - Can access as mydb1.t5.t5.i1 (full qualifier match)
+                # - Can access as t5.t5.i1 (suffix match - just table part)
+                # e.g., for "t5.t5.i1", when i=1, prefix=['t5'] matches suffix of ('mydb1', 't5')
+                # If valid, the remaining parts (name_parts[i+1:]) will be treated as
+                # struct/map/array field access (e.g., ['i1'] is a field in column t5)
+                for qual in candidate_qualifiers:
+                    if len(qual.parts) >= len(prefix_parts) and qual.parts[
+                        -len(prefix_parts) :
+                    ] == tuple(prefix_parts):
+                        is_valid_reference = True
+                        break
+
+            if is_valid_reference:
+                # This is the actual column reference
+                return (i, snowpark_name, found_column_map)
+            elif i > 0:
+                # Found column but qualifier doesn't match - this is a scope violation
+                # e.g., SELECT nt1.k where k exists but nt1 is not its qualifier
+                attr_name = ".".join(name_parts)
+                scope_violation = (attr_name, ".".join(prefix_parts))
+
+    # If we detected a scope violation, throw error
+    if scope_violation:
+        attr_name, invalid_qualifier = scope_violation
+        exception = AnalysisException(
+            f'[UNRESOLVED_COLUMN] Column "{attr_name}" cannot be resolved. '
+            f'The table or alias "{invalid_qualifier}" is not in scope or does not exist.'
+        )
+        attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+        raise exception
+
+    # No valid column found
+    return (0, None, None)
+
+
+def _get_quoted_attr_name(name_parts: list[str]) -> str:
+    quoted_attr_name = ".".join(
+        quote_name_without_upper_casing(x) for x in name_parts[:-1]
+    )
+    if len(name_parts) > 1:
+        quoted_attr_name = f"{quoted_attr_name}.{name_parts[-1]}"
+    else:
+        quoted_attr_name = name_parts[0]
+    return quoted_attr_name
+
+
+def _attribute_is_regex(original_attr_name: str) -> bool:
+    return (
+        get_is_evaluating_sql()
+        and global_config.spark_sql_parser_quotedRegexColumnNames
+        and SPARK_QUOTED.match(original_attr_name)
+    )
+
+
+def _get_matching_columns(
+    column_mapping: ColumnNameMap, pattern: str
+) -> list[ColumnNames]:
+    # Match the regex pattern against available columns
+    matched_columns = column_mapping.get_columns_matching_pattern(pattern)
+
+    if not matched_columns:
+        # Get all available column names from the column mapping
+        available_columns = column_mapping.get_spark_columns()
+        # Keep the improved error message for SQL regex patterns
+        # This is only hit for SQL queries like SELECT `(e|f)` FROM table
+        # when spark.sql.parser.quotedRegexColumnNames is enabled
+        exception = AnalysisException(
+            f"No columns match the regex pattern '{pattern}'. "
+            f"Snowflake SQL does not support SELECT statements with no columns. "
+            f"Please ensure your regex pattern matches at least one column. "
+            f"Available columns: {', '.join(available_columns[:10])}{'...' if len(available_columns) > 10 else ''}"
+        )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
+
+    return matched_columns
+
+
+def _resolve_matched_columns(
+    matched_columns: list[ColumnNames],
+    typer: ExpressionTyper,
+):
+    # When multiple columns match, we need to signal that this should expand to multiple columns
+    # Since map_unresolved_attribute can only return one column, we'll use a special marker
+    # to indicate that this is a multi-column regex expansion
+    if len(matched_columns) > 1:
+        # Create a special column name that indicates multi-column expansion
+        # The higher-level logic will need to handle this
+        multi_col_name = "__REGEX_MULTI_COL__"
+        # For now, return the first column but mark it specially
+        first_col = matched_columns[0]
+        snowpark_name = first_col.snowpark_name
+        col = snowpark_fn.col(snowpark_name)
+        qualifiers = first_col.qualifiers
+        typed_col = TypedColumn(col, lambda: typer.type(col))
+        typed_col.set_qualifiers(qualifiers)
+        # Store matched columns info for later use
+        typed_col._regex_matched_columns = matched_columns
+        return multi_col_name, typed_col
+    else:
+        # Single column match - return that column
+        matched_col = matched_columns[0]
+        snowpark_name = matched_col.snowpark_name
+        col = snowpark_fn.col(snowpark_name)
+        qualifiers = matched_col.qualifiers
+        typed_col = TypedColumn(col, lambda: typer.type(col))
+        typed_col.set_qualifiers(qualifiers)
+        return matched_col.spark_name, typed_col
+
+
+def _resolve_attribute_with_original_snowpark_name(
+    path: list[str],
+    current_column_mapping: ColumnNameMap,
+    typer: ExpressionTyper,
+    original_snowpark_name: str,
+) -> TypedColumn:
+    # if the column was found in the target dataframe
+    # we need to find its snowpark name in the current column mapping or any outer scope
+    # it can be the same name or an equivalent after a join rename
+    spark_name = path[0]
+    (
+        matching_snowpark_name,
+        found_column_mapping,
+        found_typer,
+    ) = _try_resolve_column_in_scopes(
+        spark_name,
+        current_column_mapping,
+        original_snowpark_name=original_snowpark_name,
+    )
+
+    if not matching_snowpark_name:
+        # the column doesn't exist in the current dataframe
+        exception = AnalysisException(
+            f'[RESOLVED_REFERENCE_COLUMN_NOT_FOUND] The column "{spark_name}" does not exist in the target dataframe.'
+        )
+        attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+        raise exception
+
+    # we need to use the typer for the dataframe where the column was resolved
+    found_typer = found_typer if found_typer else typer
+
+    col = snowpark_fn.col(matching_snowpark_name)
+    if len(path) > 1:
+        col = _resolve_struct_field(path, col, found_typer)
+        # no qualifiers for struct fields
+        return TypedColumn(col, lambda: found_typer.type(col))
+
+    typed_col = TypedColumn(col, lambda: found_typer.type(col))
+    typed_col.set_qualifiers(
+        found_column_mapping.get_qualifiers_for_snowpark_column(matching_snowpark_name)
+    )
+    return typed_col
+
+
+def _resolve_attribute_regex_with_plan_id(
+    pattern: str,
+    target_df_container: DataFrameContainer,
+    current_column_mapping: ColumnNameMap,
+    typer: ExpressionTyper,
+) -> tuple[str, TypedColumn]:
+    """
+    Resolves all columns matching the given pattern in the target dataframe
+    """
+    target_column_mapping = target_df_container.column_map
+    # find all matching columns
+    matched_columns = _get_matching_columns(target_column_mapping, pattern)
+
+    if len(matched_columns) == 1 and target_column_mapping.has_spark_column(pattern):
+        # if the pattern is just the column name, we resolve the column using its equivalent snowpark name
+        spark_name = matched_columns[0].spark_name
+        snowpark_name = matched_columns[0].snowpark_name
+        return spark_name, _resolve_attribute_with_original_snowpark_name(
+            [spark_name], current_column_mapping, typer, snowpark_name
+        )
+
+    # if the pattern is not an exact match for an existing column, we don't want to use equivalent snowpark names
+    # and we just check if the matched columns exist in the current mapping
+    available_snowpark_columns = current_column_mapping.get_snowpark_columns()
+    matched_columns = [
+        c for c in matched_columns if c.snowpark_name in available_snowpark_columns
+    ]
+    if len(matched_columns) == 0:
+        return "", TypedColumn.empty()
+    return _resolve_matched_columns(matched_columns, typer)
+
+
+def _resolve_attribute_with_plan_id(
+    path: list[str],
+    target_df_container: DataFrameContainer,
+    current_column_mapping: ColumnNameMap,
+    typer: ExpressionTyper,
+) -> tuple[str, TypedColumn]:
+    """
+    Resolves a given spark name with a specific plan_id to the equivalent snowpark column in
+    the target dataframe
+    """
+    target_column_mapping = target_df_container.column_map
+
+    quoted_attr_name = _get_quoted_attr_name(path)
+
+    # Try to resolve the full qualified name first
+    # TODO: implement better mechanism for matching qualified names
+    snowpark_name, found_column_map, _ = _try_resolve_column_in_scopes(
+        quoted_attr_name, target_column_mapping
+    )
+
+    if snowpark_name:
+        # we don't need the qualifiers anymore, since the original snowpark name is enough to disambiguate
+        spark_name = path[-1]
+        path = [spark_name]
+    else:
+        # in some cases the column can be qualified, so we have to match qualifiers as well
+        (
+            column_part_index,
+            snowpark_name,
+            found_column_map,
+        ) = _find_column_with_qualifier_match(path, target_column_mapping)
+        # extract the column name, and remove qualifiers
+        spark_name = path[column_part_index]
+        path = path[column_part_index:]
+
+    if not snowpark_name or found_column_map is not target_column_mapping:
+        # if the column doesn't exist in the plan_id dataframe, we don't need to look further
+        exception = AnalysisException(
+            f'[RESOLVED_REFERENCE_COLUMN_NOT_FOUND] The column "{spark_name}" does not exist in the target dataframe.'
+        )
+        attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+        raise exception
+
+    matching_snowpark_col = _resolve_attribute_with_original_snowpark_name(
+        path, current_column_mapping, typer, snowpark_name
+    )
+
+    # if resolving a struct field, we need to return the field name
+    # that's why this is path[-1] and not spark_name
+    return path[-1], matching_snowpark_col
+
+
 def map_unresolved_attribute(
     exp: expressions_proto.Expression,
     column_mapping: ColumnNameMap,
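The suffix-matching rule documented in `_find_column_with_qualifier_match` above can be illustrated with a small standalone sketch. This is illustrative only: it uses plain dicts and hypothetical helper names (`qualifier_suffix_matches`, `find_column`) rather than the package's `ColumnNameMap`/`ColumnQualifier` types.

```python
from typing import Optional


def qualifier_suffix_matches(prefix: list[str], qualifier: tuple[str, ...]) -> bool:
    # An empty prefix means unqualified access, which is always allowed.
    if not prefix:
        return True
    # Partial qualification: the written prefix must equal the trailing parts
    # of the column's qualifier (e.g. ['t5'] matches ('mydb1', 't5')).
    return len(qualifier) >= len(prefix) and qualifier[-len(prefix):] == tuple(prefix)


def find_column(
    name_parts: list[str], columns: dict[str, tuple[str, ...]]
) -> Optional[tuple[int, list[str]]]:
    # Scan left to right; the first position whose prefix suffix-matches the
    # candidate column's qualifier is the column reference, and the remainder
    # is treated as a struct/map/array field path.
    for i, candidate in enumerate(name_parts):
        qualifier = columns.get(candidate)
        if qualifier is not None and qualifier_suffix_matches(name_parts[:i], qualifier):
            return i, name_parts[i + 1:]
    return None


# Column "t5" lives in database "mydb1", table "t5"; "i1" is a struct field.
columns = {"t5": ("mydb1", "t5")}
print(find_column(["mydb1", "t5", "t5", "i1"], columns))  # (2, ['i1'])
print(find_column(["t5", "i1"], columns))                 # (0, ['i1'])
```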
@@ -74,6 +456,7 @@ def map_unresolved_attribute(
 ) -> tuple[str, TypedColumn]:
     original_attr_name = exp.unresolved_attribute.unparsed_identifier
     name_parts = split_fully_qualified_spark_name(original_attr_name)
+    attribute_is_regex = _attribute_is_regex(original_attr_name)
 
     assert len(name_parts) > 0, f"Unable to parse input attribute: {original_attr_name}"
 
@@ -85,9 +468,11 @@ def map_unresolved_attribute(
         grouping_spark_columns = get_current_grouping_columns()
         if not grouping_spark_columns:
             # grouping__id can only be used with GROUP BY CUBE/ROLLUP/GROUPING SETS
-            raise AnalysisException(
+            exception = AnalysisException(
                 "[MISSING_GROUP_BY] grouping__id can only be used with GROUP BY (CUBE | ROLLUP | GROUPING SETS)"
             )
+            attach_custom_error_code(exception, ErrorCodes.INVALID_FUNCTION_ARGUMENT)
+            raise exception
         # Convert to GROUPING_ID() function call with the grouping columns
         # Map Spark column names to Snowpark column names
         snowpark_cols = []
@@ -99,9 +484,11 @@ def map_unresolved_attribute(
                 )
             )
             if not snowpark_name:
-                raise AnalysisException(
+                exception = AnalysisException(
                     f"[INTERNAL_ERROR] Cannot find Snowpark column mapping for grouping column '{spark_col_name}'"
                 )
+                attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+                raise exception
             snowpark_cols.append(snowpark_fn.col(snowpark_name))
 
         # Call GROUPING_ID with all grouping columns using Snowpark names
@@ -155,176 +542,70 @@ def map_unresolved_attribute(
 
     if is_catalog:
         # This looks like a catalog.database.column.field pattern
-        raise AnalysisException(
+        exception = AnalysisException(
             f"[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `{original_attr_name}` cannot be resolved. "
             f"Cross-catalog column references are not supported in DataFrame API."
         )
+        attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+        raise exception
 
     attr_name = ".".join(name_parts)
+    capture_attribute_name(attr_name)
 
     has_plan_id = exp.unresolved_attribute.HasField("plan_id")
-    source_qualifiers = None
 
     if has_plan_id:
         plan_id = exp.unresolved_attribute.plan_id
+        # get target dataframe and column mapping
         target_df_container = get_plan_id_map(plan_id)
-        target_df = target_df_container.dataframe
         assert (
-            target_df is not None
+            target_df_container is not None
         ), f"resolving an attribute of a unresolved dataframe {plan_id}"
-
-
-
-
-
+        if attribute_is_regex:
+            # we should never get a struct field reference here
+            assert (
+                len(name_parts) == 1
+            ), "resolving struct field for attribute regexp with plan id"
+            return _resolve_attribute_regex_with_plan_id(
+                name_parts[0], target_df_container, column_mapping, typer
            )
-
-
-        if hasattr(column_mapping, "hidden_columns"):
-            hidden = column_mapping.hidden_columns
-        else:
-            hidden = None
-
-        column_mapping = target_df_container.column_map
-        column_mapping.hidden_columns = hidden
-        typer = ExpressionTyper(target_df)
-
-        def get_col(snowpark_name, has_hidden=False):
-            return (
-                snowpark_fn.col(snowpark_name)
-                if not has_plan_id or has_hidden
-                else target_df.col(snowpark_name)
+        return _resolve_attribute_with_plan_id(
+            name_parts, target_df_container, column_mapping, typer
         )
 
     # Check if regex column names are enabled and this is a quoted identifier
     # We need to check the original attribute name before split_fully_qualified_spark_name processes it
-    if (
-        get_is_evaluating_sql()
-        and global_config.spark_sql_parser_quotedRegexColumnNames
-        and SPARK_QUOTED.match(original_attr_name)
-    ):
+    if attribute_is_regex:
        # Extract regex pattern by removing backticks
        regex_pattern = original_attr_name[1:-1]  # Remove first and last backtick
+        matched_columns = _get_matching_columns(column_mapping, regex_pattern)
+        return _resolve_matched_columns(matched_columns, typer)
 
-
-        available_columns = column_mapping.get_spark_columns()
-
-        # Match the regex pattern against available columns
-        matched_columns = []
-        try:
-            compiled_regex = re.compile(
-                regex_pattern,
-                re.IGNORECASE if not global_config.spark_sql_caseSensitive else 0,
-            )
-            for col_name in available_columns:
-                if compiled_regex.fullmatch(col_name):
-                    matched_columns.append(col_name)
-        except re.error as e:
-            raise AnalysisException(f"Invalid regex pattern '{regex_pattern}': {e}")
-
-        if not matched_columns:
-            # Keep the improved error message for SQL regex patterns
-            # This is only hit for SQL queries like SELECT `(e|f)` FROM table
-            # when spark.sql.parser.quotedRegexColumnNames is enabled
-            raise AnalysisException(
-                f"No columns match the regex pattern '{regex_pattern}'. "
-                f"Snowflake SQL does not support SELECT statements with no columns. "
-                f"Please ensure your regex pattern matches at least one column. "
-                f"Available columns: {', '.join(available_columns[:10])}{'...' if len(available_columns) > 10 else ''}"
-            )
-
-        # When multiple columns match, we need to signal that this should expand to multiple columns
-        # Since map_unresolved_attribute can only return one column, we'll use a special marker
-        # to indicate that this is a multi-column regex expansion
-        if len(matched_columns) > 1:
-            # Create a special column name that indicates multi-column expansion
-            # The higher-level logic will need to handle this
-            multi_col_name = "__REGEX_MULTI_COL__"
-            # For now, return the first column but mark it specially
-            quoted_col_name = matched_columns[0]
-            snowpark_name = (
-                column_mapping.get_snowpark_column_name_from_spark_column_name(
-                    quoted_col_name
-                )
-            )
-            col = get_col(snowpark_name)
-            qualifiers = column_mapping.get_qualifier_for_spark_column(quoted_col_name)
-            typed_col = TypedColumn(col, lambda: typer.type(col))
-            typed_col.set_qualifiers(qualifiers)
-            # Store matched columns info for later use
-            typed_col._regex_matched_columns = matched_columns
-            return (multi_col_name, typed_col)
-        else:
-            # Single column match - return that column
-            quoted_col_name = matched_columns[0]
-            snowpark_name = (
-                column_mapping.get_snowpark_column_name_from_spark_column_name(
-                    quoted_col_name
-                )
-            )
-            col = get_col(snowpark_name)
-            qualifiers = column_mapping.get_qualifier_for_spark_column(quoted_col_name)
-            typed_col = TypedColumn(col, lambda: typer.type(col))
-            typed_col.set_qualifiers(qualifiers)
-            return (matched_columns[0], typed_col)
+    quoted_attr_name = _get_quoted_attr_name(name_parts)
 
-    quoted_attr_name = ".".join(
-        quote_name_without_upper_casing(x) for x in name_parts[:-1]
-    )
-    if len(name_parts) > 1:
-        quoted_attr_name = f"{quoted_attr_name}.{name_parts[-1]}"
-    else:
-        quoted_attr_name = name_parts[0]
-
-    snowpark_name = column_mapping.get_snowpark_column_name_from_spark_column_name(
-        quoted_attr_name,
-        allow_non_exists=True,
-        is_qualified=has_plan_id,
-        source_qualifiers=source_qualifiers if has_plan_id else None,
+    # Try to resolve the full qualified name first
+    snowpark_name, found_column_map, found_typer = _try_resolve_column_in_scopes(
+        quoted_attr_name, column_mapping
     )
 
+    qualifiers = set()
     if snowpark_name is not None:
-
-
-
-        col = get_col(snowpark_name, is_hidden)
-        qualifiers = column_mapping.get_qualifier_for_spark_column(quoted_attr_name)
+        col = snowpark_fn.col(snowpark_name)
+        qualifiers = found_column_map.get_qualifiers_for_snowpark_column(snowpark_name)
+        typer = found_typer if found_typer else typer
    else:
-        # this means it has to be a struct column with a field name
-        snowpark_name: str | None = None
-        column_part_index: int = 0
-
        # Get catalog/database info from column map if available
        catalog_database_info = _get_catalog_database_from_column_map(
            original_attr_name, column_mapping
        )
 
-        #
-        #
-
-
-                        snowpark_name
-
-
-                    )
-                )
-                if snowpark_name is not None:
-                    column_part_index = i
-                    break
-
-                # Also try in outer dataframes
-                for outer_df in get_outer_dataframes():
-                    snowpark_name = (
-                        outer_df.column_map.get_snowpark_column_name_from_spark_column_name(
-                            candidate_column, allow_non_exists=True
-                        )
-                    )
-                    if snowpark_name is not None:
-                        column_part_index = i
-                        break
-
-                if snowpark_name is not None:
-                    break
+        # Find the column by matching qualifiers with the prefix parts
+        # Note: This may raise AnalysisException if a scope violation is detected
+        (
+            column_part_index,
+            snowpark_name,
+            found_column_map,
+        ) = _find_column_with_qualifier_match(name_parts, column_mapping)
 
        if snowpark_name is None:
            # Attempt LCA fallback.
@@ -345,9 +626,9 @@ def map_unresolved_attribute(
                )
            )
            if snowpark_name is not None:
-                col = get_col(snowpark_name)
-                qualifiers = column_mapping.get_qualifier_for_spark_column(
-
+                col = snowpark_fn.col(snowpark_name)
+                qualifiers = column_mapping.get_qualifiers_for_snowpark_column(
+                    snowpark_name
                )
                typed_col = TypedColumn(col, lambda: typer.type(col))
                typed_col.set_qualifiers(qualifiers)
@@ -368,44 +649,38 @@ def map_unresolved_attribute(
        )
        if outer_col_name:
            # This is an outer scope column being referenced inside a lambda
-            raise AnalysisException(
+            exception = AnalysisException(
                f"Reference to non-lambda variable '{attr_name}' within lambda function. "
                f"Lambda functions can only access their own parameters. "
                f"If '{attr_name}' is a table column, it must be passed as an explicit parameter to the enclosing function."
            )
+            attach_custom_error_code(
+                exception, ErrorCodes.UNSUPPORTED_OPERATION
+            )
+            raise exception
 
        if has_plan_id:
-            raise AnalysisException(
+            exception = AnalysisException(
                f'[RESOLVED_REFERENCE_COLUMN_NOT_FOUND] The column "{attr_name}" does not exist in the target dataframe.'
            )
-
-
+            attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+            raise exception
+        elif attr_name.lower() in NILARY_FUNCTIONS:
            snowpark_name = attr_name
-
-
-        try:
-            col_type = typer.type(col)[0]
-        except SnowparkSQLException as e:
-            if e.raw_message is not None and "invalid identifier" in e.raw_message:
-                raise AnalysisException(
+        else:
+            exception = AnalysisException(
                f'[COLUMN_NOT_FOUND] The column "{attr_name}" does not exist in the target dataframe.'
            )
-
-                raise
-        is_struct = isinstance(col_type, StructType)
-        # for struct columns when accessed, spark use just the leaf field name rather than fully attributed one
-        if is_struct:
-            attr_name = name_parts[-1]
+            attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+            raise exception
 
+        col = snowpark_fn.col(snowpark_name)
+        # Check if this is a struct field reference
        # Calculate the field path correctly based on where we found the column
-        path = name_parts[column_part_index + 1 :]
-        if is_struct and not global_config.spark_sql_caseSensitive:
-            path = _match_path_to_struct(path, col_type)
-
-        for field_name in path:
-            col = col.getItem(field_name)
+        path = name_parts[column_part_index:]
 
-
+        if len(path) > 1:
+            col = _resolve_struct_field(path, col, typer)
 
    typed_col = TypedColumn(col, lambda: typer.type(col))
    typed_col.set_qualifiers(qualifiers)
@@ -417,10 +692,11 @@ def map_unresolved_attribute(
    if final_catalog_database_info:
        typed_col.set_catalog_database_info(final_catalog_database_info)
 
+    # for struct columns when accessed, spark use just the leaf field name rather than fully attributed one
    return (name_parts[-1], typed_col)
 
 
-def _match_path_to_struct(path: list[str], col_type: StructType) -> list[str]:
+def _match_path_to_struct(path: list[str], col_type: DataType) -> list[str]:
    """Takes a path of names and adjusts them to strictly match the field names in a StructType."""
    adjusted_path = []
    typ = col_type
@@ -438,7 +714,9 @@ def _match_path_to_struct(path: list[str], col_type: StructType) -> list[str]:
            typ = typ.value_type if isinstance(typ, MapType) else typ.element_type
        else:
            # If the type is not a struct, map, or array, we cannot access the field.
-            raise AnalysisException(
+            exception = AnalysisException(
                f"[INVALID_EXTRACT_BASE_FIELD_TYPE] Can't extract a value from \"{'.'.join(path[:i])}\". Need a complex type [STRUCT, ARRAY, MAP] but got \"{typ}\"."
            )
+            attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+            raise exception
    return adjusted_path