snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +680 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +237 -23
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +123 -5
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +85 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
- snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
- snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +110 -48
- snowflake/snowpark_connect/server.py +546 -456
- snowflake/snowpark_connect/server_common/__init__.py +500 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +187 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +163 -22
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
The remainder of the diff excerpts snowflake/snowpark_connect/column_name_handler.py (+404 -243), where column-name bookkeeping moves from the removed HiddenColumn machinery to the new ColumnQualifier model. Two removed import lines are truncated in the source and are kept as-is.

```diff
@@ -13,18 +13,21 @@ from functools import cached_property
 from pyspark.errors.exceptions.base import AnalysisException
 
 from snowflake.snowpark import DataFrame
-from snowflake.snowpark._internal.analyzer.analyzer_utils import (
-    quote_name_without_upper_casing,
-    unquote_if_quoted,
-)
+from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
 from snowflake.snowpark._internal.utils import quote_name
 from snowflake.snowpark.types import StructType
+from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
 from snowflake.snowpark_connect.config import global_config
-from snowflake.snowpark_connect.
-from snowflake.snowpark_connect.
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
+from snowflake.snowpark_connect.utils.context import (
+    get_current_operation_scope,
+    get_is_processing_order_by,
+)
 from snowflake.snowpark_connect.utils.identifiers import (
     split_fully_qualified_spark_name,
 )
+from snowflake.snowpark_connect.utils.sequence import next_unique_num
 
 ALREADY_QUOTED = re.compile('^(".+")$', re.DOTALL)
 
```

```diff
@@ -44,6 +47,7 @@ def set_schema_getter(df: DataFrame, get_schema: Callable[[], StructType]) -> No
     df.__class__ = PatchedDataFrame
 
 
+# TODO replace plan_id-offset with single unique value
 def make_column_names_snowpark_compatible(
     names: list[str], plan_id: int, offset: int = 0
 ) -> list[str]:
```

```diff
@@ -76,42 +80,42 @@ def make_column_names_snowpark_compatible(
     In this case the function call should be `make_column_names_snowpark_compatible(['a', 'b'], 5, 2)`,
     to avoid naming conflicts between the new columns and the old columns.
     """
+    from snowflake.snowpark_connect.relation.read.metadata_utils import (
+        METADATA_FILENAME_COLUMN,
+    )
+
     return [
+        # Skip METADATA$FILENAME - preserve original name without quoting
+        name if name == METADATA_FILENAME_COLUMN else
         # Use `-` in the name to force df.column to return double-quoted names
         quote_name(f"{unquote_if_quoted(name)}-{plan_id:08x}-{i + offset}")
         for i, name in enumerate(names)
     ]
 
 
+def make_unique_snowpark_name(spark_name: str) -> str:
+    """
+    Returns a snowpark column name that's guaranteed to be unique in this session,
+    by appending "#<unique number>" to the given spark name.
+    """
+    return quote_name(f"{spark_name}-{next_unique_num():x}")
+
+
 @dataclass(frozen=True)
 class ColumnNames:
     spark_name: str
     snowpark_name: str
-    qualifiers:
+    qualifiers: set[ColumnQualifier]
+    equivalent_snowpark_names: set[str] | None = ((None,),)
     catalog_info: str | None = None  # Catalog from fully qualified name
     database_info: str | None = None  # Database from fully qualified name
+    is_hidden: bool = False  # Hidden columns are only accessible via qualified names
 
-
-
-
-
-
-
-    For example, if the column name is 'id' and the qualifiers are ['db', 'table'],
-    then the possible Spark names are:
-    ['id', 'db.table.id', 'table.id']
-    """
-    spark_name = column_names.spark_name
-    qualifiers = column_names.qualifiers
-
-    qualifier_suffixes_list = [
-        ".".join(quote_name_without_upper_casing(x) for x in qualifiers[i:])
-        for i in range(len(qualifiers))
-    ]
-    return [spark_name] + [
-        f"{qualifier_suffix}.{spark_name}"
-        for qualifier_suffix in qualifier_suffixes_list
-    ]
+    def all_spark_names_including_qualified_names(self):
+        all_names = [self.spark_name]
+        for qualifier in self.qualifiers:
+            all_names.extend(qualifier.all_qualified_names(self.spark_name))
+        return all_names
 
 
 class ColumnNameMap:
```
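
For reference, the naming change above is easiest to see side by side. A minimal, self-contained sketch of the old plan-id/offset scheme next to the new counter-based `make_unique_snowpark_name`, with `quote_name` and `next_unique_num` replaced by simplified stand-ins (the real Snowpark helpers also handle escaping and case rules):

```python
import itertools

_counter = itertools.count(1)

def next_unique_num() -> int:
    # Stand-in for snowflake.snowpark_connect.utils.sequence.next_unique_num:
    # assumed to be a monotonically increasing, session-wide counter.
    return next(_counter)

def quote_name(name: str) -> str:
    # Simplified stand-in for Snowpark's quote_name.
    return f'"{name}"'

def make_column_names_snowpark_compatible(names, plan_id, offset=0):
    # Old scheme: uniqueness comes from (plan_id, position) pairs.
    return [quote_name(f"{n}-{plan_id:08x}-{i + offset}") for i, n in enumerate(names)]

def make_unique_snowpark_name(spark_name: str) -> str:
    # New scheme: uniqueness comes from one session-wide counter, hex-encoded.
    return quote_name(f"{spark_name}-{next_unique_num():x}")

print(make_column_names_snowpark_compatible(["a", "b"], 5, 2))  # ['"a-00000005-2"', '"b-00000005-3"']
print(make_unique_snowpark_name("a"))                           # '"a-1"'
print(make_unique_snowpark_name("a"))                           # '"a-2"'
```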

```diff
@@ -123,32 +127,32 @@ class ColumnNameMap:
             [], bool
         ] = lambda: global_config.spark_sql_caseSensitive,
         column_metadata: dict | None = None,
-        column_qualifiers: list[
-        hidden_columns: set[HiddenColumn] | None = None,
+        column_qualifiers: list[set[ColumnQualifier]] = None,
         parent_column_name_map: ColumnNameMap | None = None,
+        equivalent_snowpark_names: list[set[str]] | None = None,
+        column_is_hidden: list[bool] | None = None,
     ) -> None:
         """
         spark_column_names: Original spark column names
         snowpark_column_names: Snowpark column names
-        column_metadata: This field is used to store metadata related to columns. Since Snowpark
+        column_metadata: This field is used to store metadata related to columns. Since Snowpark's Struct type does not support metadata,
             we use this attribute to store any metadata related to the columns.
             The key is the original Spark column name, and the value is the metadata.
             example: Dict('age', {'foo': 'bar'})
         column_qualifiers: Optional qualifiers for the columns, used to handle table aliases or DataFrame aliases.
-        hidden_columns: Optional set of HiddenColumn objects.
         parent_column_name_map: parent ColumnNameMap
+        column_is_hidden: Optional list of booleans indicating whether each column is hidden
         """
         self.columns: list[ColumnNames] = []
-        self.spark_to_col = defaultdict(list)
+        self.spark_to_col: defaultdict[str, list[ColumnNames]] = defaultdict(list)
         self.uppercase_spark_to_col = defaultdict(list)
         self.snowpark_to_col = defaultdict(list)
         self.is_case_sensitive = is_case_sensitive
         self.column_metadata = column_metadata
-        self.hidden_columns = hidden_columns
 
         # Rename chain dictionary to track column renaming history
         self.rename_chains: dict[str, str] = {}  # old_name -> new_name mapping
-        self.current_columns: set[str] = set()  #
+        self.current_columns: set[str] = set()  # current column names
 
         # Parent ColumnNameMap classes
         self._parent_column_name_map = parent_column_name_map
```

```diff
@@ -179,21 +183,22 @@ class ColumnNameMap:
             c = ColumnNames(
                 spark_name=spark_name,
                 snowpark_name=snowpark_column_names[i],
-                qualifiers=column_qualifiers[i]
+                qualifiers=column_qualifiers[i]
+                if column_qualifiers and column_qualifiers[i]
+                else set(),
+                equivalent_snowpark_names=equivalent_snowpark_names[i]
+                if equivalent_snowpark_names and equivalent_snowpark_names[i]
+                else set(),
                 catalog_info=catalog_info,
                 database_info=database_info,
+                is_hidden=column_is_hidden[i] if column_is_hidden else False,
             )
             self.columns.append(c)
 
-
-            spark_names_including_qualifier = get_list_of_spark_names_for_column(c)
-
-            for spark_name_including_qualifier in spark_names_including_qualifier:
+            for spark_name in c.all_spark_names_including_qualified_names():
                 # the same spark name can map to multiple snowpark names
-                self.spark_to_col[
-                self.uppercase_spark_to_col[
-                    spark_name_including_qualifier.upper()
-                ].append(c)
+                self.spark_to_col[spark_name].append(c)
+                self.uppercase_spark_to_col[spark_name.upper()].append(c)
 
             # the same snowpark name can map to multiple spark column
             # e.g. df.select(date_format('dt', 'yyy'), date_format('dt', 'yyyy')) ->
```

```diff
@@ -286,9 +291,10 @@ class ColumnNameMap:
         self,
         spark_column_names: list[str],
         return_first: bool = False,
+        original_snowpark_names: list[str] | None = None,
     ) -> list[str]:
         snowpark_column_names = self._get_snowpark_column_names_from_spark_column_names(
-            spark_column_names, return_first
+            spark_column_names, return_first, original_snowpark_names
         )
         if snowpark_column_names:
             return snowpark_column_names
```

```diff
@@ -302,7 +308,7 @@ class ColumnNameMap:
             and self._parent_column_name_map is not None
         ):
             snowpark_column_names = self._parent_column_name_map.get_snowpark_column_names_from_spark_column_names(
-                spark_column_names, return_first
+                spark_column_names, return_first, original_snowpark_names
             )
 
         return snowpark_column_names
```

```diff
@@ -311,9 +317,10 @@ class ColumnNameMap:
         self,
         spark_column_names: list[str],
         return_first: bool = False,
+        original_snowpark_names: list[str] | None = None,
     ) -> list[str]:
         snowpark_column_names = []
-        for name in spark_column_names:
+        for i, name in enumerate(spark_column_names):
             if not global_config.spark_sql_caseSensitive:
                 name = name.upper()
                 mapping = self.uppercase_spark_to_col
```

```diff
@@ -325,8 +332,26 @@ class ColumnNameMap:
 
             columns = mapping[name]
 
+            # make sure the column matches the original snowpark name, if given
+            if original_snowpark_names:
+                oname = original_snowpark_names[i]
+                columns = [
+                    c
+                    for c in columns
+                    if c.snowpark_name == oname or oname in c.equivalent_snowpark_names
+                ]
+
+            # Filter out hidden columns for unqualified lookups
+            # A qualified lookup contains a dot (e.g., "b.id"), unqualified doesn't (e.g., "id")
+            # Hidden columns should only be accessible via qualified names
+            is_qualified_lookup = "." in name or original_snowpark_names
+            if not is_qualified_lookup:
+                # Unqualified lookup: only include visible columns
+                columns = [c for c in columns if not c.is_hidden]
+
             if return_first:
-
+                if columns:  # Only append if we have columns after filtering
+                    snowpark_column_names.append(columns[0].snowpark_name)
             else:
                 snowpark_column_names.extend([c.snowpark_name for c in columns])
 
```
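
The lookup rule introduced above: hidden columns survive only qualified lookups (names containing a dot, or lookups pinned to an original Snowpark name). A toy illustration of that predicate, not the package's API:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class Col:
    snowpark_name: str
    is_hidden: bool = False

def lookup(columns: list[Col], name: str) -> list[str]:
    # Mirrors the rule above: a dot in the lookup name marks it as qualified,
    # and only qualified lookups may see hidden columns.
    if "." not in name:
        columns = [c for c in columns if not c.is_hidden]
    return [c.snowpark_name for c in columns]

cols = [Col('"ID-1"'), Col('"ID-2"', is_hidden=True)]
print(lookup(cols, "id"))    # ['"ID-1"']  - hidden column filtered out
print(lookup(cols, "b.id"))  # ['"ID-1"', '"ID-2"']  - qualified lookup sees both
```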

```diff
@@ -338,8 +363,7 @@ class ColumnNameMap:
         *,
         allow_non_exists: bool = False,
         return_first: bool = False,
-
-        source_qualifiers: list[str] | None = None,
+        original_snowpark_name: str | None = None,
     ) -> str | None:
         assert isinstance(spark_column_name, str)
         resolved_name = (
```

```diff
@@ -347,52 +371,85 @@ class ColumnNameMap:
             if self.rename_chains
             else spark_column_name
         )
-
-
-
-
-
-        # Even if this is an unqualified reference or one to the visible column, it will resolve correctly to
-        # the visible name anyway.
-        snowpark_names = []
-        # Only check hidden columns for qualified references with source qualifiers
-        if is_qualified and source_qualifiers is not None and self.hidden_columns:
-            column_name = spark_column_name
-
-            # Check each hidden column for column name AND qualifier match
-            for hidden_col in self.hidden_columns:
-                if (
-                    hidden_col.spark_name == column_name
-                    and hidden_col.qualifiers == source_qualifiers
-                ):
-                    if not global_config.spark_sql_caseSensitive:
-                        if hidden_col.spark_name.upper() == column_name.upper() and [
-                            q.upper() for q in hidden_col.qualifiers
-                        ] == [q.upper() for q in source_qualifiers]:
-                            snowpark_names.append(hidden_col.visible_snowpark_name)
-                    else:
-                        snowpark_names.append(hidden_col.visible_snowpark_name)
-
-        # If not found in hidden columns, proceed with normal lookup
-        if not snowpark_names:
-            snowpark_names = self.get_snowpark_column_names_from_spark_column_names(
-                [resolved_name], return_first
-            )
+        snowpark_names = self.get_snowpark_column_names_from_spark_column_names(
+            [resolved_name],
+            return_first,
+            [original_snowpark_name] if original_snowpark_name else None,
+        )
 
         snowpark_names_len = len(snowpark_names)
         if snowpark_names_len > 1:
-
-
-
+            # Check if this is a case where we have identical expressions that can be safely resolved to the first one
+            # This commonly happens with GROUP BY expressions that also appear in SELECT clauses
+            if (
+                get_is_processing_order_by()
+                and self._can_resolve_ambiguous_identical_expressions(
+                    resolved_name, snowpark_names
+                )
+            ):
+                # All the ambiguous columns represent the same expression, so we can safely use the first one
+                return snowpark_names[0]
+            else:
+                exception = AnalysisException(
+                    f"Ambiguous spark column name {spark_column_name}, potential snowpark column names {snowpark_names}"
+                )
+                attach_custom_error_code(exception, ErrorCodes.AMBIGUOUS_COLUMN_NAME)
+                raise exception
         elif snowpark_names_len == 0:
             if allow_non_exists:
                 return None
             else:
-
+                exception = AnalysisException(
                     f"Spark column name {spark_column_name} does not exist"
                 )
+                attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+                raise exception
         return snowpark_names[0]
 
+    def _can_resolve_ambiguous_identical_expressions(
+        self, spark_column_name: str, snowpark_names: list[str]
+    ) -> bool:
+        """
+        Determine if ambiguous columns represent identical expressions that can be safely resolved to the first one.
+
+        This handles the common case where the same expression (like a UDF call) appears multiple times
+        in a SELECT clause within a GROUP BY query. Since they're the same expression operating on the
+        same grouped data, they will have identical values, so we can safely resolve to any of them.
+
+        Args:
+            spark_column_name: The Spark column name that has multiple mappings; make sure to resolve it beforehand
+            snowpark_names: List of Snowpark column names that map to this Spark column name
+
+        Returns:
+            True if we can safely resolve to the first snowpark column, False otherwise
+        """
+        if spark_column_name not in self.spark_to_col:
+            return False
+
+        columns: list[ColumnNames] = self.spark_to_col[spark_column_name]
+
+        # If we don't have multiple columns, there's no ambiguity to resolve
+        if len(columns) <= 1:
+            return False
+
+        # Check if all the snowpark names correspond to columns that have identical underlying expressions
+        # We'll compare the actual column objects to see if they represent the same computation
+        first_column = columns[0]
+
+        for column in columns[1:]:
+            if first_column.qualifiers != column.qualifiers:
+                return False
+
+        # Additional safety check: ensure all snowpark names are actually in our mapping
+        for snowpark_name in snowpark_names:
+            if snowpark_name not in self.snowpark_to_col:
+                return False
+
+        # If we reach here, the columns appear to be identical expressions from the same context
+        # This commonly happens in GROUP BY scenarios where the same expression appears in both
+        # the grouping clause and the select clause
+        return True
+
     def get_spark_column_names_from_snowpark_column_names(
         self,
         snowpark_column_names: list[str],
```
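
The `_can_resolve_ambiguous_identical_expressions` escape hatch targets queries where one expression legitimately produces several mapped columns. An illustrative PySpark session (hypothetical data; assumes a local pyspark installation) showing the shape described in the docstring:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("A", 2)], ["name", "n"])
df.createOrReplaceTempView("people")

# upper(name) appears twice in the SELECT list and again in ORDER BY.
# Every occurrence is the same expression over the same grouped rows,
# so resolving the ORDER BY reference to the first match is safe.
spark.sql(
    """
    SELECT upper(name), upper(name) AS u, sum(n) AS total
    FROM people
    GROUP BY upper(name)
    ORDER BY upper(name)
    """
).show()
```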

```diff
@@ -418,98 +475,79 @@ class ColumnNameMap:
         )
         spark_names_len = len(spark_names)
         if spark_names_len > 1:
-
+            exception = AnalysisException(
                 f"Ambiguous snowpark column name {snowpark_column_name}, potential spark column names {spark_names}"
             )
+            attach_custom_error_code(exception, ErrorCodes.AMBIGUOUS_COLUMN_NAME)
+            raise exception
         elif spark_names_len == 0:
             if allow_non_exists:
                 return None
             else:
-
+                exception = AnalysisException(
                     f"Snowpark column name {snowpark_column_name} does not exist"
                 )
+                attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+                raise exception
         return spark_names[0]
 
     def get_spark_column_name(self, idx: int) -> str:
         return self.columns[idx].spark_name
 
     def get_spark_columns(self) -> list[str]:
-        return [c.spark_name for c in self.columns]
+        return [c.spark_name for c in self.columns if not c.is_hidden]
 
     def get_spark_and_snowpark_columns_with_qualifier_for_qualifier(
-        self,
-    ) -> tuple[list[str], list[str], list[
+        self, target_qualifier: ColumnQualifier
+    ) -> tuple[list[str], list[str], list[set[ColumnQualifier]]]:
         """
-        Returns the Spark and Snowpark column names along with their qualifiers for the specified
-        If a column does not have a qualifier, it will be None.
+        Returns the Spark and Snowpark column names along with their qualifiers for the specified qualifier.
         """
-        spark_columns = []
-        snowpark_columns = []
-        qualifiers = []
+        spark_columns: list[str] = []
+        snowpark_columns: list[str] = []
+        qualifiers: list[set[ColumnQualifier]] = []
 
+        normalized_qualifier = target_qualifier
         if not self.is_case_sensitive():
-
+            normalized_qualifier = target_qualifier.to_upper()
 
-        for
-
-
+        for column in self.columns:
+            # Normalize all qualifiers for comparison
+            column_qualifiers: set[ColumnQualifier] = (
+                {q.to_upper() for q in iter(column.qualifiers)}
                 if not self.is_case_sensitive()
-                else
+                else column.qualifiers
             )
-            if
-
-
-
-                spark_columns.append(c.spark_name)
-                snowpark_columns.append(c.snowpark_name)
-                qualifiers.append(c.qualifiers)
-
-            # Note: The following code is commented out because there is a bug with handling duplicate columns in
-            # qualified select *'s. This needs to be revisited once a solution for that is found.
-            # TODO: https://snowflakecomputing.atlassian.net/browse/SNOW-2265240
-
-            # # Handles fetching/resolving the hidden columns if they also match the qualifiers
-            # # This method is only ever called for qualified references, so we need to check hidden columns as well.
-            # if self.hidden_columns:
-            #     for hidden_col in self.hidden_columns:
-            #         col_qualifiers = (
-            #             [q.upper() for q in hidden_col.qualifiers]
-            #             if not self.is_case_sensitive()
-            #             else hidden_col.qualifiers
-            #         )
-            #         if len(col_qualifiers) < len(qualifiers_input):
-            #             continue
-            #         if col_qualifiers[-len(qualifiers_input) :] == qualifiers_input:
-            #             # This hidden column matches! Add it to the results
-            #             spark_columns.append(hidden_col.spark_name)
-            #             snowpark_columns.append(hidden_col.visible_snowpark_name)
-            #             qualifiers.append(hidden_col.qualifiers)
+            if any([q.matches(normalized_qualifier) for q in column_qualifiers]):
+                spark_columns.append(column.spark_name)
+                snowpark_columns.append(column.snowpark_name)
+                qualifiers.append(column.qualifiers)
 
         return spark_columns, snowpark_columns, qualifiers
 
     def get_snowpark_columns(self) -> list[str]:
-        return [c.snowpark_name for c in self.columns]
+        return [c.snowpark_name for c in self.columns if not c.is_hidden]
 
-    def get_snowpark_columns_after_drop(
+    def get_snowpark_columns_after_drop(
+        self, cols_to_drop: list[str]
+    ) -> list[ColumnNames]:
         return [
             c
-            for c in self.
-            if self._quote_if_unquoted(c) not in cols_to_drop
+            for c in self.columns
+            if self._quote_if_unquoted(c.snowpark_name) not in cols_to_drop
         ]
 
-    def get_qualifiers(self) -> list[
+    def get_qualifiers(self) -> list[set[ColumnQualifier]]:
         """
         Returns the qualifiers for the columns.
-        If a column does not have a qualifier, it will be None.
         """
-        return [c.qualifiers for c in self.columns]
+        return [c.qualifiers for c in self.columns if not c.is_hidden]
 
     def get_qualifiers_for_columns_after_drop(
         self, cols_to_drop: list[str]
-    ) -> list[
+    ) -> list[set[ColumnQualifier]]:
         """
         Returns the qualifiers for the columns after dropping the specified columns.
-        If a column is dropped, its qualifier will be None.
         """
         return [
             c.qualifiers
```
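
`ColumnQualifier` itself is defined in the new column_qualifier.py (+43 in the file list above) and is not part of this excerpt; the methods used here are `to_upper`, `matches`, and `all_qualified_names`. A purely hypothetical model of that contract, inferred from the removed suffix-matching code and the `['id', 'db.table.id', 'table.id']` example in the deleted helper:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class ColumnQualifier:
    # Hypothetical model: a qualifier is a dotted name path such as ("db", "table").
    parts: tuple[str, ...]

    def to_upper(self) -> "ColumnQualifier":
        return ColumnQualifier(tuple(p.upper() for p in self.parts))

    def matches(self, other: "ColumnQualifier") -> bool:
        # Assumed suffix semantics: "table" matches ("db", "table").
        n = len(other.parts)
        return n > 0 and self.parts[-n:] == other.parts

    def all_qualified_names(self, name: str) -> list[str]:
        # 'id' with ("db", "table") -> ['db.table.id', 'table.id']
        return [".".join(self.parts[i:] + (name,)) for i in range(len(self.parts))]

q = ColumnQualifier(("db", "table"))
print(q.all_qualified_names("id"))             # ['db.table.id', 'table.id']
print(q.matches(ColumnQualifier(("table",))))  # True
```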

```diff
@@ -517,27 +555,40 @@ class ColumnNameMap:
             if self._quote_if_unquoted(c.snowpark_name) not in cols_to_drop
         ]
 
-    def
+    def get_qualifiers_for_snowpark_column(
         self,
-
-    ) ->
+        snowpark_name: str,
+    ) -> set[ColumnQualifier]:
         """
-        Returns the qualifier for the specified
-        If the column does not exist, returns
+        Returns the qualifier for the specified snowpark column name.
+        If the column does not exist, returns empty ColumnQualifier.
         """
-
-
-
-
-
-
+        for c in self.columns:
+            if c.snowpark_name == snowpark_name:
+                return c.qualifiers
+
+        return set()
+
+    def get_equivalent_snowpark_names(self) -> list[set[str]]:
+        return [c.equivalent_snowpark_names for c in self.columns]
 
-
+    def get_equivalent_snowpark_names_for_snowpark_name(
+        self, snowpark_name: str | None
+    ) -> set[str]:
+        """
+        Helper method to get the set of old, equivalent snowpark names for the given column. Used to pass
+        this information to child column maps.
+        """
+        if not snowpark_name:
+            return set()
 
-
-
+        name = self._quote_if_unquoted(snowpark_name)
+        for c in self.columns:
+            if name == c.snowpark_name:
+                return c.equivalent_snowpark_names
 
-
+        # no equivalent names found
+        return set()
 
     @staticmethod
     def _quote_if_unquoted(s: str) -> str:
```

```diff
@@ -555,19 +606,20 @@ class ColumnNameMap:
     def snowpark_to_spark_map(self) -> dict[str, str]:
         return {c.snowpark_name: c.spark_name for c in self.columns}
 
-    def
-
-
-
-
-
-
-
-
+    def get_columns_matching_pattern(self, pattern: str) -> list[ColumnNames]:
+        try:
+            pattern_regex = re.compile(
+                pattern, 0 if self.is_case_sensitive() else re.IGNORECASE
+            )
+            return [c for c in self.columns if pattern_regex.fullmatch(c.spark_name)]
+        except re.error as e:
+            exception = AnalysisException(f"Invalid regex pattern '{pattern}': {e}")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_FUNCTION_ARGUMENT)
+            raise exception
 
     def with_columns(
         self, new_spark_columns: list[str], new_snowpark_columns: list[str]
-    ) -> tuple[list[str], list[str], list[list[str]]]:
+    ) -> tuple[list[str], list[str], list[set[ColumnQualifier]], list[set[str]]]:
         """
         Returns an ordered list of spark and snowpark column names after adding the new columns through a withColumns call.
         All replaced columns retain their ordering in the dataframe. The new columns are added to the end of the list.
```
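
`get_columns_matching_pattern` anchors regex-based column selection on `re.fullmatch` over the Spark names, case-insensitive unless `spark.sql.caseSensitive` is set. The core rule in isolation:

```python
import re

def columns_matching_pattern(names: list[str], pattern: str, case_sensitive: bool) -> list[str]:
    # fullmatch: the pattern must cover the whole name, not just a prefix.
    pattern_regex = re.compile(pattern, 0 if case_sensitive else re.IGNORECASE)
    return [n for n in names if pattern_regex.fullmatch(n)]

names = ["id", "Id2", "name"]
print(columns_matching_pattern(names, "id.*", case_sensitive=False))  # ['id', 'Id2']
print(columns_matching_pattern(names, "id.*", case_sensitive=True))   # ['id']
```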

```diff
@@ -588,6 +640,7 @@ class ColumnNameMap:
         snowpark_columns = []
         removed_index: set[int] = set()
         qualifiers = []
+        equivalent_snowpark_names = []
 
         for c in self.columns:
             column_name = self._normalized_spark_name(c.spark_name)
```

```diff
@@ -596,19 +649,22 @@ class ColumnNameMap:
                 removed_index.add(index)
                 spark_columns.append(new_spark_columns[index])
                 snowpark_columns.append(new_snowpark_columns[index])
-                qualifiers.append(
+                qualifiers.append(set())
+                equivalent_snowpark_names.append(set())
             else:
                 spark_columns.append(c.spark_name)
                 snowpark_columns.append(c.snowpark_name)
                 qualifiers.append(c.qualifiers)
+                equivalent_snowpark_names.append(c.equivalent_snowpark_names)
 
         for i, _ in enumerate(new_spark_columns):
             if i not in removed_index:
                 spark_columns.append(new_spark_columns[i])
                 snowpark_columns.append(new_snowpark_columns[i])
-                qualifiers.append(
+                qualifiers.append(set())
+                equivalent_snowpark_names.append(set())
 
-        return spark_columns, snowpark_columns, qualifiers
+        return spark_columns, snowpark_columns, qualifiers, equivalent_snowpark_names
 
     def _normalized_spark_name(self, spark_name: str) -> str:
         if self.is_case_sensitive():
```

```diff
@@ -616,34 +672,77 @@ class ColumnNameMap:
         else:
             return spark_name.upper()
 
-    def
-        self,
-    ) ->
+    def get_columns_after_join(
+        self, right: ColumnNameMap, join_columns: list[str], join_type: str
+    ) -> list[ColumnNames]:
         """
-
+        Returns a list of columns (names and qualifiers) after a using_columns join with the given column map
         """
-        if not self.hidden_columns or source_qualifiers is None:
-            return False
 
-        #
-
-
-
+        # first, let's gather right-side join columns for qualifier lookup
+        # and the remaining columns to append them to the result
+        join_column_names = [self._normalized_spark_name(c) for c in join_columns]
+        right_join_columns: dict[str, ColumnNames] = {}
+        right_remaining_columns: list[ColumnNames] = []
+        for oc in right.columns:
+            col_name = self._normalized_spark_name(oc.spark_name)
+            # only take the first matching column
+            if col_name in join_column_names and col_name not in right_join_columns:
+                right_join_columns[col_name] = oc
+            else:
+                right_remaining_columns.append(oc)
 
-
-
-
-
-            )
-
-
-
-
-
-
-
+        # now gather left-side columns
+        left_join_columns: dict[str, ColumnNames] = {}
+        left_remaining_columns: list[ColumnNames] = []
+        for c in self.columns:
+            col_name = self._normalized_spark_name(c.spark_name)
+            if col_name in join_column_names and col_name not in left_join_columns:
+                equivalent_snowpark_names = set()
+                # only assign join-side qualifier for outer joins
+                match join_type:
+                    case "left":
+                        qualifiers = c.qualifiers
+                    case "right":
+                        qualifiers = right_join_columns[col_name].qualifiers
+                    case _:
+                        qualifiers = (
+                            c.qualifiers | right_join_columns[col_name].qualifiers
+                        )
+                        equivalent_snowpark_names.update(
+                            c.equivalent_snowpark_names,
+                            right_join_columns[col_name].equivalent_snowpark_names,
+                            {right_join_columns[col_name].snowpark_name},
+                        )
+
+                left_join_columns[col_name] = ColumnNames(
+                    c.spark_name, c.snowpark_name, qualifiers, equivalent_snowpark_names
+                )
+            else:
+                left_remaining_columns.append(c)
+
+        # join columns go first in the user-given order,
+        # then the remaining left-side columns, then remaining right-side columns
+        match join_type:
+            case "right":
+                ordered_join_columns = [
+                    right_join_columns[name] for name in join_column_names
+                ]
+            case _:
+                ordered_join_columns = [
+                    left_join_columns[name] for name in join_column_names
+                ]
+        return ordered_join_columns + left_remaining_columns + right_remaining_columns
 
-
+    def get_conflicting_snowpark_columns(self, other: ColumnNameMap) -> set[str]:
+        conflicting_columns = set()
+        snowpark_names = {c.snowpark_name for c in self.columns}
+
+        for c in other.columns:
+            if c.snowpark_name in snowpark_names:
+                conflicting_columns.add(c.snowpark_name)
+
+        return conflicting_columns
 
 
 class JoinColumnNameMap(ColumnNameMap):
```
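
The ordering produced by `get_columns_after_join` mirrors Spark's USING-join output: join keys first in the user-given order, then the remaining left-side columns, then the remaining right-side ones, with key columns sourced from the right side for right outer joins. A quick sanity check against plain PySpark (hypothetical frames; assumes a local session):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, "x")], ["id", "l"])
right = spark.createDataFrame([(1, "y")], ["id", "r"])

# USING-join semantics: one 'id' column, then left leftovers, then right leftovers.
print(left.join(right, on=["id"], how="inner").columns)  # ['id', 'l', 'r']
print(left.join(right, on=["id"], how="right").columns)  # ['id', 'l', 'r']
```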

```diff
@@ -654,9 +753,6 @@ class JoinColumnNameMap(ColumnNameMap):
     ) -> None:
         self.left_column_mapping: ColumnNameMap = left_colmap
         self.right_column_mapping: ColumnNameMap = right_colmap
-        # Ensure attributes expected by base-class helpers exist to avoid AttributeError
-        # when generic code paths (e.g., hidden column checks) touch them.
-        self.hidden_columns: set[HiddenColumn] | None = None
 
     def get_snowpark_column_name_from_spark_column_name(
         self,
```

```diff
@@ -664,20 +760,20 @@ class JoinColumnNameMap(ColumnNameMap):
         *,
         allow_non_exists: bool = False,
         return_first: bool = False,
-
-        is_qualified: bool = False,
-        source_qualifiers: list[str] | None = None,
+        original_snowpark_name: str | None = None,
     ) -> str | None:
         snowpark_column_name_in_left = (
             self.left_column_mapping.get_snowpark_column_name_from_spark_column_name(
                 spark_column_name,
                 allow_non_exists=True,
+                original_snowpark_name=original_snowpark_name,
             )
         )
         snowpark_column_name_in_right = (
             self.right_column_mapping.get_snowpark_column_name_from_spark_column_name(
                 spark_column_name,
                 allow_non_exists=True,
+                original_snowpark_name=original_snowpark_name,
             )
         )
 
```

```diff
@@ -688,14 +784,37 @@ class JoinColumnNameMap(ColumnNameMap):
         if allow_non_exists:
             return None
         else:
-
+            exception = AnalysisException(
                 f"Spark column name {spark_column_name} does not exist in either left or right DataFrame"
             )
+            attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+            raise exception
 
+        # special case for join conditions, if the column has a match on both sides, and exactly one of those
+        # matches is the original snowpark name, that match should be used
         if (snowpark_column_name_in_right is not None) and (
             snowpark_column_name_in_left is not None
         ):
-
+            if (
+                snowpark_column_name_in_left == original_snowpark_name
+                and snowpark_column_name_in_right != original_snowpark_name
+            ):
+                snowpark_column_name_in_right = None
+
+            if (
+                snowpark_column_name_in_right == original_snowpark_name
+                and snowpark_column_name_in_left != original_snowpark_name
+            ):
+                snowpark_column_name_in_left = None
+
+            if (snowpark_column_name_in_right is not None) and (
+                snowpark_column_name_in_left is not None
+            ):
+                exception = AnalysisException(
+                    f"Ambiguous column name `{spark_column_name}` in join condition"
+                )
+                attach_custom_error_code(exception, ErrorCodes.AMBIGUOUS_COLUMN_NAME)
+                raise exception
 
         snowpark_name = (
             snowpark_column_name_in_right
```

```diff
@@ -703,86 +822,128 @@ class JoinColumnNameMap(ColumnNameMap):
             else snowpark_column_name_in_left
         )
 
-        # this means that the reference is for the column in right dataframe but same snowpark name exist in left dataframe as well
-        # or vice versa, so we need to append _left or _right to the snowpark name
-        if (
-            snowpark_name in self.left_column_mapping.get_snowpark_columns()
-            and snowpark_column_name_in_right is not None
-        ):
-            snowpark_name = quote_name(f"{unquote_if_quoted(snowpark_name)}_right")
-        elif (
-            snowpark_name in self.right_column_mapping.get_snowpark_columns()
-            and snowpark_column_name_in_left is not None
-        ):
-            snowpark_name = quote_name(f"{unquote_if_quoted(snowpark_name)}_left")
-
         return snowpark_name
 
     def get_snowpark_column_names_from_spark_column_names(
-        self,
+        self,
+        spark_column_names: list[str],
+        return_first: bool = False,
+        original_snowpark_names: list[str] | None = None,
     ) -> list[str]:
-
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_spark_column_names_from_snowpark_column_names(
         self,
         snowpark_column_names: list[str],
     ) -> list[str]:
-
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_spark_column_name_from_snowpark_column_name(
-        self,
+        self,
+        snowpark_column_name: str,
+        allow_non_exists: bool = False,
     ) -> str:
-
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_spark_columns(self) -> list[str]:
-
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_snowpark_columns(self) -> list[str]:
-
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
-    def get_snowpark_columns_after_drop(
-
+    def get_snowpark_columns_after_drop(
+        self, cols_to_drop: list[str]
+    ) -> list[ColumnNames]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_renamed_nested_column_name(self, name) -> str | None:
-
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def has_spark_column(self, spark_column_name: str) -> bool:
-
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def snowpark_to_spark_map(self) -> dict[str, str]:
-
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
-    def
-
+    def get_columns_matching_pattern(self, pattern: str) -> list[tuple[str, str]]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def with_columns(
         self, new_spark_columns: list[str], new_snowpark_columns: list[str]
-    ) -> tuple[list[str], list[str], list[
-
+    ) -> tuple[list[str], list[str], list[set[ColumnQualifier]]]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
-    def get_qualifiers(self) -> list[
-
+    def get_qualifiers(self) -> list[set[ColumnQualifier]]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_qualifiers_for_columns_after_drop(
         self, cols_to_drop: list[str]
-    ) -> list[
-
+    ) -> list[set[ColumnQualifier]]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_spark_and_snowpark_columns_with_qualifier_for_qualifier(
-        self,
-    ) -> tuple[list[str], list[str], list[
-
-
-
-
-
-
+        self, target_qualifier: list[str]
+    ) -> tuple[list[str], list[str], list[set[ColumnQualifier]]]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
+
+    def get_qualifiers_for_snowpark_column(
+        self, snowpark_name: str
+    ) -> set[ColumnQualifier]:
+        qualifiers_left = self.left_column_mapping.get_qualifiers_for_snowpark_column(
+            snowpark_name
         )
-
-
+        qualifiers_right = self.right_column_mapping.get_qualifiers_for_snowpark_column(
+            snowpark_name
         )
 
-        if (len(
-
-
-
+        if (len(qualifiers_left) > 0) and (len(qualifiers_right) > 0):
+            exception = AnalysisException(f"Ambiguous column name {snowpark_name}")
+            attach_custom_error_code(exception, ErrorCodes.AMBIGUOUS_COLUMN_NAME)
+            raise exception
+
+        return qualifiers_right if len(qualifiers_left) == 0 else qualifiers_left
+
+    def get_columns_after_join(
+        self, right: ColumnNameMap, join_columns: list[str], join_type: str
+    ) -> list[ColumnNames]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
+
+    def get_equivalent_snowpark_names_for_snowpark_name(self, snowpark_name: str):
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
+
+    def get_equivalent_snowpark_names(self):
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
```