snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +717 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +309 -26
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/error_utils.py +28 -0
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +224 -15
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +86 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
- snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
- snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +171 -48
- snowflake/snowpark_connect/server.py +528 -473
- snowflake/snowpark_connect/server_common/__init__.py +503 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/type_support.py +130 -0
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +195 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +192 -40
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
@@ -28,12 +28,19 @@ from snowflake.snowpark._internal.utils import generate_random_alphanumeric
 from snowflake.snowpark.column import Column
 from snowflake.snowpark.table_function import _ExplodeFunctionCall
 from snowflake.snowpark.types import DataType, StructField, StructType, _NumericType
+from snowflake.snowpark_connect import tcm
 from snowflake.snowpark_connect.column_name_handler import (
+    ColumnQualifier,
     make_column_names_snowpark_compatible,
+    make_unique_snowpark_name,
 )
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
-from snowflake.snowpark_connect.error.
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import (
+    SparkException,
+    attach_custom_error_code,
+)
 from snowflake.snowpark_connect.expression.map_expression import (
     map_alias,
     map_expression,
@@ -42,6 +49,9 @@ from snowflake.snowpark_connect.expression.map_expression import (
 from snowflake.snowpark_connect.expression.map_unresolved_function import unwrap_literal
 from snowflake.snowpark_connect.expression.typer import ExpressionTyper
 from snowflake.snowpark_connect.relation.map_relation import map_relation
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    without_internal_columns,
+)
 from snowflake.snowpark_connect.relation.utils import (
     TYPE_MAP_FOR_TO_SCHEMA,
     snowpark_functions_col,
@@ -56,6 +66,9 @@ from snowflake.snowpark_connect.utils.context import (
     clear_lca_alias_map,
     register_lca_alias,
 )
+from snowflake.snowpark_connect.utils.expression_transformer import (
+    is_child_agg_function_expression,
+)
 from snowflake.snowpark_connect.utils.identifiers import (
     split_fully_qualified_spark_name,
 )
@@ -73,7 +86,7 @@ def map_drop(

     The drop is a list of expressions that is applied to the DataFrame.
     """
-    input_container = map_relation(rel.drop.input)
+    input_container = map_relation(rel.drop.input).without_hidden_columns()
     input_df = input_container.dataframe
     typer = ExpressionTyper(input_df)
     columns_to_drop_with_names = []
@@ -129,20 +142,19 @@ def map_drop(
     # object to handle these cases.
     try:
         column_map = input_container.column_map
-
+        new_columns = column_map.get_snowpark_columns_after_drop(
             _get_column_names_to_drop()
         )
         result: snowpark.DataFrame = input_df.drop(*columns_to_drop)
         return DataFrameContainer.create_with_column_mapping(
             dataframe=result,
-            spark_column_names=
-
-
-            snowpark_column_names=new_columns_names,
-            column_qualifiers=column_map.get_qualifiers_for_columns_after_drop(
-                _get_column_names_to_drop()
-            ),
+            spark_column_names=[c.spark_name for c in new_columns],
+            snowpark_column_names=[c.snowpark_name for c in new_columns],
+            column_qualifiers=[c.qualifiers for c in new_columns],
             parent_column_name_map=column_map,
+            equivalent_snowpark_names=[
+                c.equivalent_snowpark_names for c in new_columns
+            ],
         )
     except snowpark.exceptions.SnowparkColumnException:
         from snowflake.snowpark_connect.empty_dataframe import EmptyDataFrame
@@ -160,7 +172,6 @@ def map_project(
     """
     if rel.project.HasField("input"):
         input_container = map_relation(rel.project.input)
-        input_df = input_container.dataframe
     else:
         # Create a dataframe to represent a OneRowRelation AST node.
         # XXX: Snowflake does not support 0-column tables, so create a dummy column;
@@ -200,20 +211,33 @@ def map_project(

     # Collect aliases to batch process them
     pending_aliases = [] # List of (spark_name, snowpark_col, aliased_col, alias_types)
+    # Track columns that might need aliasing if multi-column generators are present
+    # Format: (index_in_select_list, snowpark_column_name, mapper.col)
+    conditional_aliases = []
+
+    def _is_attribute(exp: expressions_proto.Expression) -> bool:
+        return exp.WhichOneof("expr_type") == "unresolved_attribute"

     # Detect if this is a simple projection (only unqualified column references, no aliases or functions)
     # Qualified column references (with plan_id) should NOT be considered simple projections
     # because they've already been resolved to specific DataFrames
     def _is_simple_projection(exp: expressions_proto.Expression) -> bool:
-        return exp.
-            "expr_type"
-        ) == "unresolved_attribute" and not exp.unresolved_attribute.HasField(
+        return _is_attribute(exp) and not exp.unresolved_attribute.HasField(
             "plan_id"
         ) # No DataFrame qualification

+    has_agg_in_input_df = (
+        input_df._select_statement
+        and input_df._select_statement.projection
+        and any(
+            is_child_agg_function_expression(exp)
+            for exp in input_df._select_statement.projection
+        )
+    )
     column_types = []
     has_multi_column_alias = False
-    qualifiers = []
+    qualifiers: list[set[ColumnQualifier]] = []
+    equivalent_snowpark_names: list[set[str]] = []

     typer = ExpressionTyper(input_df)

@@ -233,10 +257,11 @@ def map_project(
         # Check if this was a qualified column reference (like df_alias.column)
         # by checking if the original expression was an alias lookup
         is_qualified_reference = (
-            exp
+            _is_attribute(exp)
             and "." in exp.unresolved_attribute.unparsed_identifier
         )

+        existing_snowpark_name = None
         if (
             _is_simple_projection(exp)
             and not is_qualified_reference
@@ -264,16 +289,47 @@ def map_project(
                 [spark_name], rel.common.plan_id, len(new_snowpark_columns)
             )[0]

-
-
+            # since unresolved attributes get aliased, we need to retain their original name
+            # so that we're able to resolve it later
+            if (
+                _is_attribute(exp)
+                and mapper.col.get_name()
+                and mapper.col.get_name() not in new_snowpark_columns
+            ):
+                old_name = mapper.col.get_name()
+                eq_names = set()
+                eq_names.update(
+                    input_container.column_map.get_equivalent_snowpark_names_for_snowpark_name(
+                        old_name
+                    )
+                )
+                if old_name != snowpark_column:
+                    eq_names.update({old_name})
+                equivalent_snowpark_names.append(eq_names)
+            else:
+                equivalent_snowpark_names.append(set())
+
+            # TODO: go back to using alias always once SNOW-2203826 is done
+            if existing_snowpark_name == snowpark_column and not has_agg_in_input_df:
+                aliased_col = mapper.col
+                # Store info to potentially re-alias later if multi-column generators appear
+                conditional_aliases.append(
+                    (len(select_list), snowpark_column, mapper.col)
+                )
+            else:
+                aliased_col = mapper.col.alias(snowpark_column)

+            select_list.append(aliased_col)
             new_snowpark_columns.append(snowpark_column)
             new_spark_columns.append(spark_name)
             column_types.extend(mapper.types)
             qualifiers.append(mapper.get_qualifiers())

             # Only update the DataFrame and register LCA for explicit aliases
-            if
+            if (
+                exp.WhichOneof("expr_type") == "alias"
+                and not context.is_resolving_subquery_exp()
+            ):
                 # Collect alias for batch processing
                 pending_aliases.append(
                     (spark_name, snowpark_column, aliased_col, mapper.types)
@@ -283,6 +339,20 @@ def map_project(
                 alias_types = mapper.types
                 typed_alias = TypedColumn(aliased_col, lambda types=alias_types: types)
                 register_lca_alias(spark_name, typed_alias)
+
+                # Also register with the original qualified name if this is an alias of a column reference
+                # This handles ORDER BY referencing the original name: SELECT o.date AS order_date ... ORDER BY o.date
+                if (
+                    exp.alias.HasField("expr")
+                    and exp.alias.expr.WhichOneof("expr_type") == "unresolved_attribute"
+                ):
+                    original_name = (
+                        exp.alias.expr.unresolved_attribute.unparsed_identifier
+                    )
+                    if (
+                        original_name != spark_name
+                    ): # Don't register twice with the same name
+                        register_lca_alias(original_name, typed_alias)
         else:
             # Multi-column case ('select *', posexplode, explode, inline, etc.)
             has_multi_column_alias = True
@@ -292,6 +362,21 @@ def map_project(
             new_spark_columns.extend(new_spark_names)
             column_types.extend(mapper.types)
             qualifiers.extend(mapper.get_multi_col_qualifiers(len(new_spark_names)))
+            equivalent_snowpark_names.extend(
+                [
+                    input_container.column_map.get_equivalent_snowpark_names_for_snowpark_name(
+                        snowpark_name
+                    )
+                    for snowpark_name in result_columns
+                ]
+            )
+
+    # TODO: go back to using alias always once SNOW-2203826 is done
+    # If we have multi-column aliases, we need to ensure ALL columns are aliased
+    # to avoid Snowpark SQL generation issues
+    if has_multi_column_alias and conditional_aliases:
+        for idx, snowpark_col_name, col_obj in conditional_aliases:
+            select_list[idx] = col_obj.alias(snowpark_col_name)

     if pending_aliases:
         # LCA case: create intermediate DataFrame with aliases, then do final projection
@@ -311,6 +396,11 @@ def map_project(
         final_snowpark_columns = make_column_names_snowpark_compatible(
             new_spark_columns, rel.common.plan_id
         )
+        # if there are duplicate snowpark column names, we need to disambiguate them by their index
+        if len(new_spark_columns) != len(set(new_spark_columns)):
+            result = result.select(
+                [f"${i}" for i in range(1, len(new_spark_columns) + 1)]
+            )
         result = result.toDF(*final_snowpark_columns)
         new_snowpark_columns = final_snowpark_columns

@@ -324,6 +414,7 @@ def map_project(
         parent_column_name_map=input_container.column_map,
         table_name=input_container.table_name,
         alias=input_container.alias,
+        equivalent_snowpark_names=equivalent_snowpark_names,
     )

@@ -369,56 +460,87 @@ def map_sort(
         for col in input_container.column_map.get_spark_columns()
     ]

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Process ORDER BY expressions with a context flag to enable column reuse optimization
+    from snowflake.snowpark_connect.utils.context import push_processing_order_by_scope
+
+    with push_processing_order_by_scope():
+        for so in sort_order:
+            if so.child.HasField("literal"):
+                column_index = unwrap_literal(so.child)
+                try:
+                    if column_index <= 0:
+                        exception = IndexError()
+                        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+                        raise exception
+                    col = input_df[column_index - 1]
+                except IndexError:
+                    exception = AnalysisException(
+                        f"""[ORDER_BY_POS_OUT_OF_RANGE] ORDER BY position {column_index} is not in select list (valid range is [1, {len(input_df.columns)})])."""
+                    )
+                    attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+                    raise exception
+            else:
+                # Check if input came from an aggregate - if so, use hybrid resolution
+                # This handles ORDER BY expressions that reference pre-aggregation columns
+                # e.g., ORDER BY year(date) when the aggregated result only has 'year' alias
+                if (
+                    hasattr(input_container, "_aggregate_metadata")
+                    and input_container._aggregate_metadata is not None
+                ):
+                    from snowflake.snowpark_connect.expression.hybrid_column_map import (
+                        create_hybrid_column_map_for_order_by,
+                    )

-
-
-
-
-
-
-            case (
-                expressions_proto.Expression.SortOrder.SORT_DIRECTION_ASCENDING,
-                expressions_proto.Expression.SortOrder.SORT_NULLS_LAST,
-            ):
-                col = col.asc_nulls_last()
-            case (
-                expressions_proto.Expression.SortOrder.SORT_DIRECTION_DESCENDING,
-                expressions_proto.Expression.SortOrder.SORT_NULLS_FIRST,
-            ):
-                col = col.desc_nulls_first()
-            case (
-                expressions_proto.Expression.SortOrder.SORT_DIRECTION_DESCENDING,
-                expressions_proto.Expression.SortOrder.SORT_NULLS_LAST,
-            ):
-                col = col.desc_nulls_last()
+                    # Create hybrid map for resolving ORDER BY expressions
+                    hybrid_map = create_hybrid_column_map_for_order_by(
+                        aggregate_metadata=input_container._aggregate_metadata,
+                        aggregated_df=input_df,
+                        aggregated_column_map=input_container.column_map,
+                    )

-
+                    # Resolve using hybrid context (can access both input and aggregated columns)
+                    _, typed_column = hybrid_map.resolve_expression(so.child)
+                    col = typed_column.col
+                else:
+                    # Normal resolution for non-aggregate inputs
+                    _, typed_column = map_single_column_expression(
+                        so.child, input_container.column_map, typer
+                    )
+                    col = typed_column.col

-
-
-
-
-
-
-
-
-
+            match (so.direction, so.null_ordering):
+                case (
+                    expressions_proto.Expression.SortOrder.SORT_DIRECTION_ASCENDING,
+                    expressions_proto.Expression.SortOrder.SORT_NULLS_FIRST,
+                ):
+                    col = col.asc_nulls_first()
+                case (
+                    expressions_proto.Expression.SortOrder.SORT_DIRECTION_ASCENDING,
+                    expressions_proto.Expression.SortOrder.SORT_NULLS_LAST,
+                ):
+                    col = col.asc_nulls_last()
+                case (
+                    expressions_proto.Expression.SortOrder.SORT_DIRECTION_DESCENDING,
+                    expressions_proto.Expression.SortOrder.SORT_NULLS_FIRST,
+                ):
+                    col = col.desc_nulls_first()
+                case (
+                    expressions_proto.Expression.SortOrder.SORT_DIRECTION_DESCENDING,
+                    expressions_proto.Expression.SortOrder.SORT_NULLS_LAST,
+                ):
+                    col = col.desc_nulls_last()
+
+            cols.append(col)
+
+            ascending.append(
+                so.direction
+                == expressions_proto.Expression.SortOrder.SORT_DIRECTION_ASCENDING
+            )
+            if (
+                so.direction
+                != expressions_proto.Expression.SortOrder.SORT_DIRECTION_UNSPECIFIED
+            ):
+                order_specified = True

     # TODO: sort.isglobal.
     if not order_specified:
@@ -440,15 +562,17 @@ def map_to_df(
     """
     Transform the column names of the input DataFrame and return a container.
     """
-    input_container = map_relation(rel.to_df.input)
+    input_container = without_internal_columns(map_relation(rel.to_df.input))
     input_df = input_container.dataframe

     new_column_names = list(rel.to_df.column_names)
     if len(new_column_names) != len(input_container.column_map.columns):
         # TODO: Check error type here
-
+        exception = ValueError(
             "Number of column names must match number of columns in DataFrame"
         )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+        raise exception
     snowpark_new_column_names = make_column_names_snowpark_compatible(
         new_column_names, rel.common.plan_id
     )
@@ -475,6 +599,7 @@ def map_to_df(
         table_name=input_container.table_name,
         alias=input_container.alias,
         cached_schema_getter=_get_schema,
+        equivalent_snowpark_names=[set()] * len(new_column_names),
     )
     context.set_df_before_projection(result_container)
     return result_container
@@ -507,9 +632,11 @@ def map_to_schema(
     for field in rel.to_schema.schema.struct.fields:
         if field.name in already_existing_columns:
             if count_case_insensitive_column_names[field.name.lower()] > 1:
-
+                exception = AnalysisException(
                     f"[AMBIGUOUS_COLUMN_OR_FIELD] Column or field `{field.name}` is ambiguous and has {len(input_container.column_map.spark_to_col[field.name])} matches."
                 )
+                attach_custom_error_code(exception, ErrorCodes.AMBIGUOUS_COLUMN_NAME)
+                raise exception
             snowpark_name = None
             for name in input_container.column_map.spark_to_col:
                 if name.lower() == field.name.lower():
@@ -526,17 +653,23 @@ def map_to_schema(
                 and snowpark_field.nullable
                 and not isinstance(snowpark_field.datatype, StructType)
             ):
-
+                exception = AnalysisException(
                     f"[NULLABLE_COLUMN_OR_FIELD] Column or field `{field.name}` is nullable while it's required to be non-nullable."
                 )
+                attach_custom_error_code(
+                    exception, ErrorCodes.INVALID_OPERATION
+                )
+                raise exception

             # Check type casting validation
             if not _can_cast_column_in_schema(
                 snowpark_field.datatype, proto_to_snowpark_type(field.data_type)
             ):
-
+                exception = AnalysisException(
                     f"""[INVALID_COLUMN_OR_FIELD_DATA_TYPE] Column or field `{field.name}` is of type "{map_snowpark_to_pyspark_types(proto_to_snowpark_type(field.data_type))}" while it's required to be "{map_snowpark_to_pyspark_types(snowpark_field.datatype)}"."""
                 )
+                attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+                raise exception
     if len(already_existing_columns) == len(new_column_names):
         # All columns already exist, we're doing a simple update.
         snowpark_new_column_names = []
@@ -615,6 +748,7 @@ def map_to_schema(
         snowpark_column_types=[field.datatype for field in snowpark_schema.fields],
         column_metadata=column_metadata,
         parent_column_name_map=input_container.column_map,
+        equivalent_snowpark_names=[set()] * len(new_column_names),
     )

@@ -624,7 +758,9 @@ def map_with_columns_renamed(
     """
     Rename columns in a DataFrame and return a container.
     """
-    input_container =
+    input_container = without_internal_columns(
+        map_relation(rel.with_columns_renamed.input)
+    )
     input_df = input_container.dataframe
     rename_columns_map = dict(rel.with_columns_renamed.rename_columns_map)

@@ -660,8 +796,6 @@ def map_with_columns_renamed(
             # This just copies the renames from previous computed dataframe
             rename_columns_map[key] = value

-    existing_columns = input_container.column_map.get_spark_columns()
-
     def _column_exists_error(name: str) -> AnalysisException:
         return AnalysisException(
             f"[COLUMN_ALREADY_EXISTS] The column `{name}` already exists. Consider to choose another name or rename the existing column."
@@ -700,27 +834,42 @@ def map_with_columns_renamed(
             raise _column_exists_error(new_name)
         seen.add(new_name)

-
-
+    new_spark_names = []
+    new_snowpark_names = []
+    qualifiers = []
+    equivalent_snowpark_names = []
+    for c in column_map.columns:
+        spark_name = c.spark_name
+        new_spark_name = None
         if global_config.spark_sql_caseSensitive:
-
-        elif rename_columns_map.get(
-
-
+            new_spark_name = rename_columns_map.get(spark_name, None)
+        elif rename_columns_map.get(spark_name.lower(), None) is not None:
+            new_spark_name = rename_columns_map_original.get(
+                rename_columns_map.get(spark_name.lower())
            )
+
+        if new_spark_name:
+            new_spark_names.append(new_spark_name)
+            new_snowpark_names.append(make_unique_snowpark_name(new_spark_name))
+            qualifiers.append(set())
+            equivalent_snowpark_names.append(set())
         else:
-
+            new_spark_names.append(c.spark_name)
+            new_snowpark_names.append(c.snowpark_name)
+            qualifiers.append(c.qualifiers)
+            equivalent_snowpark_names.append(c.equivalent_snowpark_names)

     # Creating a new df to avoid updating the state of cached dataframe.
     new_df = input_df.select("*")
     result_container = DataFrameContainer.create_with_column_mapping(
         dataframe=new_df,
-        spark_column_names=
+        spark_column_names=new_spark_names,
         snowpark_column_names=input_container.column_map.get_snowpark_columns(),
         column_qualifiers=input_container.column_map.get_qualifiers(),
         parent_column_name_map=input_container.column_map.get_parent_column_name_map(),
         table_name=input_container.table_name,
         alias=input_container.alias,
+        equivalent_snowpark_names=equivalent_snowpark_names,
     )
     result_container.column_map.rename_chains = rename_columns_map

@@ -733,7 +882,7 @@ def map_with_columns(
     """
     Add columns to a DataFrame and return a container.
     """
-    input_container = map_relation(rel.with_columns.input)
+    input_container = without_internal_columns(map_relation(rel.with_columns.input))
     input_df = input_container.dataframe
     with_columns = []
     for alias in rel.with_columns.aliases:
@@ -761,9 +910,11 @@ def map_with_columns(
         name = names_list[0]
         name_normalized = input_container.column_map._normalized_spark_name(name)
         if name_normalized in seen_columns:
-
+            exception = ValueError(
                 f"[COLUMN_ALREADY_EXISTS] The column `{name}` already exists."
             )
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
         seen_columns.add(name_normalized)
         # If the column name is already in the DataFrame, we replace it, so we use the
         # mapping to get the correct column name.
@@ -772,7 +923,9 @@ def map_with_columns(
             [name]
         )
         if len(all_instances_of_spark_column_name) == 0:
-
+            exception = KeyError(f"Spark column name {name} does not exist")
+            attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+            raise exception
         with_columns_names.extend(all_instances_of_spark_column_name)
         with_columns_exprs.extend(
             [expr.col] * len(all_instances_of_spark_column_name)
@@ -796,6 +949,7 @@ def map_with_columns(
         new_spark_columns,
         new_snowpark_columns,
         qualifiers,
+        equivalent_snowpark_names,
     ) = input_container.column_map.with_columns(new_spark_names, with_columns_names)

     # dedup the change in columns at snowpark name level, this is required by the with columns functions
@@ -843,6 +997,7 @@ def map_with_columns(
         parent_column_name_map=input_container.column_map,
         table_name=input_container.table_name,
         alias=input_container.alias,
+        equivalent_snowpark_names=equivalent_snowpark_names,
     )

@@ -852,7 +1007,9 @@ def map_unpivot(
     # Spark API: df.unpivot([id_columns], [unpivot_columns], var_column, val_column)
     # Snowpark API: df.unpivot(val_column, var_column, [unpivot_columns])
     if rel.unpivot.HasField("values") and len(rel.unpivot.values.values) == 0:
-
+        exception = SparkException.unpivot_requires_value_columns()
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception

     input_container = map_relation(rel.unpivot.input)
     input_df = input_container.dataframe
@@ -893,7 +1050,7 @@ def map_unpivot(
         )
         if not get_lease_common_ancestor_classes(type_list):
             # TODO: match exactly how spark shows mismatched columns
-
+            exception = SparkException.unpivot_value_data_type_mismatch(
                 ", ".join(
                     [
                         f"{dtype} {column_name}"
@@ -901,6 +1058,8 @@ def map_unpivot(
                     ]
                 )
             )
+            attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+            raise exception
         return not is_same_type and contains_numeric_type

     def get_column_names(
@@ -984,7 +1143,7 @@ def map_unpivot(
     column_project = []
     column_reverse_project = []
     snowpark_columns = []
-    qualifiers = []
+    qualifiers: list[set[ColumnQualifier]] = []
     for c in input_container.column_map.get_snowpark_columns():
         c_name = snowpark_functions_col(c, input_container.column_map).get_name()
         if c_name in unpivot_col_names:
@@ -1012,7 +1171,7 @@ def map_unpivot(
             )
             snowpark_columns.append(c)
             qualifiers.append(
-                input_container.column_map.
+                input_container.column_map.get_qualifiers_for_snowpark_column(c)
             )

     # Without the case when postprocessing, the result Spark dataframe is:
@@ -1057,7 +1216,7 @@ def map_unpivot(
         snowpark_functions_col(snowpark_value_column_name, input_container.column_map)
     )
     snowpark_columns.append(snowpark_value_column_name)
-    qualifiers.extend([
+    qualifiers.extend([set() for _ in range(2)])

     result = (
         input_df.select(*column_project)
@@ -1075,6 +1234,7 @@ def map_unpivot(
         snowpark_column_names=snowpark_columns,
         column_qualifiers=qualifiers,
         parent_column_name_map=input_container.column_map,
+        equivalent_snowpark_names=[set()] * len(snowpark_columns),
     )

@@ -1097,7 +1257,9 @@ def map_group_map(
         snowpark_grouping_expressions.append(snowpark_column.col)
         group_name_list.append(new_name)
     if rel.group_map.func.python_udf is None:
-
+        exception = ValueError("group_map relation without python udf is not supported")
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception

     python_major, python_minor = rel.group_map.func.python_udf.python_ver.split(".")
     is_compatible_python = sys.version_info.major == int(
@@ -1106,7 +1268,7 @@ def map_group_map(

     output_type = proto_to_snowpark_type(rel.group_map.func.python_udf.output_type)

-    if not is_compatible_python or TEST_FLAG_FORCE_CREATE_SPROC:
+    if not is_compatible_python or TEST_FLAG_FORCE_CREATE_SPROC or tcm.TCM_MODE:
         original_columns = None
         if input_container.column_map is not None:
             original_columns = [
@@ -1124,10 +1286,15 @@ def map_group_map(
         group_by_df = input_df.group_by(*snowpark_grouping_expressions)
         inner_df = group_by_df._dataframe

-
-
-
-
+        renamed_columns = [f"snowflake_jtf_{column}" for column in input_df.columns]
+        tfc = snowpark_fn.call_table_function(
+            apply_udtf_temp_name, *renamed_columns
+        ).over(partition_by=snowpark_grouping_expressions)
+
+        result = (
+            inner_df.to_df(renamed_columns)
+            .join_table_function(tfc)
+            .drop(*renamed_columns)
         )
     else:
         (
@@ -1146,6 +1313,7 @@ def map_group_map(
         snowpark_column_names=result.columns,
         column_qualifiers=None,
         parent_column_name_map=input_container.column_map,
+        equivalent_snowpark_names=None,
     )
