snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +680 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +237 -23
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +123 -5
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +85 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
- snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
- snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +110 -48
- snowflake/snowpark_connect/server.py +546 -456
- snowflake/snowpark_connect/server_common/__init__.py +500 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +187 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +163 -22
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/map_row_ops.py

@@ -1,15 +1,15 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
-
-
+import pandas
 import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 from pyspark.errors.exceptions.base import AnalysisException, IllegalArgumentException
 
 import snowflake.snowpark_connect.relation.utils as utils
 from snowflake import snowpark
-from snowflake.snowpark.
+from snowflake.snowpark._internal.error_message import SnowparkClientExceptionMessages
+from snowflake.snowpark.functions import col, expr as snowpark_expr, lit
 from snowflake.snowpark.types import (
     BooleanType,
     ByteType,
@@ -20,21 +20,90 @@ from snowflake.snowpark.types import (
     LongType,
     NullType,
     ShortType,
+    StructField,
+    StructType,
+)
+from snowflake.snowpark_connect.column_name_handler import (
+    ColumnNameMap,
+    schema_getter,
+    set_schema_getter,
 )
-from snowflake.snowpark_connect.column_name_handler import ColumnNameMap, schema_getter
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.expression.literal import get_literal_field_and_name
 from snowflake.snowpark_connect.expression.map_expression import (
     map_single_column_expression,
 )
 from snowflake.snowpark_connect.expression.typer import ExpressionTyper
 from snowflake.snowpark_connect.relation.map_relation import map_relation
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    without_internal_columns,
+)
+from snowflake.snowpark_connect.utils.identifiers import (
+    split_fully_qualified_spark_name,
+)
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
 
 
+def cast_columns(
+    df_container: DataFrameContainer,
+    df_dtypes: list[snowpark.types.DataType],
+    target_dtypes: list[snowpark.types.DataType],
+    column_map: ColumnNameMap,
+):
+    df: snowpark.DataFrame = df_container.dataframe
+    if df_dtypes == target_dtypes:
+        return df_container
+    # Use cached schema if available to avoid triggering extra queries
+    if (
+        hasattr(df_container, "cached_schema_getter")
+        and df_container.cached_schema_getter is not None
+    ):
+        df_schema = df_container.cached_schema_getter()
+    else:
+        df_schema = df.schema  # Get current schema
+    new_columns = []
+
+    for i, field in enumerate(df_schema.fields):
+        col_name = field.name
+        current_type = field.datatype
+        target_type = target_dtypes[i]
+
+        if current_type != target_type:
+            new_columns.append(df[col_name].cast(target_type).alias(col_name))
+        else:
+            new_columns.append(df[col_name])
+
+    new_df = df.select(new_columns)
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=new_df,
+        spark_column_names=column_map.get_spark_columns(),
+        snowpark_column_names=column_map.get_snowpark_columns(),
+        snowpark_column_types=target_dtypes,
+        column_metadata=column_map.column_metadata,
+        parent_column_name_map=column_map,
+    )
+
+
+def get_schema_from_result(
+    result: DataFrameContainer,
+) -> StructType:
+    """
+    Get schema from a DataFrameContainer, using cached schema if available to avoid extra queries.
+    """
+    if (
+        hasattr(result, "cached_schema_getter")
+        and result.cached_schema_getter is not None
+    ):
+        return result.cached_schema_getter()
+    else:
+        return result.dataframe.schema
+
+
 def map_deduplicate(
     rel: relation_proto.Relation,
 ) -> DataFrameContainer:
@@ -43,16 +112,18 @@ def map_deduplicate(
 
     The deduplicate is a list of columns that is applied to the DataFrame.
     """
-    input_container = map_relation(rel.deduplicate.input)
+    input_container = without_internal_columns(map_relation(rel.deduplicate.input))
     input_df = input_container.dataframe
 
     if (
         rel.deduplicate.HasField("within_watermark")
         and rel.deduplicate.within_watermark
     ):
-
+        exception = AnalysisException(
             "dropDuplicatesWithinWatermark is not supported with batch DataFrames/DataSets"
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
 
     if (
         rel.deduplicate.HasField("all_columns_as_keys")
@@ -81,7 +152,7 @@ def map_dropna(
     """
     Drop NA values from the input DataFrame.
     """
-    input_container = map_relation(rel.drop_na.input)
+    input_container = without_internal_columns(map_relation(rel.drop_na.input))
     input_df = input_container.dataframe
 
     if rel.drop_na.HasField("min_non_nulls"):
@@ -122,15 +193,23 @@ def map_fillna(
 
     The `fill_value` is a scalar value that will be used to replace NaN values.
     """
-    input_container = map_relation(rel.fill_na.input)
+    input_container = without_internal_columns(map_relation(rel.fill_na.input))
    input_df = input_container.dataframe
 
     if len(rel.fill_na.cols) > 0:
+        if rel.fill_na.cols == ["*"]:
+            # Expand "*" to all columns
+            spark_col_names = input_container.column_map.get_spark_columns()
+        else:
+            spark_col_names = list(rel.fill_na.cols)
+
+        # We don't validate the fully qualified spark name here as fillNa is no-op for structured type colums.
+        # It only works for scalar type columns like float, int, string or bool.
         columns: list[str] = [
             input_container.column_map.get_snowpark_column_name_from_spark_column_name(
-                c
+                split_fully_qualified_spark_name(c)[0]
             )
-            for c in
+            for c in spark_col_names
         ]
         values = [get_literal_field_and_name(v)[0] for v in rel.fill_na.values]
         if len(values) == 1:
@@ -177,29 +256,16 @@ def map_union(
 
     The two DataFrames must have the same schema.
     """
-    left_result = map_relation(rel.set_op.left_input)
-    right_result = map_relation(rel.set_op.right_input)
+    left_result = without_internal_columns(map_relation(rel.set_op.left_input))
+    right_result = without_internal_columns(map_relation(rel.set_op.right_input))
     left_df = left_result.dataframe
     right_df = right_result.dataframe
     allow_missing_columns = bool(rel.set_op.allow_missing_columns)
 
     # workaround for unstructured type vs structured type
     # Use cached schema if available to avoid triggering extra queries
-
-
-        and left_result.cached_schema_getter is not None
-    ):
-        left_schema = left_result.cached_schema_getter()
-    else:
-        left_schema = left_df.schema
-
-    if (
-        hasattr(right_result, "cached_schema_getter")
-        and right_result.cached_schema_getter is not None
-    ):
-        right_schema = right_result.cached_schema_getter()
-    else:
-        right_schema = right_df.schema
+    left_schema = get_schema_from_result(left_result)
+    right_schema = get_schema_from_result(right_result)
 
     left_dtypes = [field.datatype for field in left_schema.fields]
     right_dtypes = [field.datatype for field in right_schema.fields]
@@ -207,7 +273,9 @@ def map_union(
     spark_sql_ansi_enabled = global_config.spark_sql_ansi_enabled
     if left_dtypes != right_dtypes and not rel.set_op.by_name:
         if len(left_dtypes) != len(right_dtypes):
-
+            exception = AnalysisException("UNION: the number of columns must match")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
         target_left_dtypes, target_right_dtypes = [], []
         for left_type, right_type in zip(left_dtypes, right_dtypes):
             match (left_type, right_type):
@@ -235,6 +303,29 @@ def map_union(
                     # Union of any type with null type is of the other type
                     target_left_dtypes.append(other_t)
                     target_right_dtypes.append(other_t)
+                case (snowpark.types.DecimalType(), snowpark.types.DecimalType()):
+                    # Widen decimal types to accommodate both sides
+                    # Calculate the maximum scale and maximum integer digits
+                    left_integer_digits = left_type.precision - left_type.scale
+                    right_integer_digits = right_type.precision - right_type.scale
+
+                    # The common type needs to accommodate:
+                    # - The maximum number of digits after the decimal point (scale)
+                    # - The maximum number of digits before the decimal point (integer digits)
+                    common_scale = max(left_type.scale, right_type.scale)
+                    common_integer_digits = max(
+                        left_integer_digits, right_integer_digits
+                    )
+                    common_precision = min(38, common_scale + common_integer_digits)
+
+                    # Ensure scale doesn't exceed precision
+                    common_scale = min(common_scale, common_precision)
+
+                    common_type = snowpark.types.DecimalType(
+                        common_precision, common_scale
+                    )
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
                 case (snowpark.types.BooleanType(), _) | (
                     _,
                     snowpark.types.BooleanType(),
@@ -243,54 +334,31 @@ def map_union(
                         not spark_sql_ansi_enabled
                         or snowpark.types.StringType() not in [left_type, right_type]
                     ):  # In ansi mode , string type union boolean type is acceptable
-
+                        exception = AnalysisException(
                             f"""[INCOMPATIBLE_COLUMN_TYPE] UNION can only be performed on tables with compatible column types. "{str(left_type)}" type which is not compatible with "{str(right_type)}". """
                         )
+                        attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+                        raise exception
                     target_left_dtypes.append(left_type)
                     target_right_dtypes.append(right_type)
+                case (
+                    snowpark.types.TimestampType()
+                    | snowpark.types.DateType()
+                    | snowpark.types._NumericType(),
+                    snowpark.types.StringType(),
+                ) | (
+                    snowpark.types.StringType(),
+                    snowpark.types.TimestampType()
+                    | snowpark.types.DateType()
+                    | snowpark.types._NumericType(),
+                ) if not spark_sql_ansi_enabled:
+                    common_type = snowpark.types.StringType()
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
                 case _:
                     target_left_dtypes.append(left_type)
                     target_right_dtypes.append(right_type)
 
-def cast_columns(
-    df_container: DataFrameContainer,
-    df_dtypes: list[snowpark.types.DataType],
-    target_dtypes: list[snowpark.types.DataType],
-    column_map: ColumnNameMap,
-):
-    df: snowpark.DataFrame = df_container.dataframe
-    if df_dtypes == target_dtypes:
-        return df_container
-    # Use cached schema if available to avoid triggering extra queries
-    if (
-        hasattr(df_container, "cached_schema_getter")
-        and df_container.cached_schema_getter is not None
-    ):
-        df_schema = df_container.cached_schema_getter()
-    else:
-        df_schema = df.schema  # Get current schema
-    new_columns = []
-
-    for i, field in enumerate(df_schema.fields):
-        col_name = field.name
-        current_type = field.datatype
-        target_type = target_dtypes[i]
-
-        if current_type != target_type:
-            new_columns.append(df[col_name].cast(target_type).alias(col_name))
-        else:
-            new_columns.append(df[col_name])
-
-    new_df = df.select(new_columns)
-    return DataFrameContainer.create_with_column_mapping(
-        dataframe=new_df,
-        spark_column_names=column_map.get_spark_columns(),
-        snowpark_column_names=column_map.get_snowpark_columns(),
-        snowpark_column_types=target_dtypes,
-        column_metadata=column_map.column_metadata,
-        parent_column_name_map=column_map,
-    )
-
     left_result = cast_columns(
         left_result,
         left_dtypes,
@@ -318,23 +386,37 @@ def map_union(
     right_column_map = right_result.column_map
     columns_to_restore: dict[str, tuple[str, str]] = {}
 
-
+    original_right_schema = right_df.schema
+    right_renamed_fields = []
+    for field in original_right_schema.fields:
         spark_name = (
-            right_column_map.get_spark_column_name_from_snowpark_column_name(
+            right_column_map.get_spark_column_name_from_snowpark_column_name(
+                field.name
+            )
+        )
+        right_df = right_df.withColumnRenamed(field.name, spark_name)
+        columns_to_restore[spark_name.upper()] = (spark_name, field.name)
+        right_renamed_fields.append(
+            StructField(spark_name, field.datatype, field.nullable)
         )
-
-        columns_to_restore[spark_name.upper()] = (spark_name, column)
+    set_schema_getter(right_df, lambda: StructType(right_renamed_fields))
 
-
+    original_left_schema = left_df.schema
+    left_renamed_fields = []
+    for field in original_left_schema.fields:
         spark_name = (
-            left_column_map.get_spark_column_name_from_snowpark_column_name(
+            left_column_map.get_spark_column_name_from_snowpark_column_name(
+                field.name
+            )
+        )
+        left_df = left_df.withColumnRenamed(field.name, spark_name)
+        columns_to_restore[spark_name.upper()] = (spark_name, field.name)
+        left_renamed_fields.append(
+            StructField(spark_name, field.datatype, field.nullable)
        )
-
-        columns_to_restore[spark_name.upper()] = (spark_name, column)
+    set_schema_getter(left_df, lambda: StructType(left_renamed_fields))
 
-    result = left_df
-        right_df, allow_missing_columns=allow_missing_columns
-    )
+    result = _union_by_name_optimized(left_df, right_df, allow_missing_columns)
 
     if allow_missing_columns:
         spark_columns = []
@@ -421,8 +503,8 @@ def map_intersect(
     | b| 3|
     +---+---+
     """
-    left_result = map_relation(rel.set_op.left_input)
-    right_result = map_relation(rel.set_op.right_input)
+    left_result = without_internal_columns(map_relation(rel.set_op.left_input))
+    right_result = without_internal_columns(map_relation(rel.set_op.right_input))
     left_df = left_result.dataframe
     right_df = right_result.dataframe
 
@@ -484,11 +566,53 @@ def map_except(
     | c| 4|
     +---+---+
     """
-    left_result = map_relation(rel.set_op.left_input)
-    right_result = map_relation(rel.set_op.right_input)
+    left_result = without_internal_columns(map_relation(rel.set_op.left_input))
+    right_result = without_internal_columns(map_relation(rel.set_op.right_input))
     left_df = left_result.dataframe
     right_df = right_result.dataframe
 
+    # workaround for unstructured type vs structured type
+    # Use cached schema if available to avoid triggering extra queries
+    left_schema = get_schema_from_result(left_result)
+    right_schema = get_schema_from_result(right_result)
+
+    left_dtypes = [field.datatype for field in left_schema.fields]
+    right_dtypes = [field.datatype for field in right_schema.fields]
+
+    if left_dtypes != right_dtypes and not rel.set_op.by_name:
+        if len(left_dtypes) != len(right_dtypes):
+            exception = AnalysisException("UNION: the number of columns must match")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
+        target_left_dtypes, target_right_dtypes = [], []
+        for left_type, right_type in zip(left_dtypes, right_dtypes):
+            match (left_type, right_type):
+                case (snowpark.types._NumericType(), snowpark.types.StringType()) | (
+                    snowpark.types.StringType(),
+                    snowpark.types._NumericType(),
+                ):
+                    common_type = snowpark.types.StringType()
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
+                case _:
+                    target_left_dtypes.append(left_type)
+                    target_right_dtypes.append(right_type)
+
+        left_result = cast_columns(
+            left_result,
+            left_dtypes,
+            target_left_dtypes,
+            left_result.column_map,
+        )
+        right_result = cast_columns(
+            right_result,
+            right_dtypes,
+            target_right_dtypes,
+            right_result.column_map,
+        )
+        left_df = left_result.dataframe
+        right_df = right_result.dataframe
+
     if rel.set_op.is_all:
         # Snowflake except removes all duplicated rows. In order to handle the case,
         # we add a partition row number column to the df to make duplicated rows unique to
@@ -573,13 +697,18 @@ def map_filter(
 
 def map_limit(
     rel: relation_proto.Relation,
-) -> DataFrameContainer:
+) -> DataFrameContainer | pandas.DataFrame:
     """
     Limit a DataFrame based on a Relation's limit.
 
     The limit is an integer that is applied to the DataFrame.
     """
-
+
+    input_container = without_internal_columns(map_relation(rel.limit.input))
+
+    if isinstance(input_container, pandas.DataFrame):
+        return input_container.head(rel.limit.limit)
+
     input_df = input_container.dataframe
 
     result: snowpark.DataFrame = input_df.limit(rel.limit.limit)
@@ -601,7 +730,7 @@ def map_offset(
 
     The offset is an integer that is applied to the DataFrame.
     """
-    input_container = map_relation(rel.offset.input)
+    input_container = without_internal_columns(map_relation(rel.offset.input))
     input_df = input_container.dataframe
 
     # TODO: This is a terrible way to have to do this, but Snowpark does not
@@ -629,7 +758,7 @@ def map_replace(
     values to replace. The values in the dictionary are the values to replace
     and the keys are the values to replace them with.
     """
-    result = map_relation(rel.replace.input)
+    result = without_internal_columns(map_relation(rel.replace.input))
     input_df = result.dataframe
     ordered_columns = input_df.columns
     column_map = result.column_map
@@ -752,12 +881,14 @@ def map_sample(
     """
     Sample a DataFrame based on a Relation's sample.
    """
-    input_container = map_relation(rel.sample.input)
+    input_container = without_internal_columns(map_relation(rel.sample.input))
     input_df = input_container.dataframe
 
     frac = rel.sample.upper_bound - rel.sample.lower_bound
     if frac < 0 or frac > 1:
-
+        exception = IllegalArgumentException("Sample fraction must be between 0 and 1")
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
     # The seed argument is not supported here. There are a number of reasons that implementing
     # this will be complicated in Snowflake. Here is a list of complications:
     #
@@ -772,9 +903,11 @@ def map_sample(
     # these issues.
     if rel.sample.with_replacement:
         # TODO: Use a random number generator with ROW_NUMBER and SELECT.
-
+        exception = SnowparkConnectNotImplementedError(
             "Sample with replacement is not supported"
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
     else:
         result: snowpark.DataFrame = input_df.sample(frac=frac)
         return DataFrameContainer(
@@ -794,7 +927,7 @@ def map_tail(
 
     The tail is an integer that is applied to the DataFrame.
     """
-    input_container = map_relation(rel.tail.input)
+    input_container = without_internal_columns(map_relation(rel.tail.input))
     input_df = input_container.dataframe
 
     num_rows = input_df.count()
@@ -809,3 +942,89 @@ def map_tail(
         alias=input_container.alias,
         cached_schema_getter=lambda: input_df.schema,
     )
+
+
+def _union_by_name_optimized(
+    left_df: snowpark.DataFrame,
+    right_df: snowpark.DataFrame,
+    allow_missing_columns: bool = False,
+) -> snowpark.DataFrame:
+    """
+    This implementation is an optimized version of Snowpark's Dataframe::_union_by_name_internal.
+    The only change is, that it avoids redundant schema queries that occur in the standard Snowpark,
+    by reusing already-fetched/calculated schemas.
+    """
+
+    left_schema = left_df.schema
+    right_schema = right_df.schema
+
+    left_cols = {field.name for field in left_schema.fields}
+    right_cols = {field.name for field in right_schema.fields}
+    right_field_map = {field.name: field for field in right_schema.fields}
+
+    missing_left = right_cols - left_cols
+    missing_right = left_cols - right_cols
+
+    def add_nulls(
+        missing_cols: set[str], to_df: snowpark.DataFrame, from_df: snowpark.DataFrame
+    ) -> snowpark.DataFrame:
+        dt_map = {field.name: field.datatype for field in from_df.schema.fields}
+        result = to_df.select(
+            "*",
+            *[lit(None).cast(dt_map[col]).alias(col) for col in missing_cols],
+        )
+
+        result_fields = []
+        for field in to_df.schema.fields:
+            result_fields.append(
+                StructField(field.name, field.datatype, field.nullable)
+            )
+        for col_name in missing_cols:
+            from_field = next(
+                field for field in from_df.schema.fields if field.name == col_name
+            )
+            result_fields.append(
+                StructField(col_name, from_field.datatype, from_field.nullable)
+            )
+
+        set_schema_getter(result, lambda: StructType(result_fields))
+
+        return result
+
+    if missing_left or missing_right:
+        if allow_missing_columns:
+            left = left_df
+            right = right_df
+            if missing_left:
+                left = add_nulls(missing_left, left, right)
+            if missing_right:
+                right = add_nulls(missing_right, right, left)
+            result = left._union_by_name_internal(right, is_all=True)
+
+            result_fields = []
+            for field in left_schema.fields:
+                result_fields.append(
+                    StructField(field.name, field.datatype, field.nullable)
+                )
+            for col_name in missing_left:
+                right_field = right_field_map[col_name]
+                result_fields.append(
+                    StructField(col_name, right_field.datatype, right_field.nullable)
+                )
+
+            set_schema_getter(result, lambda: StructType(result_fields))
+            return result
+        else:
+            exception = (
+                SnowparkClientExceptionMessages.DF_CANNOT_RESOLVE_COLUMN_NAME_AMONG(
+                    missing_left, missing_right
+                )
+            )
+            attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+            raise exception
+
+    result = left_df.unionAllByName(
+        right_df, allow_missing_columns=allow_missing_columns
+    )
+    set_schema_getter(result, lambda: left_df.schema)
+    return result
snowflake/snowpark_connect/relation/map_show_string.py

@@ -15,6 +15,9 @@ from snowflake.snowpark_connect.column_name_handler import set_schema_getter
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.map_relation import map_relation
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    without_internal_columns,
+)
 
 
 def map_show_string(rel: relation_proto.Relation) -> pandas.DataFrame:
@@ -26,14 +29,17 @@ def map_show_string(rel: relation_proto.Relation) -> pandas.DataFrame:
     Buffer object as a single cell.
     """
     input_df_container: DataFrameContainer = map_relation(rel.show_string.input)
-
-
+    filtered_container = without_internal_columns(input_df_container)
+    display_df = filtered_container.dataframe
+    display_spark_columns = filtered_container.column_map.get_spark_columns()
+
+    input_df = _handle_datetype_columns(display_df)
 
     show_string = input_df._show_string_spark(
         num_rows=rel.show_string.num_rows,
         truncate=rel.show_string.truncate,
         vertical=rel.show_string.vertical,
-        _spark_column_names=
+        _spark_column_names=display_spark_columns,
         _spark_session_tz=global_config.spark_sql_session_timeZone,
     )
     return pandas.DataFrame({"show_string": [show_string]})
@@ -44,14 +50,15 @@ def map_repr_html(rel: relation_proto.Relation) -> pandas.DataFrame:
     Generate the html string representation of the input dataframe.
     """
     input_df_container: DataFrameContainer = map_relation(rel.html_string.input)
-    input_df = input_df_container.dataframe
 
+    filtered_container = without_internal_columns(input_df_container)
+    input_df = filtered_container.dataframe
     input_panda = input_df.toPandas()
     input_panda.rename(
         columns={
             analyzer_utils.unquote_if_quoted(
-
-            ):
+                filtered_container.column_map.get_snowpark_columns()[i]
+            ): filtered_container.column_map.get_spark_columns()[i]
             for i in range(len(input_panda.columns))
         },
         inplace=True,