snowpark-connect 0.20.2__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +3 -2
- snowflake/snowpark_connect/column_name_handler.py +6 -65
- snowflake/snowpark_connect/config.py +28 -14
- snowflake/snowpark_connect/dataframe_container.py +242 -0
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +13 -23
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +9 -5
- snowflake/snowpark_connect/expression/map_extension.py +2 -1
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +8 -7
- snowflake/snowpark_connect/expression/map_unresolved_function.py +279 -43
- snowflake/snowpark_connect/expression/map_unresolved_star.py +8 -8
- snowflake/snowpark_connect/expression/map_update_fields.py +1 -1
- snowflake/snowpark_connect/expression/typer.py +6 -6
- snowflake/snowpark_connect/proto/control_pb2.py +17 -16
- snowflake/snowpark_connect/proto/control_pb2.pyi +17 -17
- snowflake/snowpark_connect/proto/control_pb2_grpc.py +12 -63
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +15 -14
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +19 -14
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +27 -26
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +74 -68
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +5 -5
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +25 -17
- snowflake/snowpark_connect/relation/map_aggregate.py +72 -47
- snowflake/snowpark_connect/relation/map_catalog.py +2 -2
- snowflake/snowpark_connect/relation/map_column_ops.py +207 -144
- snowflake/snowpark_connect/relation/map_crosstab.py +25 -6
- snowflake/snowpark_connect/relation/map_extension.py +81 -56
- snowflake/snowpark_connect/relation/map_join.py +72 -63
- snowflake/snowpark_connect/relation/map_local_relation.py +35 -20
- snowflake/snowpark_connect/relation/map_map_partitions.py +21 -16
- snowflake/snowpark_connect/relation/map_relation.py +22 -16
- snowflake/snowpark_connect/relation/map_row_ops.py +232 -146
- snowflake/snowpark_connect/relation/map_sample_by.py +15 -8
- snowflake/snowpark_connect/relation/map_show_string.py +42 -5
- snowflake/snowpark_connect/relation/map_sql.py +155 -78
- snowflake/snowpark_connect/relation/map_stats.py +88 -39
- snowflake/snowpark_connect/relation/map_subquery_alias.py +13 -14
- snowflake/snowpark_connect/relation/map_udtf.py +6 -9
- snowflake/snowpark_connect/relation/read/map_read.py +8 -3
- snowflake/snowpark_connect/relation/read/map_read_csv.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_json.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_socket.py +7 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +25 -16
- snowflake/snowpark_connect/relation/read/map_read_text.py +7 -7
- snowflake/snowpark_connect/relation/utils.py +11 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +15 -12
- snowflake/snowpark_connect/relation/write/map_write.py +199 -40
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +3 -2
- snowflake/snowpark_connect/server.py +34 -4
- snowflake/snowpark_connect/type_mapping.py +2 -23
- snowflake/snowpark_connect/utils/cache.py +27 -22
- snowflake/snowpark_connect/utils/context.py +33 -17
- snowflake/snowpark_connect/utils/{attribute_handling.py → identifiers.py} +47 -0
- snowflake/snowpark_connect/utils/session.py +41 -34
- snowflake/snowpark_connect/utils/telemetry.py +1 -2
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/METADATA +5 -3
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/RECORD +67 -64
- snowpark_connect-0.21.0.dist-info/licenses/LICENSE-binary +568 -0
- snowpark_connect-0.21.0.dist-info/licenses/NOTICE-binary +1533 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/read/map_read_text.py

@@ -7,7 +7,7 @@ import typing
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto

 from snowflake import snowpark
-from snowflake.snowpark_connect.
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.read.utils import (
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
@@ -71,7 +71,7 @@ def map_read_text(
     schema: snowpark.types.StructType | None,
     session: snowpark.Session,
     paths: list[str],
-) ->
+) -> DataFrameContainer:
     """
     Read a TEXT file into a Snowpark DataFrame.
     """
@@ -98,9 +98,9 @@ def map_read_text(
     renamed_df, snowpark_column_names = rename_columns_as_snowflake_standard(
         df, rel.common.plan_id
     )
-    return
-        renamed_df,
-        spark_column_names,
-        snowpark_column_names,
-        [f.datatype for f in df.schema.fields],
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=renamed_df,
+        spark_column_names=spark_column_names,
+        snowpark_column_names=snowpark_column_names,
+        snowpark_column_types=[f.datatype for f in df.schema.fields],
     )
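The change repeated across the read and write mappers in this release is that they now hand back a DataFrameContainer (the new snowflake/snowpark_connect/dataframe_container.py, +242 lines) instead of a bare DataFrame plus parallel column lists, and callers read .dataframe and .column_map from it. The container itself is not shown in this diff, so the following is only a sketch of the interface as it is exercised in the hunks on this page; the class and field names ColumnInfo, ColumnMap, and DataFrameContainerSketch are invented for illustration.

from dataclasses import dataclass, field

from snowflake import snowpark


@dataclass
class ColumnInfo:
    # One Spark-visible column and its Snowpark-side counterpart.
    spark_name: str
    snowpark_name: str
    snowpark_type: snowpark.types.DataType


@dataclass
class ColumnMap:
    columns: list[ColumnInfo] = field(default_factory=list)

    def get_spark_columns(self) -> list[str]:
        return [c.spark_name for c in self.columns]

    def get_snowpark_columns(self) -> list[str]:
        return [c.snowpark_name for c in self.columns]


@dataclass
class DataFrameContainerSketch:
    dataframe: snowpark.DataFrame
    column_map: ColumnMap

    @classmethod
    def create_with_column_mapping(
        cls,
        dataframe: snowpark.DataFrame,
        spark_column_names: list[str],
        snowpark_column_names: list[str],
        snowpark_column_types: list[snowpark.types.DataType],
    ) -> "DataFrameContainerSketch":
        # Zip the three parallel lists that map_read_text used to return as a tuple.
        columns = [
            ColumnInfo(spark, snow, dtype)
            for spark, snow, dtype in zip(
                spark_column_names, snowpark_column_names, snowpark_column_types
            )
        ]
        return cls(dataframe=dataframe, column_map=ColumnMap(columns))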
snowflake/snowpark_connect/relation/utils.py

@@ -32,6 +32,7 @@ from snowflake.snowpark_connect.column_name_handler import (
     ColumnNameMap,
     make_column_names_snowpark_compatible,
 )
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.map_relation import map_relation

 TYPE_MAP_FOR_TO_SCHEMA = {
@@ -91,7 +92,9 @@ TYPE_MAP_FOR_TO_SCHEMA = {


 def get_df_with_partition_row_number(
-
+    container: DataFrameContainer,
+    plan_id: int | None,
+    row_number_column_name: str,
 ) -> snowpark.DataFrame:
     """
     Add a row number for each row in each partition for the given df, where
@@ -106,21 +109,24 @@ def get_df_with_partition_row_number(
     |  c|  4|          |  c|  4|           0|
     +---+---+          +---+---+------------+
     """
+    df = container.dataframe
+    column_map = container.column_map
+
     row_number_snowpark_column_name = make_column_names_snowpark_compatible(
-        [row_number_column_name], plan_id, len(
+        [row_number_column_name], plan_id, len(column_map.get_spark_columns())
     )[0]
     row_number_snowpark_column = (
         snowpark_fn.row_number()
         .over(
             snowpark.window.Window.partition_by(
-                *
+                *column_map.get_snowpark_columns()
             ).order_by(snowpark_fn.lit(1))
         )
        .alias(row_number_snowpark_column_name)
     )

     df_with_partition_number = df.select(
-        *
+        *column_map.get_snowpark_columns(), row_number_snowpark_column
     )
     return df_with_partition_number

@@ -197,7 +203,7 @@ def get_semantic_string(rel: relation_proto.Relation) -> str:
     """
     queries = [
         query
-        for query_list in map_relation(rel)._plan.execution_queries.values()
+        for query_list in map_relation(rel).dataframe._plan.execution_queries.values()
         for query in query_list
     ]

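get_df_with_partition_row_number now pulls the Snowpark column names out of the container's column map before building the window expression. The expression itself is plain Snowpark; a standalone sketch of the same pattern (the ordering-by-a-constant trick mirrors the hunk above, the function and column names here are illustrative):

from snowflake.snowpark import DataFrame, Window
from snowflake.snowpark.functions import lit, row_number


def add_partition_row_number(
    df: DataFrame, partition_cols: list[str], out_col: str = "ROW_NUMBER_ILLUSTRATION"
) -> DataFrame:
    # Number rows within each partition; ordering by a constant gives an
    # arbitrary but complete numbering, which is all the helper above needs.
    rn = (
        row_number()
        .over(Window.partition_by(*partition_cols).order_by(lit(1)))
        .alias(out_col)
    )
    return df.select(*partition_cols, rn)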
snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py

@@ -10,6 +10,7 @@ import snowflake.snowpark
 from snowflake import snowpark
 from snowflake.snowpark import DataFrameWriter
 from snowflake.snowpark.dataframe import DataFrame
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.read import jdbc_read_dbapi
 from snowflake.snowpark_connect.relation.read.jdbc_read_dbapi import JdbcDialect
 from snowflake.snowpark_connect.relation.read.utils import Connection
@@ -36,7 +37,7 @@ class JdbcDataFrameWriter(DataFrameWriter):

     def jdbc_write_dbapi(
         self,
-
+        container: DataFrameContainer,
         create_connection: Callable[[dict[str, str]], "Connection"],
         close_connection: Callable[[Connection], None],
         table: str,
@@ -46,6 +47,7 @@ class JdbcDataFrameWriter(DataFrameWriter):
         Write a Snowpark Dataframe data into table of a JDBC datasource.
         """

+        input_df = container.dataframe
         conn = create_connection(self.jdbc_options)
         try:
             url = self.jdbc_options.get("url", None)
@@ -53,32 +55,32 @@ class JdbcDataFrameWriter(DataFrameWriter):

             table_exist = self._does_table_exist(conn, table)
             insert_query = self._generate_insert_query(
-
+                container,
                 table,
             )

             match write_mode:
                 case "append":
                     if not table_exist:
-                        self._create_table(conn, table,
+                        self._create_table(conn, table, container, jdbc_dialect)
                 case "errorifexists":
                     if table_exist:
                         raise ValueError(
                             "table is already exist and write mode is ERROR_IF_EXISTS"
                         )
                     else:
-                        self._create_table(conn, table,
+                        self._create_table(conn, table, container, jdbc_dialect)
                 case "overwrite":
                     if table_exist:
                         self._drop_table(conn, table)
-                    self._create_table(conn, table,
+                    self._create_table(conn, table, container, jdbc_dialect)
                 case "ignore":
                     if table_exist:
                         # With Ignore write mode, if table already exists, the save operation is expected
                         # to not save the contents of the DataFrame and to not change the existing data.
                         return
                     else:
-                        self._create_table(conn, table,
+                        self._create_table(conn, table, container, jdbc_dialect)
                 case _:
                     raise ValueError(f"Invalid write mode value{write_mode}")

@@ -92,14 +94,14 @@ class JdbcDataFrameWriter(DataFrameWriter):
         finally:
             close_connection(conn)

-    def _generate_insert_query(self,
+    def _generate_insert_query(self, container: DataFrameContainer, table: str) -> str:
         """
         Generates INSERT statement with placeholders.
-        :param
+        :param container: Snowpark dataframe container
         :param table: JDBC datasource table name
         :return: INSERT SQL statement
         """
-        true_names =
+        true_names = container.column_map.get_spark_columns()
         # quote each column name to match PySpark's case-sensitive column naming behavior.
         quoted_column_names = ",".join([f'"{col}"' for col in true_names])
         place_holders = ",".join(["?"] * len(true_names))
@@ -145,7 +147,7 @@ class JdbcDataFrameWriter(DataFrameWriter):
         self,
         conn: Connection,
         table: str,
-
+        container,
         jdbc_dialect: JdbcDialect,
     ) -> None:
         """
@@ -154,14 +156,15 @@ class JdbcDataFrameWriter(DataFrameWriter):

         :param conn: A Python DBAPI connection over JDBC connection
         :param table: DBC datasource table name
-        :param
+        :param container: Snowpark dataframe container
         :param jdbc_dialect: JDBC specific dialect
         :return: None
         """
+        input_df = container.dataframe
         columns_str = ""
         fields = input_df.schema.fields
         total_columns = len(fields)
-        column_map =
+        column_map = container.column_map

         column_index = 0
         for field in fields:
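_generate_insert_query now takes the Spark-side names from container.column_map.get_spark_columns(), double-quotes each one to preserve PySpark's case-sensitive naming, and emits one "?" placeholder per column. The full INSERT template is outside this diff, so the statement shape below is assumed; only the quoting and placeholder construction comes from the hunk above:

# Example column and table names; not taken from the diff.
spark_columns = ["id", "orderDate", "amount"]
table = "ORDERS"

quoted_column_names = ",".join([f'"{col}"' for col in spark_columns])
place_holders = ",".join(["?"] * len(spark_columns))

# Assumed INSERT shape, for illustration only.
insert_query = f"INSERT INTO {table} ({quoted_column_names}) VALUES ({place_holders})"
print(insert_query)
# INSERT INTO ORDERS ("id","orderDate","amount") VALUES (?,?,?)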
snowflake/snowpark_connect/relation/write/map_write.py

@@ -9,20 +9,30 @@ from pathlib import Path
 import pyspark.sql.connect.proto.base_pb2 as proto_base
 import pyspark.sql.connect.proto.commands_pb2 as commands_proto
 from pyspark.errors.exceptions.base import AnalysisException
-from pyspark.sql.connect.types import StructType

 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     quote_name_without_upper_casing,
     unquote_if_quoted,
 )
+from snowflake.snowpark.exceptions import SnowparkSQLException
 from snowflake.snowpark.functions import col, lit, object_construct
+from snowflake.snowpark.types import (
+    ArrayType,
+    DataType,
+    DateType,
+    MapType,
+    StringType,
+    StructType,
+    TimestampType,
+    _NumericType,
+)
 from snowflake.snowpark_connect.config import (
-    auto_uppercase_ddl,
     global_config,
     sessions_config,
     str_to_bool,
 )
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.io_utils import (
     convert_file_prefix_path,
     is_cloud_path,
@@ -32,16 +42,19 @@ from snowflake.snowpark_connect.relation.read.reader_config import CsvWriterConf
 from snowflake.snowpark_connect.relation.stage_locator import get_paths_from_stage
 from snowflake.snowpark_connect.relation.utils import random_string
 from snowflake.snowpark_connect.type_mapping import snowpark_to_iceberg_type
-from snowflake.snowpark_connect.utils.
+from snowflake.snowpark_connect.utils.context import get_session_id
+from snowflake.snowpark_connect.utils.identifiers import (
+    spark_to_sf_single_id,
     split_fully_qualified_spark_name,
 )
-from snowflake.snowpark_connect.utils.context import get_session_id
 from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
     telemetry,
 )

+_column_order_for_write = "name"
+

 # TODO: We will revise/refactor this after changes for all formats are finalized.
 def clean_params(params):
@@ -85,14 +98,9 @@ def get_param_from_options(params, options, source):
         params["format_type_options"]["NULL_IF"] = options["nullValue"]


-def _spark_to_snowflake_single_id(name: str) -> str:
-    name = quote_name_without_upper_casing(name)
-    return name.upper() if auto_uppercase_ddl() else name
-
-
 def _spark_to_snowflake(multipart_id: str) -> str:
     return ".".join(
-
+        spark_to_sf_single_id(part)
         for part in split_fully_qualified_spark_name(multipart_id)
     )
@@ -115,9 +123,8 @@ def map_write(request: proto_base.ExecutePlanRequest):
         case commands_proto.WriteOperation.SaveMode.SAVE_MODE_IGNORE:
            write_mode = "ignore"

-
-
-    )
+    result = map_relation(write_op.input)
+    input_df: snowpark.DataFrame = handle_column_names(result, write_op.source)
     session: snowpark.Session = get_or_create_snowpark_session()

     # Snowflake saveAsTable doesn't support format
@@ -198,7 +205,7 @@ def map_write(request: proto_base.ExecutePlanRequest):
                 options = dict(write_op.options)
                 if write_mode is None:
                     write_mode = "errorifexists"
-                map_write_jdbc(
+                map_write_jdbc(result, session, options, write_mode)
             case "iceberg":
                 table_name = (
                     write_op.path
@@ -220,7 +227,14 @@ def map_write(request: proto_base.ExecutePlanRequest):
                     snowpark_session=session,
                 )
                 write_mode = "append"
-
+
+                _validate_schema_and_get_writer(
+                    input_df, write_mode, snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode=write_mode,
+                    column_order=_column_order_for_write,
+                )
             case _:
                 snowpark_table_name = _spark_to_snowflake(write_op.table.table_name)

@@ -228,17 +242,23 @@ def map_write(request: proto_base.ExecutePlanRequest):
                     write_op.table.save_method
                     == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE
                 ):
-
+                    _validate_schema_and_get_writer(
+                        input_df, write_mode, snowpark_table_name
+                    ).saveAsTable(
                         table_name=snowpark_table_name,
                         mode=write_mode,
+                        column_order=_column_order_for_write,
                     )
                 elif (
                     write_op.table.save_method
                     == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO
                 ):
-
+                    _validate_schema_and_get_writer(
+                        input_df, write_mode, snowpark_table_name
+                    ).saveAsTable(
                         table_name=snowpark_table_name,
                         mode=write_mode or "append",
+                        column_order=_column_order_for_write,
                     )
                 else:
                     raise SnowparkConnectNotImplementedError(
@@ -265,10 +285,8 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):
     )

     snowpark_table_name = _spark_to_snowflake(write_op.table_name)
-
-    input_df: snowpark.DataFrame = handle_column_names(
-        map_relation(write_op.input), "table"
-    )
+    result = map_relation(write_op.input)
+    input_df: snowpark.DataFrame = handle_column_names(result, "table")
     session: snowpark.Session = get_or_create_snowpark_session()

     if write_op.table_name is None or write_op.table_name == "":
@@ -304,18 +322,163 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):
             schema=input_df.schema,
             snowpark_session=session,
         )
-
-
+        _validate_schema_and_get_writer(
+            input_df, write_mode, snowpark_table_name
+        ).saveAsTable(
             table_name=snowpark_table_name,
             mode="append",
+            column_order=_column_order_for_write,
         )
     else:
-
+        _validate_schema_and_get_writer(
+            input_df, write_mode, snowpark_table_name
+        ).saveAsTable(
             table_name=snowpark_table_name,
             mode=write_mode,
+            column_order=_column_order_for_write,
         )


+def _validate_schema_and_get_writer(
+    input_df: snowpark.DataFrame, write_mode: str, snowpark_table_name: str
+) -> snowpark.DataFrameWriter:
+    if write_mode == "overwrite":
+        return input_df.write
+
+    table_schema = None
+    try:
+        table_schema = (
+            get_or_create_snowpark_session().table(snowpark_table_name).schema
+        )
+    except SnowparkSQLException as e:
+        msg = e.message
+        if "SQL compilation error" in msg and "does not exist" in msg:
+            pass
+        else:
+            raise e
+
+    if table_schema is None:
+        # If table does not exist, we can skip the schema validation
+        return input_df.write
+
+    _validate_schema_for_append(table_schema, input_df.schema, snowpark_table_name)
+
+    # if table exists and case sensitivity is not enabled, we need to rename the columns to match existing table schema
+    if not global_config.spark_sql_caseSensitive:
+
+        for field in input_df.schema.fields:
+            # Find the matching field in the table schema (case-insensitive)
+            col_name = field.name
+            renamed = col_name
+            matching_field = next(
+                (f for f in table_schema.fields if f.name.lower() == col_name.lower()),
+                None,
+            )
+            if matching_field is not None and matching_field != col_name:
+                renamed = matching_field.name
+                input_df = input_df.withColumnRenamed(col_name, renamed)
+            # Cast column if type does not match
+
+            if field.datatype != matching_field.datatype:
+                if isinstance(matching_field.datatype, StructType):
+                    input_df = input_df.withColumn(
+                        renamed,
+                        col(renamed).cast(matching_field.datatype, rename_fields=True),
+                    )
+                else:
+                    input_df = input_df.withColumn(
+                        renamed, col(renamed).cast(matching_field.datatype)
+                    )
+    return input_df.write
+
+
+def _validate_schema_for_append(
+    table_schema: DataType, data_schema: DataType, snowpark_table_name: str
+):
+    match (table_schema, data_schema):
+        case (_, _) if table_schema == data_schema:
+            return
+
+        case (StructType() as table_struct, StructType() as data_struct):
+
+            def _comparable_col_name(col: str) -> str:
+                return col if global_config.spark_sql_caseSensitive else col.lower()
+
+            def invalid_struct_schema():
+                raise AnalysisException(
+                    f"Cannot resolve columns for the existing table {snowpark_table_name} ({table_schema.simple_string()}) with the data schema ({data_schema.simple_string()})."
+                )
+
+            if len(table_struct.fields) != len(data_struct.fields):
+                raise AnalysisException(
+                    f"The column number of the existing table {snowpark_table_name} ({table_schema.simple_string()}) doesn't match the data schema ({data_schema.simple_string()}).)"
+                )
+
+            table_field_names = {
+                _comparable_col_name(field.name) for field in table_struct.fields
+            }
+            data_field_names = {
+                _comparable_col_name(field.name) for field in data_struct.fields
+            }
+
+            if table_field_names != data_field_names:
+                invalid_struct_schema()
+
+            for data_field in data_struct.fields:
+                matching_table_field = next(
+                    (
+                        f
+                        for f in table_struct.fields
+                        if _comparable_col_name(f.name)
+                        == _comparable_col_name(data_field.name)
+                    ),
+                    None,
+                )
+
+                if matching_table_field is None:
+                    invalid_struct_schema()
+                else:
+                    _validate_schema_for_append(
+                        matching_table_field.datatype,
+                        data_field.datatype,
+                        snowpark_table_name,
+                    )
+
+            return
+
+        case (StringType(), _) if not isinstance(
+            data_schema, (StructType, ArrayType, MapType, TimestampType, DateType)
+        ):
+            return
+
+        case (_, _) if isinstance(table_schema, _NumericType) and isinstance(
+            data_schema, _NumericType
+        ):
+            return
+
+        case (ArrayType() as table_array, ArrayType() as data_array):
+            _validate_schema_for_append(
+                table_array.element_type, data_array.element_type, snowpark_table_name
+            )
+
+        case (MapType() as table_map, MapType() as data_map):
+            _validate_schema_for_append(
+                table_map.key_type, data_map.key_type, snowpark_table_name
+            )
+            _validate_schema_for_append(
+                table_map.value_type, data_map.value_type, snowpark_table_name
+            )
+
+        case (TimestampType(), _) if isinstance(data_schema, (DateType, TimestampType)):
+            return
+        case (DateType(), _) if isinstance(data_schema, (DateType, TimestampType)):
+            return
+        case (_, _):
+            raise AnalysisException(
+                f"[INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_SAFELY_CAST] Cannot write incompatible data for the table {snowpark_table_name}: Cannot safely cast {data_schema.simple_string()} to {table_schema.simple_string()}"
+            )
+
+
 def create_iceberg_table(
     snowpark_table_name: str,
     location: str,
@@ -323,7 +486,7 @@ def create_iceberg_table(
     snowpark_session: snowpark.Session,
 ):
     table_schema = [
-        f"{
+        f"{spark_to_sf_single_id(unquote_if_quoted(field.name), is_column = True)} {snowpark_to_iceberg_type(field.datatype)}"
         for field in schema.fields
     ]

@@ -374,26 +537,22 @@ def rewrite_df(input_df: snowpark.DataFrame, source: str) -> snowpark.DataFrame:
     return rewritten_df.select(object_construct(*construct_key_values))


-def handle_column_names(
+def handle_column_names(
+    container: DataFrameContainer, source: str
+) -> snowpark.DataFrame:
     """
-    Handle column names.
-
-    Quote column name in these scenarios:
-    0. Not write to table
-    1. Customer enabled case sensitivity in config
+    Handle column names before write so they match spark schema.
     """
-
+    df = container.dataframe
+    if source == "jdbc":
        # don't change column names for jdbc sources as we directly use spark column names for writing to the destination tables.
        return df
-    column_map =
-
-    for column in
-
-
+    column_map = container.column_map
+
+    for column in column_map.columns:
+        df = df.withColumnRenamed(
+            column.snowpark_name, quote_name_without_upper_casing(column.spark_name)
         )
-    if source in ("csv", "parquet", "json") or case_sensitive:
-        spark_column_name = f'"{spark_column_name}"'
-        df = df.withColumnRenamed(column, spark_column_name)
     return df

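The new _validate_schema_and_get_writer checks the incoming schema against the existing table before any non-overwrite save, and _validate_schema_for_append encodes the compatibility rules: exact matches, numeric-to-numeric, DATE/TIMESTAMP interchange, most scalars into STRING, and element-wise checks for arrays, maps, and nested structs pass; anything else raises AnalysisException. A simplified, self-contained paraphrase of those rules (returning a boolean instead of raising, and leaving out the struct field-count and field-name checks):

from snowflake.snowpark.types import (
    ArrayType,
    DataType,
    DateType,
    DoubleType,
    IntegerType,
    MapType,
    StringType,
    StructType,
    TimestampType,
    _NumericType,
)


def can_append(table_type: DataType, data_type: DataType) -> bool:
    # Exact match is always fine.
    if table_type == data_type:
        return True
    # Any numeric type can be appended to any numeric column.
    if isinstance(table_type, _NumericType) and isinstance(data_type, _NumericType):
        return True
    # Most scalar types can be written into a STRING column.
    if isinstance(table_type, StringType) and not isinstance(
        data_type, (StructType, ArrayType, MapType, TimestampType, DateType)
    ):
        return True
    # DATE and TIMESTAMP are interchangeable for append.
    if isinstance(table_type, (TimestampType, DateType)) and isinstance(
        data_type, (TimestampType, DateType)
    ):
        return True
    # Containers are checked element-wise.
    if isinstance(table_type, ArrayType) and isinstance(data_type, ArrayType):
        return can_append(table_type.element_type, data_type.element_type)
    if isinstance(table_type, MapType) and isinstance(data_type, MapType):
        return can_append(table_type.key_type, data_type.key_type) and can_append(
            table_type.value_type, data_type.value_type
        )
    return False


print(can_append(IntegerType(), DoubleType()))                        # True
print(can_append(TimestampType(), DateType()))                        # True
print(can_append(ArrayType(IntegerType()), ArrayType(StringType())))  # False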
snowflake/snowpark_connect/relation/write/map_write_jdbc.py

@@ -3,6 +3,7 @@
 #

 from snowflake import snowpark
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.read.map_read_jdbc import (
     close_connection,
     create_connection,
@@ -14,7 +15,7 @@ from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger


 def map_write_jdbc(
-
+    container: DataFrameContainer,
     session: snowpark.Session,
     options: dict[str, str],
     write_mode: str,
@@ -38,7 +39,7 @@ def map_write_jdbc(

     try:
         JdbcDataFrameWriter(session, jdbc_options).jdbc_write_dbapi(
-
+            container,
             create_connection,
             close_connection,
             table=dbtable,
snowflake/snowpark_connect/server.py

@@ -112,10 +112,38 @@ _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE = 128 * 1024 * 1024
 _SPARK_CONNECT_GRPC_MAX_METADATA_SIZE = 64 * 1024  # 64kb


+def _sanitize_file_paths(text: str) -> str:
+    """
+    Sanitize file paths in error messages by replacing them with placeholders.
+    Only matches actual file paths, not module names or class names.
+    """
+    import re
+
+    # Pattern to match file paths in traceback "File" lines only
+    # This targets the specific format: File "/path/to/file.py", line XX
+    file_line_pattern = r'(File\s+["\'])([^"\']+)(["\'],\s+line\s+\d+)'
+
+    def replace_file_path(match):
+        return f"{match.group(1)}<redacted_file_path>{match.group(3)}"
+
+    return re.sub(file_line_pattern, replace_file_path, text)
+
+
 def _handle_exception(context, e: Exception):
     import traceback

-    traceback.print_exc()
+    # traceback.print_exc()
+    # SNOWFLAKE_SHOW_ERROR_TRACE controls sanitized traceback printing (default: false)
+    show_traceback = os.getenv("SNOWFLAKE_SHOW_ERROR_TRACE", "false").lower() == "true"
+
+    if show_traceback:
+        # Show detailed traceback (includes error info naturally)
+        error_traceback = traceback.format_exc()
+        sanitized_traceback = _sanitize_file_paths(error_traceback)
+        logger.error(sanitized_traceback)
+    else:
+        # Show only basic error information, no traceback
+        logger.error("Error: %s - %s", type(e).__name__, str(e))

     telemetry.report_request_failure(e)

@@ -195,12 +223,13 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
         telemetry.initialize_request_summary(request)
         match request.WhichOneof("analyze"):
             case "schema":
-
+                result = map_relation(request.schema.plan.root)
+                snowpark_df = result.dataframe
                 snowpark_schema: snowpark.types.StructType = snowpark_df.schema
                 schema = proto_base.AnalyzePlanResponse.Schema(
                     schema=types_proto.DataType(
                         **snowpark_to_proto_type(
-                            snowpark_schema,
+                            snowpark_schema, result.column_map, snowpark_df
                         )
                     )
                 )
@@ -262,7 +291,8 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
                 # Snowflake only exposes simplified execution plans, similar to Spark's optimized logical plans.
                 # Snowpark provides the execution plan IFF the dataframe maps to a single query.
                 # TODO: Do we need to return a Spark-like plan?
-
+                result = map_relation(request.explain.plan.root)
+                snowpark_df = result.dataframe
                 return proto_base.AnalyzePlanResponse(
                     session_id=request.session_id,
                     explain=proto_base.AnalyzePlanResponse.Explain(
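The traceback redaction in server.py rests on a single regex that only matches the path inside a traceback File "..." line, so module and class names elsewhere in the error text are untouched. A quick standalone check of that pattern (the sample traceback text is made up):

import re

# The pattern added in server.py, copied verbatim.
file_line_pattern = r'(File\s+["\'])([^"\']+)(["\'],\s+line\s+\d+)'

sample = (
    "Traceback (most recent call last):\n"
    '  File "/home/user/app/job.py", line 42, in run\n'
    '    raise ValueError("boom")'
)

redacted = re.sub(
    file_line_pattern,
    lambda m: f"{m.group(1)}<redacted_file_path>{m.group(3)}",
    sample,
)
print(redacted)
# Traceback (most recent call last):
#   File "<redacted_file_path>", line 42, in run
#     raise ValueError("boom")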