snowpark-connect 0.20.2__py3-none-any.whl → 0.22.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +3 -2
- snowflake/snowpark_connect/column_name_handler.py +6 -65
- snowflake/snowpark_connect/config.py +47 -17
- snowflake/snowpark_connect/dataframe_container.py +242 -0
- snowflake/snowpark_connect/error/error_utils.py +25 -0
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +13 -23
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +9 -5
- snowflake/snowpark_connect/expression/map_extension.py +2 -1
- snowflake/snowpark_connect/expression/map_udf.py +4 -4
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +8 -7
- snowflake/snowpark_connect/expression/map_unresolved_function.py +481 -170
- snowflake/snowpark_connect/expression/map_unresolved_star.py +8 -8
- snowflake/snowpark_connect/expression/map_update_fields.py +1 -1
- snowflake/snowpark_connect/expression/typer.py +6 -6
- snowflake/snowpark_connect/proto/control_pb2.py +17 -16
- snowflake/snowpark_connect/proto/control_pb2.pyi +17 -17
- snowflake/snowpark_connect/proto/control_pb2_grpc.py +12 -63
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +15 -14
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +19 -14
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +4 -0
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +27 -26
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +74 -68
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +4 -0
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +5 -5
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +25 -17
- snowflake/snowpark_connect/relation/map_aggregate.py +170 -61
- snowflake/snowpark_connect/relation/map_catalog.py +2 -2
- snowflake/snowpark_connect/relation/map_column_ops.py +227 -145
- snowflake/snowpark_connect/relation/map_crosstab.py +25 -6
- snowflake/snowpark_connect/relation/map_extension.py +81 -56
- snowflake/snowpark_connect/relation/map_join.py +72 -63
- snowflake/snowpark_connect/relation/map_local_relation.py +35 -20
- snowflake/snowpark_connect/relation/map_map_partitions.py +24 -17
- snowflake/snowpark_connect/relation/map_relation.py +22 -16
- snowflake/snowpark_connect/relation/map_row_ops.py +232 -146
- snowflake/snowpark_connect/relation/map_sample_by.py +15 -8
- snowflake/snowpark_connect/relation/map_show_string.py +42 -5
- snowflake/snowpark_connect/relation/map_sql.py +141 -237
- snowflake/snowpark_connect/relation/map_stats.py +88 -39
- snowflake/snowpark_connect/relation/map_subquery_alias.py +13 -14
- snowflake/snowpark_connect/relation/map_udtf.py +10 -13
- snowflake/snowpark_connect/relation/read/map_read.py +8 -3
- snowflake/snowpark_connect/relation/read/map_read_csv.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_json.py +19 -8
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_socket.py +7 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +25 -16
- snowflake/snowpark_connect/relation/read/map_read_text.py +7 -7
- snowflake/snowpark_connect/relation/read/reader_config.py +1 -0
- snowflake/snowpark_connect/relation/utils.py +11 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +15 -12
- snowflake/snowpark_connect/relation/write/map_write.py +259 -56
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +3 -2
- snowflake/snowpark_connect/server.py +43 -4
- snowflake/snowpark_connect/type_mapping.py +6 -23
- snowflake/snowpark_connect/utils/cache.py +27 -22
- snowflake/snowpark_connect/utils/context.py +33 -17
- snowflake/snowpark_connect/utils/describe_query_cache.py +2 -9
- snowflake/snowpark_connect/utils/{attribute_handling.py → identifiers.py} +47 -0
- snowflake/snowpark_connect/utils/session.py +41 -38
- snowflake/snowpark_connect/utils/telemetry.py +214 -63
- snowflake/snowpark_connect/utils/udxf_import_utils.py +14 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/__init__.py +0 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.py +36 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.pyi +156 -0
- snowflake/snowpark_decoder/dp_session.py +111 -0
- snowflake/snowpark_decoder/spark_decoder.py +76 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/METADATA +6 -4
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/RECORD +83 -69
- snowpark_connect-0.22.1.dist-info/licenses/LICENSE-binary +568 -0
- snowpark_connect-0.22.1.dist-info/licenses/NOTICE-binary +1533 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/top_level.txt +1 -0
- spark/__init__.py +0 -0
- spark/connect/__init__.py +0 -0
- spark/connect/envelope_pb2.py +31 -0
- spark/connect/envelope_pb2.pyi +46 -0
- snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.22.1.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.22.1.dist-info}/licenses/LICENSE.txt +0 -0
snowflake/snowpark_connect/relation/write/map_write.py

@@ -9,20 +9,30 @@ from pathlib import Path
 import pyspark.sql.connect.proto.base_pb2 as proto_base
 import pyspark.sql.connect.proto.commands_pb2 as commands_proto
 from pyspark.errors.exceptions.base import AnalysisException
-from pyspark.sql.connect.types import StructType

 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     quote_name_without_upper_casing,
     unquote_if_quoted,
 )
+from snowflake.snowpark.exceptions import SnowparkSQLException
 from snowflake.snowpark.functions import col, lit, object_construct
+from snowflake.snowpark.types import (
+    ArrayType,
+    DataType,
+    DateType,
+    MapType,
+    StringType,
+    StructType,
+    TimestampType,
+    _NumericType,
+)
 from snowflake.snowpark_connect.config import (
-    auto_uppercase_ddl,
     global_config,
     sessions_config,
     str_to_bool,
 )
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.io_utils import (
     convert_file_prefix_path,
     is_cloud_path,

@@ -32,16 +42,19 @@ from snowflake.snowpark_connect.relation.read.reader_config import CsvWriterConf
 from snowflake.snowpark_connect.relation.stage_locator import get_paths_from_stage
 from snowflake.snowpark_connect.relation.utils import random_string
 from snowflake.snowpark_connect.type_mapping import snowpark_to_iceberg_type
-from snowflake.snowpark_connect.utils.
+from snowflake.snowpark_connect.utils.context import get_session_id
+from snowflake.snowpark_connect.utils.identifiers import (
+    spark_to_sf_single_id,
     split_fully_qualified_spark_name,
 )
-from snowflake.snowpark_connect.utils.context import get_session_id
 from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
     telemetry,
 )

+_column_order_for_write = "name"
+

 # TODO: We will revise/refactor this after changes for all formats are finalized.
 def clean_params(params):

@@ -85,14 +98,9 @@ def get_param_from_options(params, options, source):
         params["format_type_options"]["NULL_IF"] = options["nullValue"]


-def _spark_to_snowflake_single_id(name: str) -> str:
-    name = quote_name_without_upper_casing(name)
-    return name.upper() if auto_uppercase_ddl() else name
-
-
 def _spark_to_snowflake(multipart_id: str) -> str:
     return ".".join(
-
+        spark_to_sf_single_id(part)
         for part in split_fully_qualified_spark_name(multipart_id)
     )


@@ -115,9 +123,8 @@ def map_write(request: proto_base.ExecutePlanRequest):
         case commands_proto.WriteOperation.SaveMode.SAVE_MODE_IGNORE:
             write_mode = "ignore"

-
-
-    )
+    result = map_relation(write_op.input)
+    input_df: snowpark.DataFrame = handle_column_names(result, write_op.source)
     session: snowpark.Session = get_or_create_snowpark_session()

     # Snowflake saveAsTable doesn't support format

@@ -198,7 +205,7 @@ def map_write(request: proto_base.ExecutePlanRequest):
             options = dict(write_op.options)
             if write_mode is None:
                 write_mode = "errorifexists"
-            map_write_jdbc(
+            map_write_jdbc(result, session, options, write_mode)
         case "iceberg":
             table_name = (
                 write_op.path

@@ -207,20 +214,71 @@ def map_write(request: proto_base.ExecutePlanRequest):
             )
             snowpark_table_name = _spark_to_snowflake(table_name)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            match write_mode:
+                case None | "error" | "errorifexists":
+                    if check_snowflake_table_existence(snowpark_table_name, session):
+                        raise AnalysisException(
+                            f"Table {snowpark_table_name} already exists"
+                        )
+                    create_iceberg_table(
+                        snowpark_table_name=snowpark_table_name,
+                        location=write_op.options.get("location", None),
+                        schema=input_df.schema,
+                        snowpark_session=session,
+                    )
+                    _validate_schema_and_get_writer(
+                        input_df, "append", snowpark_table_name
+                    ).saveAsTable(
+                        table_name=snowpark_table_name,
+                        mode="append",
+                        column_order=_column_order_for_write,
+                    )
+                case "append":
+                    _validate_schema_and_get_writer(
+                        input_df, "append", snowpark_table_name
+                    ).saveAsTable(
+                        table_name=snowpark_table_name,
+                        mode="append",
+                        column_order=_column_order_for_write,
+                    )
+                case "ignore":
+                    if not check_snowflake_table_existence(
+                        snowpark_table_name, session
+                    ):
+                        create_iceberg_table(
+                            snowpark_table_name=snowpark_table_name,
+                            location=write_op.options.get("location", None),
+                            schema=input_df.schema,
+                            snowpark_session=session,
+                        )
+                    _validate_schema_and_get_writer(
+                        input_df, "append", snowpark_table_name
+                    ).saveAsTable(
+                        table_name=snowpark_table_name,
+                        mode="append",
+                        column_order=_column_order_for_write,
+                    )
+                case "overwrite":
+                    if check_snowflake_table_existence(snowpark_table_name, session):
+                        session.sql(f"DELETE FROM {snowpark_table_name}").collect()
+                    else:
+                        create_iceberg_table(
+                            snowpark_table_name=snowpark_table_name,
+                            location=write_op.options.get("location", None),
+                            schema=input_df.schema,
+                            snowpark_session=session,
+                        )
+                    _validate_schema_and_get_writer(
+                        input_df, "append", snowpark_table_name
+                    ).saveAsTable(
+                        table_name=snowpark_table_name,
+                        mode="append",
+                        column_order=_column_order_for_write,
+                    )
+                case _:
+                    raise SnowparkConnectNotImplementedError(
+                        f"Write mode {write_mode} is not supported"
+                    )
         case _:
             snowpark_table_name = _spark_to_snowflake(write_op.table.table_name)


@@ -228,17 +286,23 @@ def map_write(request: proto_base.ExecutePlanRequest):
                 write_op.table.save_method
                 == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE
             ):
-
+                _validate_schema_and_get_writer(
+                    input_df, write_mode, snowpark_table_name
+                ).saveAsTable(
                     table_name=snowpark_table_name,
                     mode=write_mode,
+                    column_order=_column_order_for_write,
                 )
             elif (
                 write_op.table.save_method
                 == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO
             ):
-
+                _validate_schema_and_get_writer(
+                    input_df, write_mode, snowpark_table_name
+                ).saveAsTable(
                     table_name=snowpark_table_name,
                     mode=write_mode or "append",
+                    column_order=_column_order_for_write,
                 )
             else:
                 raise SnowparkConnectNotImplementedError(

@@ -265,10 +329,8 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):
     )

     snowpark_table_name = _spark_to_snowflake(write_op.table_name)
-
-    input_df: snowpark.DataFrame = handle_column_names(
-        map_relation(write_op.input), "table"
-    )
+    result = map_relation(write_op.input)
+    input_df: snowpark.DataFrame = handle_column_names(result, "table")
     session: snowpark.Session = get_or_create_snowpark_session()

     if write_op.table_name is None or write_op.table_name == "":

@@ -281,14 +343,14 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):
         commands_proto.WriteOperationV2.MODE_OVERWRITE,
         commands_proto.WriteOperationV2.MODE_APPEND,
     ):
-        if not
+        if not check_snowflake_table_existence(snowpark_table_name, session):
             raise AnalysisException(
                 f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{write_op.table_name}` cannot be found. "
                 f"Verify the spelling and correctness of the schema and catalog.\n"
             )

     if write_op.provider.lower() == "iceberg":
-        if write_mode == "overwrite" and
+        if write_mode == "overwrite" and check_snowflake_table_existence(
             snowpark_table_name, session
         ):
             session.sql(f"DELETE FROM {snowpark_table_name}").collect()

@@ -304,16 +366,161 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):
                 schema=input_df.schema,
                 snowpark_session=session,
             )
-
-
+            _validate_schema_and_get_writer(
+                input_df, write_mode, snowpark_table_name
+            ).saveAsTable(
                 table_name=snowpark_table_name,
                 mode="append",
+                column_order=_column_order_for_write,
             )
         else:
-
+            _validate_schema_and_get_writer(
+                input_df, write_mode, snowpark_table_name
+            ).saveAsTable(
                 table_name=snowpark_table_name,
                 mode=write_mode,
+                column_order=_column_order_for_write,
+            )
+
+
+def _validate_schema_and_get_writer(
+    input_df: snowpark.DataFrame, write_mode: str, snowpark_table_name: str
+) -> snowpark.DataFrameWriter:
+    if write_mode == "overwrite":
+        return input_df.write
+
+    table_schema = None
+    try:
+        table_schema = (
+            get_or_create_snowpark_session().table(snowpark_table_name).schema
         )
+    except SnowparkSQLException as e:
+        msg = e.message
+        if "SQL compilation error" in msg and "does not exist" in msg:
+            pass
+        else:
+            raise e
+
+    if table_schema is None:
+        # If table does not exist, we can skip the schema validation
+        return input_df.write
+
+    _validate_schema_for_append(table_schema, input_df.schema, snowpark_table_name)
+
+    # if table exists and case sensitivity is not enabled, we need to rename the columns to match existing table schema
+    if not global_config.spark_sql_caseSensitive:
+
+        for field in input_df.schema.fields:
+            # Find the matching field in the table schema (case-insensitive)
+            col_name = field.name
+            renamed = col_name
+            matching_field = next(
+                (f for f in table_schema.fields if f.name.lower() == col_name.lower()),
+                None,
+            )
+            if matching_field is not None and matching_field != col_name:
+                renamed = matching_field.name
+                input_df = input_df.withColumnRenamed(col_name, renamed)
+            # Cast column if type does not match
+
+            if field.datatype != matching_field.datatype:
+                if isinstance(matching_field.datatype, StructType):
+                    input_df = input_df.withColumn(
+                        renamed,
+                        col(renamed).cast(matching_field.datatype, rename_fields=True),
+                    )
+                else:
+                    input_df = input_df.withColumn(
+                        renamed, col(renamed).cast(matching_field.datatype)
+                    )
+    return input_df.write
+
+
+def _validate_schema_for_append(
+    table_schema: DataType, data_schema: DataType, snowpark_table_name: str
+):
+    match (table_schema, data_schema):
+        case (_, _) if table_schema == data_schema:
+            return
+
+        case (StructType() as table_struct, StructType() as data_struct):
+
+            def _comparable_col_name(col: str) -> str:
+                return col if global_config.spark_sql_caseSensitive else col.lower()
+
+            def invalid_struct_schema():
+                raise AnalysisException(
+                    f"Cannot resolve columns for the existing table {snowpark_table_name} ({table_schema.simple_string()}) with the data schema ({data_schema.simple_string()})."
+                )
+
+            if len(table_struct.fields) != len(data_struct.fields):
+                raise AnalysisException(
+                    f"The column number of the existing table {snowpark_table_name} ({table_schema.simple_string()}) doesn't match the data schema ({data_schema.simple_string()}).)"
+                )
+
+            table_field_names = {
+                _comparable_col_name(field.name) for field in table_struct.fields
+            }
+            data_field_names = {
+                _comparable_col_name(field.name) for field in data_struct.fields
+            }
+
+            if table_field_names != data_field_names:
+                invalid_struct_schema()
+
+            for data_field in data_struct.fields:
+                matching_table_field = next(
+                    (
+                        f
+                        for f in table_struct.fields
+                        if _comparable_col_name(f.name)
+                        == _comparable_col_name(data_field.name)
+                    ),
+                    None,
+                )
+
+                if matching_table_field is None:
+                    invalid_struct_schema()
+                else:
+                    _validate_schema_for_append(
+                        matching_table_field.datatype,
+                        data_field.datatype,
+                        snowpark_table_name,
+                    )
+
+            return
+
+        case (StringType(), _) if not isinstance(
+            data_schema, (StructType, ArrayType, MapType, TimestampType, DateType)
+        ):
+            return
+
+        case (_, _) if isinstance(table_schema, _NumericType) and isinstance(
+            data_schema, _NumericType
+        ):
+            return
+
+        case (ArrayType() as table_array, ArrayType() as data_array):
+            _validate_schema_for_append(
+                table_array.element_type, data_array.element_type, snowpark_table_name
+            )
+
+        case (MapType() as table_map, MapType() as data_map):
+            _validate_schema_for_append(
+                table_map.key_type, data_map.key_type, snowpark_table_name
+            )
+            _validate_schema_for_append(
+                table_map.value_type, data_map.value_type, snowpark_table_name
+            )
+
+        case (TimestampType(), _) if isinstance(data_schema, (DateType, TimestampType)):
+            return
+        case (DateType(), _) if isinstance(data_schema, (DateType, TimestampType)):
+            return
+        case (_, _):
+            raise AnalysisException(
+                f"[INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_SAFELY_CAST] Cannot write incompatible data for the table {snowpark_table_name}: Cannot safely cast {data_schema.simple_string()} to {table_schema.simple_string()}"
+            )


 def create_iceberg_table(
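A minimal, hypothetical client-side sketch of the append path that _validate_schema_and_get_writer and _validate_schema_for_append guard above (connection URL, database, table and column names are placeholders, and spark.sql.caseSensitive is assumed to be false): columns that differ from the existing table only by case are renamed, castable type mismatches are cast, and incompatible schemas raise AnalysisException.

from pyspark.sql import SparkSession

# Illustrative only; not part of the package. Assumes a Snowpark Connect endpoint
# at sc://localhost:15002 and a writable demo_db.demo_schema namespace.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
spark.conf.set("spark.sql.caseSensitive", "false")

# Create the table with upper-case column names.
spark.createDataFrame([(1, "a")], ["ID", "LABEL"]).write.saveAsTable(
    "demo_db.demo_schema.events"
)

# Append with lower-case names: the writer matches ID/LABEL case-insensitively,
# renames the incoming columns to the table schema, and casts where needed.
spark.createDataFrame([(2, "b")], ["id", "label"]).write.mode("append").saveAsTable(
    "demo_db.demo_schema.events"
)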
@@ -323,7 +530,7 @@ def create_iceberg_table(
     snowpark_session: snowpark.Session,
 ):
     table_schema = [
-        f"{
+        f"{spark_to_sf_single_id(unquote_if_quoted(field.name), is_column = True)} {snowpark_to_iceberg_type(field.datatype)}"
         for field in schema.fields
     ]


@@ -374,26 +581,22 @@ def rewrite_df(input_df: snowpark.DataFrame, source: str) -> snowpark.DataFrame:
     return rewritten_df.select(object_construct(*construct_key_values))


-def handle_column_names(
+def handle_column_names(
+    container: DataFrameContainer, source: str
+) -> snowpark.DataFrame:
     """
-    Handle column names.
-
-    Quote column name in these scenarios:
-    0. Not write to table
-    1. Customer enabled case sensitivity in config
+    Handle column names before write so they match spark schema.
     """
-
+    df = container.dataframe
+    if source == "jdbc":
         # don't change column names for jdbc sources as we directly use spark column names for writing to the destination tables.
         return df
-    column_map =
-
-    for column in
-
-
+    column_map = container.column_map
+
+    for column in column_map.columns:
+        df = df.withColumnRenamed(
+            column.snowpark_name, quote_name_without_upper_casing(column.spark_name)
         )
-        if source in ("csv", "parquet", "json") or case_sensitive:
-            spark_column_name = f'"{spark_column_name}"'
-            df = df.withColumnRenamed(column, spark_column_name)
     return df


@@ -425,7 +628,7 @@ def _truncate_directory(directory_path: Path) -> None:
             shutil.rmtree(file)


-def
+def check_snowflake_table_existence(
     snowpark_table_name: str,
     snowpark_session: snowpark.Session,
 ):
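The iceberg branch of map_write now dispatches on the Spark save mode: error/errorifexists, append, ignore and overwrite are handled (overwrite truncates an existing table via DELETE FROM before appending), anything else raises SnowparkConnectNotImplementedError, and the "location" option is passed to create_iceberg_table. A hypothetical client-side call (endpoint, location value and table name are placeholders):

from pyspark.sql import SparkSession

# Illustrative only; not part of the package.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
df = spark.createDataFrame([(1, "2024-01-01")], ["id", "event_date"])

(
    df.write.format("iceberg")
    .option("location", "iceberg/events/")  # consumed by create_iceberg_table; placeholder value
    .mode("append")  # error/errorifexists, append, ignore and overwrite are handled; other modes raise
    .saveAsTable("demo_db.demo_schema.iceberg_events")
)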
snowflake/snowpark_connect/relation/write/map_write_jdbc.py

@@ -3,6 +3,7 @@
 #

 from snowflake import snowpark
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.read.map_read_jdbc import (
     close_connection,
     create_connection,

@@ -14,7 +15,7 @@ from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger


 def map_write_jdbc(
-
+    container: DataFrameContainer,
     session: snowpark.Session,
     options: dict[str, str],
     write_mode: str,

@@ -38,7 +39,7 @@ def map_write_jdbc(

     try:
         JdbcDataFrameWriter(session, jdbc_options).jdbc_write_dbapi(
-
+            container,
             create_connection,
             close_connection,
             table=dbtable,
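map_write_jdbc now receives the whole DataFrameContainer instead of a bare Snowpark DataFrame and forwards it to JdbcDataFrameWriter.jdbc_write_dbapi. A hypothetical client-side write that lands on this path (driver URL, credentials and table name are placeholders):

from pyspark.sql import SparkSession

# Illustrative only; standard Spark JDBC writer options with placeholder values.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

(
    spark.createDataFrame([(1, "a")], ["id", "val"])
    .write.format("jdbc")
    .option("url", "jdbc:postgresql://example-host:5432/demo")
    .option("dbtable", "public.target_table")
    .option("user", "demo_user")
    .option("password", "demo_password")
    .mode("append")
    .save()
)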
snowflake/snowpark_connect/server.py

@@ -112,10 +112,38 @@ _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE = 128 * 1024 * 1024
 _SPARK_CONNECT_GRPC_MAX_METADATA_SIZE = 64 * 1024  # 64kb


+def _sanitize_file_paths(text: str) -> str:
+    """
+    Sanitize file paths in error messages by replacing them with placeholders.
+    Only matches actual file paths, not module names or class names.
+    """
+    import re
+
+    # Pattern to match file paths in traceback "File" lines only
+    # This targets the specific format: File "/path/to/file.py", line XX
+    file_line_pattern = r'(File\s+["\'])([^"\']+)(["\'],\s+line\s+\d+)'
+
+    def replace_file_path(match):
+        return f"{match.group(1)}<redacted_file_path>{match.group(3)}"
+
+    return re.sub(file_line_pattern, replace_file_path, text)
+
+
 def _handle_exception(context, e: Exception):
     import traceback

-    traceback.print_exc()
+    # traceback.print_exc()
+    # SNOWFLAKE_SHOW_ERROR_TRACE controls sanitized traceback printing (default: false)
+    show_traceback = os.getenv("SNOWFLAKE_SHOW_ERROR_TRACE", "false").lower() == "true"
+
+    if show_traceback:
+        # Show detailed traceback (includes error info naturally)
+        error_traceback = traceback.format_exc()
+        sanitized_traceback = _sanitize_file_paths(error_traceback)
+        logger.error(sanitized_traceback)
+    else:
+        # Show only basic error information, no traceback
+        logger.error("Error: %s - %s", type(e).__name__, str(e))

     telemetry.report_request_failure(e)

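The redaction in _sanitize_file_paths only rewrites the File "...", line NN portions of a traceback, and the sanitized traceback is logged only when SNOWFLAKE_SHOW_ERROR_TRACE=true (it defaults to false, which logs just the exception type and message). A standalone check of the same regex on a made-up traceback:

import re

# Illustrative only; reuses the pattern from the hunk above on invented text.
file_line_pattern = r'(File\s+["\'])([^"\']+)(["\'],\s+line\s+\d+)'
sample = (
    "Traceback (most recent call last):\n"
    '  File "/home/user/jobs/etl.py", line 42, in run\n'
    '    raise ValueError("boom")\n'
    "ValueError: boom"
)
print(re.sub(file_line_pattern, r"\g<1><redacted_file_path>\g<3>", sample))
# The path collapses to <redacted_file_path>; the line number and the rest of the text survive.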
@@ -195,12 +223,13 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
         telemetry.initialize_request_summary(request)
         match request.WhichOneof("analyze"):
             case "schema":
-
+                result = map_relation(request.schema.plan.root)
+                snowpark_df = result.dataframe
                 snowpark_schema: snowpark.types.StructType = snowpark_df.schema
                 schema = proto_base.AnalyzePlanResponse.Schema(
                     schema=types_proto.DataType(
                         **snowpark_to_proto_type(
-                            snowpark_schema,
+                            snowpark_schema, result.column_map, snowpark_df
                         )
                     )
                 )

@@ -262,7 +291,8 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
         # Snowflake only exposes simplified execution plans, similar to Spark's optimized logical plans.
         # Snowpark provides the execution plan IFF the dataframe maps to a single query.
         # TODO: Do we need to return a Spark-like plan?
-
+        result = map_relation(request.explain.plan.root)
+        snowpark_df = result.dataframe
         return proto_base.AnalyzePlanResponse(
             session_id=request.session_id,
             explain=proto_base.AnalyzePlanResponse.Explain(

@@ -951,6 +981,7 @@ def start_session(
     stop_event: threading.Event = None,
     snowpark_session: Optional[snowpark.Session] = None,
     connection_parameters: Optional[Dict[str, str]] = None,
+    max_grpc_message_size: int = _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE,
 ) -> threading.Thread | None:
     """
     Starts Spark Connect server connected to Snowflake. No-op if the Server is already running.

@@ -973,6 +1004,14 @@ def start_session(
         provided, the `snowpark_session` parameter must be None.
     """
     try:
+        # Changing the value of our global variable based on the grpc message size provided by the user.
+        global _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE
+        _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE = max_grpc_message_size
+
+        from pyspark.sql.connect.client import ChannelBuilder
+
+        ChannelBuilder.MAX_MESSAGE_LENGTH = max_grpc_message_size
+
         if os.environ.get("SPARK_ENV_LOADED"):
             raise RuntimeError(
                 "Snowpark Connect cannot be run inside of a Spark environment"
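start_session now exposes the gRPC message-size cap (default 128 MiB via _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE) and mirrors the chosen value into pyspark's ChannelBuilder.MAX_MESSAGE_LENGTH so client and server agree. A hypothetical call, assuming every other start_session parameter keeps its default:

from snowflake.snowpark_connect.server import start_session

# Illustrative only; a real deployment would also pass its usual session/connection arguments.
start_session(max_grpc_message_size=256 * 1024 * 1024)  # raise the gRPC cap to 256 MiB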
snowflake/snowpark_connect/type_mapping.py

@@ -52,10 +52,6 @@ SNOWPARK_TYPE_NAME_TO_PYSPARK_TYPE_NAME = {
     snowpark.types.TimestampType.__name__: pyspark.sql.types.TimestampType.typeName(),
 }

-_STRUCT_MATCH_PATTERN = re.compile(r"struct<(.+)>", re.IGNORECASE)
-_STRUCT_REPLACE_PATTERN = re.compile(r"struct<[^>]*>", re.IGNORECASE)
-_MAP_REPLACE_PATTERN = re.compile(r"map<[^>]*>", re.IGNORECASE)
-

 @cache
 def _get_struct_type_class():

@@ -206,7 +202,7 @@ def snowpark_to_proto_type(
         if (
             metadata is None
             and df
-            and field.name in
+            and field.name in column_name_map.get_snowpark_columns()
         ):
             try:
                 # check for collision using expr_id

@@ -328,6 +324,8 @@ def cast_to_match_snowpark_type(
             return str(content)
         case snowpark.types.VariantType:
             return str(content)
+        case snowpark.types.TimestampType:
+            return str(content)
         case _:
             raise SnowparkConnectNotImplementedError(
                 f"Unsupported snowpark data type in casting: {data_type}"

@@ -499,7 +497,7 @@ def map_snowpark_types_to_pyarrow_types(
         return pa.string()
     if pa.types.is_struct(pa_type):
         return pa.struct(
-
+            [
                 pa.field(
                     field.name if not rename_struct_columns else str(i),
                     map_snowpark_types_to_pyarrow_types(

@@ -783,6 +781,8 @@ def map_simple_types(simple_type: str) -> snowpark.types.DataType:
             return snowpark.types.TimestampType()
         case "timestamp_ntz":
             return snowpark.types.TimestampType(snowpark.types.TimestampTimeZone.NTZ)
+        case "timestamp_ltz":
+            return snowpark.types.TimestampType(snowpark.types.TimestampTimeZone.LTZ)
         case "day_time_interval":
             # this is not a column type in snowflake so there won't be a dataframe column
             # with this, for now this type won't make any sense

@@ -869,23 +869,6 @@ def map_json_schema_to_snowpark(
         return map_simple_types(schema["type"])


-def _replace_complex_patterns(type_string):
-    # Check if entire string matches struct pattern "struct<col1 int, col2 int, col3 int, col4 int>"
-    type_string = re.sub(
-        r"decimal\s*\(\s*\d+\s*,\s*\d+\s*\)",
-        "decimal",
-        type_string,
-        flags=re.IGNORECASE,
-    )
-    struct_match = _STRUCT_MATCH_PATTERN.match(type_string)
-    if struct_match:
-        return struct_match.group(1).replace(":", " ")
-    # Replace 'struct<[^>]*>' with 'struct' and map<*> with map as we are only interested in column names.
-    type_string = _STRUCT_REPLACE_PATTERN.sub("struct", type_string)
-    type_string = _MAP_REPLACE_PATTERN.sub("map", type_string)
-    return type_string.replace(":", " ")
-
-
 def map_type_string_to_snowpark_type(type_string: str) -> snowpark.types.DataType:
     """
     Converts a pyspark type string like x: int or struct<x: int, y: string> etc. to a snowpark type.