snowpark-connect 0.22.1__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/config.py +0 -11
- snowflake/snowpark_connect/error/error_utils.py +7 -0
- snowflake/snowpark_connect/error/exceptions.py +4 -0
- snowflake/snowpark_connect/expression/hybrid_column_map.py +192 -0
- snowflake/snowpark_connect/expression/literal.py +9 -12
- snowflake/snowpark_connect/expression/map_cast.py +20 -4
- snowflake/snowpark_connect/expression/map_expression.py +8 -1
- snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +32 -5
- snowflake/snowpark_connect/expression/map_unresolved_function.py +66 -6
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +8 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +4 -2
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +127 -21
- snowflake/snowpark_connect/relation/map_aggregate.py +57 -5
- snowflake/snowpark_connect/relation/map_column_ops.py +38 -6
- snowflake/snowpark_connect/relation/map_extension.py +58 -24
- snowflake/snowpark_connect/relation/map_local_relation.py +8 -1
- snowflake/snowpark_connect/relation/map_row_ops.py +30 -1
- snowflake/snowpark_connect/relation/map_sql.py +22 -5
- snowflake/snowpark_connect/relation/read/map_read.py +2 -1
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +8 -1
- snowflake/snowpark_connect/relation/read/reader_config.py +9 -0
- snowflake/snowpark_connect/relation/read/utils.py +7 -6
- snowflake/snowpark_connect/relation/utils.py +170 -1
- snowflake/snowpark_connect/relation/write/map_write.py +243 -68
- snowflake/snowpark_connect/server.py +25 -5
- snowflake/snowpark_connect/type_mapping.py +2 -2
- snowflake/snowpark_connect/utils/env_utils.py +55 -0
- snowflake/snowpark_connect/utils/session.py +21 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/spark_decoder.py +1 -1
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/RECORD +40 -40
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +0 -4
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +0 -4
- {snowpark_connect-0.22.1.data → snowpark_connect-0.23.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.22.1.data → snowpark_connect-0.23.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.22.1.data → snowpark_connect-0.23.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.22.1.dist-info → snowpark_connect-0.23.0.dist-info}/top_level.txt +0 -0
@@ -73,12 +73,13 @@ def rename_columns_as_snowflake_standard(
         return df, []
 
     new_columns = make_column_names_snowpark_compatible(df.columns, plan_id)
-
-
-
-
-
-
+    result = df.toDF(*new_columns)
+    if result._select_statement is not None:
+        # do not allow snowpark to flatten the to_df result
+        # TODO: remove after SNOW-2203706 is fixed
+        result._select_statement.flatten_disabled = True
+
+    return (result, new_columns)
 
 
 class Connection(Protocol):
@@ -6,12 +6,28 @@ import random
 import re
 import string
 import time
-from
+from collections.abc import Callable
+from typing import AbstractSet, List, Optional, Sequence
 
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
+from snowflake.snowpark._internal.analyzer.expression import (
+    COLUMN_DEPENDENCY_ALL,
+    COLUMN_DEPENDENCY_DOLLAR,
+    Expression,
+    FunctionExpression,
+    derive_dependent_columns,
+)
+from snowflake.snowpark._internal.analyzer.select_statement import (
+    SEQUENCE_DEPENDENT_DATA_GENERATION,
+    ColumnChangeState,
+    ColumnStateDict,
+    SelectStatement,
+)
+from snowflake.snowpark._internal.analyzer.unary_expression import Alias
+from snowflake.snowpark._internal.analyzer.window_expression import WindowExpression
 from snowflake.snowpark.types import (
     BinaryType,
     BooleanType,
@@ -223,3 +239,156 @@ def snowpark_functions_col(name: str, column_map: ColumnNameMap) -> snowpark.Col
     """
     is_qualified_name = name not in column_map.get_snowpark_columns()
     return snowpark_fn.col(name, _is_qualified_name=is_qualified_name)
+
+
+def can_sort_be_flattened(
+    select_statement: Optional[SelectStatement], *sort_expressions: Optional[Expression]
+) -> bool:
+    """
+    Checks if the given SelectStatement can be "flattened" when sorting with regard to the given sort expressions.
+    Flattening means that the given SelectStatement can be enhanced and reused instead of being treated
+    as a subquery in the FROM clause after a "sort" or "filter" operation. Flattening allows accessing dropped columns
+    for sort and filter expressions.
+    """
+    if not select_statement or select_statement.flatten_disabled:
+        return False
+
+    # In some cases, flattening sort can lead to leaving the "order by" clause in a subquery,
+    # which can cause incorrect ordering. We want to avoid flattening sort when all its dependent columns
+    # are available in the current projection.
+    dependent_columns_in_sort = derive_dependent_columns(*sort_expressions)
+    columns_in_projection = _get_columns_in_projection(select_statement.projection)
+    if len(dependent_columns_in_sort - columns_in_projection) == 0:
+        return False
+
+    return _can_clause_dependent_columns_flatten(
+        dependent_columns_in_sort, select_statement.column_states
+    ) and not _has_data_generator_exp(select_statement.projection)
+
+
+def can_filter_be_flattened(
+    select_statement: Optional[SelectStatement], condition: Expression
+) -> bool:
+    """
+    Checks if the given SelectStatement can be "flattened" when filtering with regard to the given condition.
+    Flattening means that the given SelectStatement can be enhanced and reused instead of being treated
+    as a subquery in the FROM clause after a "sort" or "filter" operation. Flattening allows accessing dropped columns
+    for sort and filter expressions.
+    """
+    if not select_statement or select_statement.flatten_disabled:
+        return False
+
+    return all(
+        [
+            _can_clause_dependent_columns_flatten(
+                derive_dependent_columns(condition), select_statement.column_states
+            ),
+            not _has_data_generator_or_window_exp(select_statement.projection),
+            select_statement.order_by is None,
+            select_statement.limit_ is None,
+        ]
+    )
+
+
+def _get_columns_in_projection(
+    projection: Optional[List[Expression]],
+) -> AbstractSet[str]:
+    if projection is None:
+        return set()
+
+    columns = set()
+    for expression in projection:
+        if hasattr(expression, "name") and expression.name:
+            columns.add(expression.name)
+        elif hasattr(expression, "children"):
+            columns.update(_get_columns_in_projection(expression.children))
+
+    return columns
+
+
+def _is_self_alias(expression):
+    """
+    Check if the expression is a self-alias, meaning it has an alias that is the same as its name.
+    A self-alias can be flattened, even if Snowpark treats it as a CHANGED_EXP.
+    """
+    if not isinstance(expression, Alias):
+        return False
+
+    first_child_with_name = expression.child
+    while (
+        first_child_with_name
+        and hasattr(first_child_with_name, "child")
+        and not hasattr(first_child_with_name, "name")
+    ):
+        first_child_with_name = first_child_with_name.child
+
+    return (
+        first_child_with_name
+        and (first_child_with_name, "name")
+        and first_child_with_name.name == expression.name
+    )
+
+
+def _can_clause_dependent_columns_flatten(
+    dependent_columns: Optional[AbstractSet[str]],
+    subquery_column_states: ColumnStateDict,
+) -> bool:
+    if dependent_columns == COLUMN_DEPENDENCY_DOLLAR:
+        return False
+    elif (
+        subquery_column_states.has_changed_columns
+        or subquery_column_states.has_new_columns
+    ):
+        if dependent_columns == COLUMN_DEPENDENCY_ALL:
+            return False
+
+        assert dependent_columns is not None
+        for dc in dependent_columns:
+            dc_state = subquery_column_states.get(dc)
+            if dc_state:
+                if (
+                    dc_state.change_state == ColumnChangeState.CHANGED_EXP
+                    and not _is_self_alias(dc_state.expression)
+                ):
+                    return False
+    return True
+
+
+def _has_data_generator_exp(expressions: List[Expression]) -> bool:
+    return _has_expression(expressions, [_is_generator_expression])
+
+
+def _has_data_generator_or_window_exp(expressions: List[Expression]) -> bool:
+    return _has_expression(
+        expressions, [_is_generator_expression, _is_window_expression]
+    )
+
+
+def _has_expression(
+    expressions: Optional[List[Expression]], checks: List[Callable[[Expression], bool]]
+) -> bool:
+    if expressions is None:
+        return False
+
+    for exp in expressions:
+        if not exp:
+            continue
+
+        if any([check(exp) for check in checks]):
+            return True
+
+        if _has_expression(exp.children, checks):
+            return True
+
+    return False
+
+
+def _is_window_expression(exp: Expression) -> bool:
+    return isinstance(exp, WindowExpression)
+
+
+def _is_generator_expression(exp: Expression) -> bool:
+    # https://docs.snowflake.com/en/sql-reference/functions-data-generation
+    return isinstance(exp, FunctionExpression) and (
+        exp.is_data_generator or exp.name.lower() in SEQUENCE_DEPENDENT_DATA_GENERATION
+    )
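Note: the helpers added above walk Snowpark's internal expression trees recursively. The following standalone sketch shows the same traversal pattern with simple stand-in classes; `Node` and `has_node` are illustrative names, not part of snowpark-connect, and the real code operates on Snowpark `Expression` objects instead.

    from dataclasses import dataclass
    from typing import Callable, List, Optional


    @dataclass
    class Node:
        # Stand-in for an expression node; real code uses Snowpark Expression objects.
        name: str
        children: Optional[List["Node"]] = None


    def has_node(nodes: Optional[List[Node]], checks: List[Callable[[Node], bool]]) -> bool:
        # Same shape as _has_expression above: depth-first search that returns as soon
        # as any predicate matches a node anywhere in the tree.
        if nodes is None:
            return False
        for node in nodes:
            if node and any(check(node) for check in checks):
                return True
            if node and has_node(node.children, checks):
                return True
        return False


    tree = [Node("alias", children=[Node("window_fn")])]
    print(has_node(tree, [lambda n: n.name == "window_fn"]))  # True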
@@ -234,6 +234,10 @@ def map_write(request: proto_base.ExecutePlanRequest):
                     column_order=_column_order_for_write,
                 )
             case "append":
+                if check_table_type(snowpark_table_name, session) != "ICEBERG":
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} is not an iceberg table"
+                    )
                 _validate_schema_and_get_writer(
                     input_df, "append", snowpark_table_name
                 ).saveAsTable(
@@ -260,7 +264,10 @@ def map_write(request: proto_base.ExecutePlanRequest):
                 )
             case "overwrite":
                 if check_snowflake_table_existence(snowpark_table_name, session):
-
+                    if check_table_type(snowpark_table_name, session) != "ICEBERG":
+                        raise AnalysisException(
+                            f"Table {snowpark_table_name} is not an iceberg table"
+                        )
                 else:
                     create_iceberg_table(
                         snowpark_table_name=snowpark_table_name,
@@ -269,10 +276,10 @@ def map_write(request: proto_base.ExecutePlanRequest):
                         snowpark_session=session,
                     )
                 _validate_schema_and_get_writer(
-                    input_df, "
+                    input_df, "truncate", snowpark_table_name
                 ).saveAsTable(
                     table_name=snowpark_table_name,
-                    mode="
+                    mode="truncate",
                     column_order=_column_order_for_write,
                 )
             case _:
@@ -286,13 +293,46 @@ def map_write(request: proto_base.ExecutePlanRequest):
         write_op.table.save_method
         == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE
     ):
-
-
-
-
-
-
-
+        match write_mode:
+            case "overwrite":
+                if check_snowflake_table_existence(
+                    snowpark_table_name, session
+                ):
+                    if (
+                        check_table_type(snowpark_table_name, session)
+                        != "TABLE"
+                    ):
+                        raise AnalysisException(
+                            f"Table {snowpark_table_name} is not a FDN table"
+                        )
+                    write_mode = "truncate"
+                _validate_schema_and_get_writer(
+                    input_df, write_mode, snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode=write_mode,
+                    column_order=_column_order_for_write,
+                )
+            case "append":
+                if check_table_type(snowpark_table_name, session) != "TABLE":
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} is not a FDN table"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, write_mode, snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode=write_mode,
+                    column_order=_column_order_for_write,
+                )
+            case _:
+                _validate_schema_and_get_writer(
+                    input_df, write_mode, snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode=write_mode,
+                    column_order=_column_order_for_write,
+                )
     elif (
         write_op.table.save_method
         == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO
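For context, the branches above back the classic `DataFrameWriter.saveAsTable` path on the client. Below is a hedged client-side sketch of the calls that reach this match statement; the Spark Connect endpoint and table name are placeholders.

    from pyspark.sql import SparkSession

    # Placeholder Spark Connect endpoint; any client session works the same way.
    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    df = spark.range(10)

    # "overwrite" is rewritten to Snowpark's "truncate" mode above when the target
    # table already exists; "append" requires an existing table of the expected type.
    df.write.mode("overwrite").saveAsTable("my_db.my_schema.my_table")
    df.write.mode("append").saveAsTable("my_db.my_schema.my_table")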
@@ -312,21 +352,6 @@ def map_write(request: proto_base.ExecutePlanRequest):
 
 def map_write_v2(request: proto_base.ExecutePlanRequest):
     write_op = request.plan.command.write_operation_v2
-    match write_op.mode:
-        case commands_proto.WriteOperationV2.MODE_APPEND:
-            write_mode = "append"
-        case commands_proto.WriteOperationV2.MODE_CREATE:
-            write_mode = "errorifexists"
-        case commands_proto.WriteOperationV2.MODE_OVERWRITE:
-            write_mode = "overwrite"
-        case commands_proto.WriteOperationV2.MODE_REPLACE:
-            write_mode = "overwrite"
-        case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
-            write_mode = "overwrite"
-        case _:
-            raise SnowparkConnectNotImplementedError(
-                f"Write operation {write_op.mode} not implemented."
-            )
 
     snowpark_table_name = _spark_to_snowflake(write_op.table_name)
     result = map_relation(write_op.input)
@@ -338,55 +363,176 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):
             "Write operation V2 only support table writing now"
         )
 
-    # For OVERWRITE and APPEND modes, check if table exists first - Spark requires table to exist for these operations
-    if write_op.mode in (
-        commands_proto.WriteOperationV2.MODE_OVERWRITE,
-        commands_proto.WriteOperationV2.MODE_APPEND,
-    ):
-        if not check_snowflake_table_existence(snowpark_table_name, session):
-            raise AnalysisException(
-                f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{write_op.table_name}` cannot be found. "
-                f"Verify the spelling and correctness of the schema and catalog.\n"
-            )
-
     if write_op.provider.lower() == "iceberg":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        match write_op.mode:
+            case commands_proto.WriteOperationV2.MODE_CREATE:
+                if check_snowflake_table_existence(snowpark_table_name, session):
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} already exists"
+                    )
+                create_iceberg_table(
+                    snowpark_table_name=snowpark_table_name,
+                    location=write_op.table_properties.get("location"),
+                    schema=input_df.schema,
+                    snowpark_session=session,
+                )
+                _validate_schema_and_get_writer(
+                    input_df, "append", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="append",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_APPEND:
+                if not check_snowflake_table_existence(snowpark_table_name, session):
+                    raise AnalysisException(
+                        f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{write_op.table_name}` cannot be found."
+                    )
+                if check_table_type(snowpark_table_name, session) != "ICEBERG":
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} is not an iceberg table"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "append", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="append",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_OVERWRITE | commands_proto.WriteOperationV2.MODE_OVERWRITE_PARTITIONS:
+                # TODO: handle the filter condition for MODE_OVERWRITE
+                if check_snowflake_table_existence(snowpark_table_name, session):
+                    if check_table_type(snowpark_table_name, session) != "ICEBERG":
+                        raise AnalysisException(
+                            f"Table {snowpark_table_name} is not an iceberg table"
+                        )
+                else:
+                    raise AnalysisException(
+                        f"[TABLE_OR_VIEW_NOT_FOUND] Table {snowpark_table_name} does not exist"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "truncate", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="truncate",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_REPLACE:
+                if check_snowflake_table_existence(snowpark_table_name, session):
+                    create_iceberg_table(
+                        snowpark_table_name=snowpark_table_name,
+                        location=write_op.table_properties.get("location"),
+                        schema=input_df.schema,
+                        snowpark_session=session,
+                        mode="replace",
+                    )
+                else:
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} does not exist"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "replace", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="append",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
+                create_iceberg_table(
+                    snowpark_table_name=snowpark_table_name,
+                    location=write_op.table_properties.get("location"),
+                    schema=input_df.schema,
+                    snowpark_session=session,
+                    mode="create_or_replace",
+                )
+                _validate_schema_and_get_writer(
+                    input_df, "create_or_replace", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="append",
+                    column_order=_column_order_for_write,
+                )
+            case _:
+                raise SnowparkConnectNotImplementedError(
+                    f"Write mode {commands_proto.WriteOperationV2.Mode.Name(write_op.mode)} is not supported"
+                )
     else:
-
-
-
-
-
-
-
+        match write_op.mode:
+            case commands_proto.WriteOperationV2.MODE_CREATE:
+                _validate_schema_and_get_writer(
+                    input_df, "errorifexists", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="errorifexists",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_APPEND:
+                if not check_snowflake_table_existence(snowpark_table_name, session):
+                    raise AnalysisException(
+                        f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{write_op.table_name}` cannot be found."
+                    )
+                if check_table_type(snowpark_table_name, session) != "TABLE":
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} is not a FDN table"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "append", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="append",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_OVERWRITE | commands_proto.WriteOperationV2.MODE_OVERWRITE_PARTITIONS:
+                # TODO: handle the filter condition for MODE_OVERWRITE
+                if check_snowflake_table_existence(snowpark_table_name, session):
+                    if check_table_type(snowpark_table_name, session) != "TABLE":
+                        raise AnalysisException(
+                            f"Table {snowpark_table_name} is not a FDN table"
+                        )
+                else:
+                    raise AnalysisException(
+                        f"[TABLE_OR_VIEW_NOT_FOUND] Table {snowpark_table_name} does not exist"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "truncate", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="truncate",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_REPLACE:
+                if not check_snowflake_table_existence(snowpark_table_name, session):
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} does not exist"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "replace", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="overwrite",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
+                _validate_schema_and_get_writer(
+                    input_df, "create_or_replace", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="overwrite",
+                    column_order=_column_order_for_write,
+                )
+            case _:
+                raise SnowparkConnectNotImplementedError(
+                    f"Write mode {commands_proto.WriteOperationV2.Mode.Name(write_op.mode)} is not supported"
+                )
 
 
 def _validate_schema_and_get_writer(
     input_df: snowpark.DataFrame, write_mode: str, snowpark_table_name: str
 ) -> snowpark.DataFrameWriter:
-    if write_mode
+    if write_mode is not None and write_mode.lower() in (
+        "replace",
+        "create_or_replace",
+    ):
         return input_df.write
 
     table_schema = None
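For orientation, the V2 modes matched above correspond to PySpark `DataFrameWriterV2` calls on the client side. A hedged sketch follows; the endpoint and table name are placeholders, and the methods listed in the comment are alternatives rather than a sequence.

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    df = spark.range(10)

    # DataFrameWriterV2 terminal methods and the WriteOperationV2 modes they produce:
    #   .create()              -> MODE_CREATE
    #   .append()              -> MODE_APPEND
    #   .overwrite(condition)  -> MODE_OVERWRITE
    #   .overwritePartitions() -> MODE_OVERWRITE_PARTITIONS
    #   .replace()             -> MODE_REPLACE
    #   .createOrReplace()     -> MODE_CREATE_OR_REPLACE
    df.writeTo("my_db.my_schema.my_table").using("iceberg").createOrReplace()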
@@ -528,6 +674,7 @@ def create_iceberg_table(
     location: str,
     schema: StructType,
     snowpark_session: snowpark.Session,
+    mode: str = "create",
 ):
     table_schema = [
         f"{spark_to_sf_single_id(unquote_if_quoted(field.name), is_column = True)} {snowpark_to_iceberg_type(field.datatype)}"
@@ -550,8 +697,20 @@ def create_iceberg_table(
         else f"EXTERNAL_VOLUME = '{config_external_volume}'"
     )
 
+    match mode:
+        case "create":
+            create_sql = "CREATE"
+        case "replace":
+            # There's no replace for iceberg table, so we use create or replace
+            create_sql = "CREATE OR REPLACE"
+        case "create_or_replace":
+            create_sql = "CREATE OR REPLACE"
+        case _:
+            raise SnowparkConnectNotImplementedError(
+                f"Write mode {mode} is not supported for iceberg table"
+            )
     sql = f"""
-
+    {create_sql} ICEBERG TABLE {snowpark_table_name} ({",".join(table_schema)})
     CATALOG = 'SNOWFLAKE'
     {external_volume}
     {base_location};
@@ -637,3 +796,19 @@ def check_snowflake_table_existence(
         return True
     except Exception:
         return False
+
+
+def check_table_type(
+    snowpark_table_name: str,
+    snowpark_session: snowpark.Session,
+) -> str:
+    # currently we only support iceberg table and FDN table
+    metadata = snowpark_session.sql(
+        f"SHOW TABLES LIKE '{unquote_if_quoted(snowpark_table_name)}';"
+    ).collect()
+    if metadata is None or len(metadata) == 0:
+        raise AnalysisException(f"Table {snowpark_table_name} does not exist")
+    metadata = metadata[0]
+    if metadata.as_dict().get("is_iceberg") == "Y":
+        return "ICEBERG"
+    return "TABLE"
@@ -83,6 +83,7 @@ from snowflake.snowpark_connect.utils.context import (
     set_session_id,
     set_spark_version,
 )
+from snowflake.snowpark_connect.utils.env_utils import get_int_from_env
 from snowflake.snowpark_connect.utils.interrupt import (
     interrupt_all_queries,
     interrupt_queries_with_tag,
@@ -700,11 +701,27 @@ def _serve(
         return
 
     server_options = [
-        (
-
+        (
+            "grpc.max_receive_message_length",
+            get_int_from_env(
+                "SNOWFLAKE_GRPC_MAX_MESSAGE_SIZE",
+                _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE,
+            ),
+        ),
+        (
+            "grpc.max_metadata_size",
+            get_int_from_env(
+                "SNOWFLAKE_GRPC_MAX_METADATA_SIZE",
+                _SPARK_CONNECT_GRPC_MAX_METADATA_SIZE,
+            ),
+        ),
         (
             "grpc.absolute_max_metadata_size",
-
+            get_int_from_env(
+                "SNOWFLAKE_GRPC_MAX_METADATA_SIZE",
+                _SPARK_CONNECT_GRPC_MAX_METADATA_SIZE,
+            )
+            * 2,
         ),
     ]
     server = grpc.server(
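`get_int_from_env` comes from the new `snowflake/snowpark_connect/utils/env_utils.py`, whose body is not shown in this diff. Below is a minimal sketch of the behavior the call sites above appear to rely on (an integer environment override with a fallback default); this is an assumption, not the actual implementation, and the `_sketch` name and 128 MiB default are illustrative.

    import os


    def get_int_from_env_sketch(name: str, default: int) -> int:
        # Return int(os.environ[name]) when the variable is set and parseable,
        # otherwise fall back to the provided default.
        raw = os.environ.get(name)
        if raw is None:
            return default
        try:
            return int(raw)
        except ValueError:
            return default


    # Example: let SNOWFLAKE_GRPC_MAX_MESSAGE_SIZE override an assumed 128 MiB default.
    max_message_size = get_int_from_env_sketch("SNOWFLAKE_GRPC_MAX_MESSAGE_SIZE", 128 * 1024 * 1024)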
@@ -812,8 +829,11 @@ class UnixDomainSocketChannelBuilder(ChannelBuilder):
     Spark Connect gRPC channel builder for Unix domain sockets
     """
 
-    def __init__(
-        url: str =
+    def __init__(
+        self, url: str = None, channelOptions: Optional[List[Tuple[str, Any]]] = None
+    ) -> None:
+        if url is None:
+            url = get_client_url()
         if url[:6] != "unix:/" or len(url) < 7:
             raise PySparkValueError(
                 error_class="INVALID_CONNECT_URL",
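The builder can now be constructed without an explicit URL (it falls back to `get_client_url()`) and with extra gRPC channel options. A hedged usage sketch follows; the import path is inferred from this diff (per-file headers are not shown), and the socket path and option value are placeholders.

    # Assumed import path (this hunk appears to belong to snowflake/snowpark_connect/server.py).
    from snowflake.snowpark_connect.server import UnixDomainSocketChannelBuilder

    builder = UnixDomainSocketChannelBuilder(
        url="unix:///tmp/snowpark-connect.sock",  # placeholder socket path
        channelOptions=[("grpc.max_receive_message_length", 128 * 1024 * 1024)],
    )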
@@ -59,7 +59,7 @@ def _get_struct_type_class():
 
 
 @cache
-def
+def get_python_sql_utils_class():
     return jpype.JClass("org.apache.spark.sql.api.python.PythonSQLUtils")
 
 
@@ -70,7 +70,7 @@ def parse_ddl_with_spark_scala(ddl_string: str) -> pyspark.sql.types.DataType:
     This mimics pysparks.ddl parsing logic pyspark.sql.types._py_parse_datatype_string
     """
     struct_type_class = _get_struct_type_class()
-    python_sql_utils =
+    python_sql_utils = get_python_sql_utils_class()
 
     try:
         # DDL format, "fieldname datatype, fieldname datatype".