snowpark-connect 0.21.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff compares the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of snowpark-connect has been flagged as possibly problematic.
- snowflake/snowpark_connect/config.py +19 -14
- snowflake/snowpark_connect/error/error_utils.py +32 -0
- snowflake/snowpark_connect/error/exceptions.py +4 -0
- snowflake/snowpark_connect/expression/hybrid_column_map.py +192 -0
- snowflake/snowpark_connect/expression/literal.py +9 -12
- snowflake/snowpark_connect/expression/map_cast.py +20 -4
- snowflake/snowpark_connect/expression/map_expression.py +8 -1
- snowflake/snowpark_connect/expression/map_udf.py +4 -4
- snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +32 -5
- snowflake/snowpark_connect/expression/map_unresolved_function.py +269 -134
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +8 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +4 -2
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +127 -21
- snowflake/snowpark_connect/relation/map_aggregate.py +154 -18
- snowflake/snowpark_connect/relation/map_column_ops.py +59 -8
- snowflake/snowpark_connect/relation/map_extension.py +58 -24
- snowflake/snowpark_connect/relation/map_local_relation.py +8 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +3 -1
- snowflake/snowpark_connect/relation/map_row_ops.py +30 -1
- snowflake/snowpark_connect/relation/map_sql.py +40 -196
- snowflake/snowpark_connect/relation/map_udtf.py +4 -4
- snowflake/snowpark_connect/relation/read/map_read.py +2 -1
- snowflake/snowpark_connect/relation/read/map_read_json.py +12 -1
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +8 -1
- snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
- snowflake/snowpark_connect/relation/read/utils.py +7 -6
- snowflake/snowpark_connect/relation/utils.py +170 -1
- snowflake/snowpark_connect/relation/write/map_write.py +306 -87
- snowflake/snowpark_connect/server.py +34 -5
- snowflake/snowpark_connect/type_mapping.py +6 -2
- snowflake/snowpark_connect/utils/describe_query_cache.py +2 -9
- snowflake/snowpark_connect/utils/env_utils.py +55 -0
- snowflake/snowpark_connect/utils/session.py +21 -4
- snowflake/snowpark_connect/utils/telemetry.py +213 -61
- snowflake/snowpark_connect/utils/udxf_import_utils.py +14 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/__init__.py +0 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.py +36 -0
- snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.pyi +156 -0
- snowflake/snowpark_decoder/dp_session.py +111 -0
- snowflake/snowpark_decoder/spark_decoder.py +76 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/RECORD +55 -44
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/top_level.txt +1 -0
- spark/__init__.py +0 -0
- spark/connect/__init__.py +0 -0
- spark/connect/envelope_pb2.py +31 -0
- spark/connect/envelope_pb2.pyi +46 -0
- snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
- {snowpark_connect-0.21.0.data → snowpark_connect-0.23.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.21.0.data → snowpark_connect-0.23.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.21.0.data → snowpark_connect-0.23.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/NOTICE-binary +0 -0
@@ -73,12 +73,13 @@ def rename_columns_as_snowflake_standard(
         return df, []

     new_columns = make_column_names_snowpark_compatible(df.columns, plan_id)
-
-
-
-
-
-
+    result = df.toDF(*new_columns)
+    if result._select_statement is not None:
+        # do not allow snowpark to flatten the to_df result
+        # TODO: remove after SNOW-2203706 is fixed
+        result._select_statement.flatten_disabled = True
+
+    return (result, new_columns)


 class Connection(Protocol):

@@ -6,12 +6,28 @@ import random
 import re
 import string
 import time
-from
+from collections.abc import Callable
+from typing import AbstractSet, List, Optional, Sequence

 import pyspark.sql.connect.proto.relations_pb2 as relation_proto

 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
+from snowflake.snowpark._internal.analyzer.expression import (
+    COLUMN_DEPENDENCY_ALL,
+    COLUMN_DEPENDENCY_DOLLAR,
+    Expression,
+    FunctionExpression,
+    derive_dependent_columns,
+)
+from snowflake.snowpark._internal.analyzer.select_statement import (
+    SEQUENCE_DEPENDENT_DATA_GENERATION,
+    ColumnChangeState,
+    ColumnStateDict,
+    SelectStatement,
+)
+from snowflake.snowpark._internal.analyzer.unary_expression import Alias
+from snowflake.snowpark._internal.analyzer.window_expression import WindowExpression
 from snowflake.snowpark.types import (
     BinaryType,
     BooleanType,

@@ -223,3 +239,156 @@ def snowpark_functions_col(name: str, column_map: ColumnNameMap) -> snowpark.Col
     """
     is_qualified_name = name not in column_map.get_snowpark_columns()
     return snowpark_fn.col(name, _is_qualified_name=is_qualified_name)
+
+
+def can_sort_be_flattened(
+    select_statement: Optional[SelectStatement], *sort_expressions: Optional[Expression]
+) -> bool:
+    """
+    Checks if the given SelectStatement can be "flattened" when sorting with regard to the given sort expressions.
+    Flattening means that the given SelectStatement can be enhanced and reused instead of being treated
+    as a subquery in the FROM clause after a "sort" or "filter" operation. Flattening allows accessing dropped columns
+    for sort and filter expressions.
+    """
+    if not select_statement or select_statement.flatten_disabled:
+        return False
+
+    # In some cases, flattening sort can lead to leaving the "order by" clause in a subquery,
+    # which can cause incorrect ordering. We want to avoid flattening sort when all its dependent columns
+    # are available in the current projection.
+    dependent_columns_in_sort = derive_dependent_columns(*sort_expressions)
+    columns_in_projection = _get_columns_in_projection(select_statement.projection)
+    if len(dependent_columns_in_sort - columns_in_projection) == 0:
+        return False
+
+    return _can_clause_dependent_columns_flatten(
+        dependent_columns_in_sort, select_statement.column_states
+    ) and not _has_data_generator_exp(select_statement.projection)
+
+
+def can_filter_be_flattened(
+    select_statement: Optional[SelectStatement], condition: Expression
+) -> bool:
+    """
+    Checks if the given SelectStatement can be "flattened" when filtering with regard to the given condition.
+    Flattening means that the given SelectStatement can be enhanced and reused instead of being treated
+    as a subquery in the FROM clause after a "sort" or "filter" operation. Flattening allows accessing dropped columns
+    for sort and filter expressions.
+    """
+    if not select_statement or select_statement.flatten_disabled:
+        return False
+
+    return all(
+        [
+            _can_clause_dependent_columns_flatten(
+                derive_dependent_columns(condition), select_statement.column_states
+            ),
+            not _has_data_generator_or_window_exp(select_statement.projection),
+            select_statement.order_by is None,
+            select_statement.limit_ is None,
+        ]
+    )
+
+
+def _get_columns_in_projection(
+    projection: Optional[List[Expression]],
+) -> AbstractSet[str]:
+    if projection is None:
+        return set()
+
+    columns = set()
+    for expression in projection:
+        if hasattr(expression, "name") and expression.name:
+            columns.add(expression.name)
+        elif hasattr(expression, "children"):
+            columns.update(_get_columns_in_projection(expression.children))
+
+    return columns
+
+
+def _is_self_alias(expression):
+    """
+    Check if the expression is a self-alias, meaning it has an alias that is the same as its name.
+    A self-alias can be flattened, even if Snowpark treats it as a CHANGED_EXP.
+    """
+    if not isinstance(expression, Alias):
+        return False
+
+    first_child_with_name = expression.child
+    while (
+        first_child_with_name
+        and hasattr(first_child_with_name, "child")
+        and not hasattr(first_child_with_name, "name")
+    ):
+        first_child_with_name = first_child_with_name.child
+
+    return (
+        first_child_with_name
+        and hasattr(first_child_with_name, "name")
+        and first_child_with_name.name == expression.name
+    )
+
+
+def _can_clause_dependent_columns_flatten(
+    dependent_columns: Optional[AbstractSet[str]],
+    subquery_column_states: ColumnStateDict,
+) -> bool:
+    if dependent_columns == COLUMN_DEPENDENCY_DOLLAR:
+        return False
+    elif (
+        subquery_column_states.has_changed_columns
+        or subquery_column_states.has_new_columns
+    ):
+        if dependent_columns == COLUMN_DEPENDENCY_ALL:
+            return False
+
+        assert dependent_columns is not None
+        for dc in dependent_columns:
+            dc_state = subquery_column_states.get(dc)
+            if dc_state:
+                if (
+                    dc_state.change_state == ColumnChangeState.CHANGED_EXP
+                    and not _is_self_alias(dc_state.expression)
+                ):
+                    return False
+    return True
+
+
+def _has_data_generator_exp(expressions: List[Expression]) -> bool:
+    return _has_expression(expressions, [_is_generator_expression])
+
+
+def _has_data_generator_or_window_exp(expressions: List[Expression]) -> bool:
+    return _has_expression(
+        expressions, [_is_generator_expression, _is_window_expression]
+    )
+
+
+def _has_expression(
+    expressions: Optional[List[Expression]], checks: List[Callable[[Expression], bool]]
+) -> bool:
+    if expressions is None:
+        return False
+
+    for exp in expressions:
+        if not exp:
+            continue
+
+        if any([check(exp) for check in checks]):
+            return True
+
+        if _has_expression(exp.children, checks):
+            return True
+
+    return False
+
+
+def _is_window_expression(exp: Expression) -> bool:
+    return isinstance(exp, WindowExpression)
+
+
+def _is_generator_expression(exp: Expression) -> bool:
+    # https://docs.snowflake.com/en/sql-reference/functions-data-generation
+    return isinstance(exp, FunctionExpression) and (
+        exp.is_data_generator or exp.name.lower() in SEQUENCE_DEPENDENT_DATA_GENERATION
+    )

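Note: the flattening helpers above walk Snowpark analyzer expression trees recursively. The following is a minimal standalone sketch of that depth-first walk, using a hypothetical Node class in place of Snowpark's Expression; it is illustrative only and not part of the diff.

    from dataclasses import dataclass, field
    from typing import Callable, List, Optional

    @dataclass
    class Node:
        name: str
        children: List["Node"] = field(default_factory=list)

    def has_expression(nodes: Optional[List[Node]], checks: List[Callable[[Node], bool]]) -> bool:
        # Same shape as _has_expression above: depth-first walk that returns True
        # as soon as any check matches any node in the tree.
        if nodes is None:
            return False
        for node in nodes:
            if any(check(node) for check in checks):
                return True
            if has_expression(node.children, checks):
                return True
        return False

    projection = [Node("col_a"), Node("alias_x", [Node("seq4")])]
    print(has_expression(projection, [lambda n: n.name == "seq4"]))    # True
    print(has_expression(projection, [lambda n: n.name == "random"]))  # False
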
@@ -214,27 +214,78 @@ def map_write(request: proto_base.ExecutePlanRequest):
             )
             snowpark_table_name = _spark_to_snowflake(table_name)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            match write_mode:
+                case None | "error" | "errorifexists":
+                    if check_snowflake_table_existence(snowpark_table_name, session):
+                        raise AnalysisException(
+                            f"Table {snowpark_table_name} already exists"
+                        )
+                    create_iceberg_table(
+                        snowpark_table_name=snowpark_table_name,
+                        location=write_op.options.get("location", None),
+                        schema=input_df.schema,
+                        snowpark_session=session,
+                    )
+                    _validate_schema_and_get_writer(
+                        input_df, "append", snowpark_table_name
+                    ).saveAsTable(
+                        table_name=snowpark_table_name,
+                        mode="append",
+                        column_order=_column_order_for_write,
+                    )
+                case "append":
+                    if check_table_type(snowpark_table_name, session) != "ICEBERG":
+                        raise AnalysisException(
+                            f"Table {snowpark_table_name} is not an iceberg table"
+                        )
+                    _validate_schema_and_get_writer(
+                        input_df, "append", snowpark_table_name
+                    ).saveAsTable(
+                        table_name=snowpark_table_name,
+                        mode="append",
+                        column_order=_column_order_for_write,
+                    )
+                case "ignore":
+                    if not check_snowflake_table_existence(
+                        snowpark_table_name, session
+                    ):
+                        create_iceberg_table(
+                            snowpark_table_name=snowpark_table_name,
+                            location=write_op.options.get("location", None),
+                            schema=input_df.schema,
+                            snowpark_session=session,
+                        )
+                    _validate_schema_and_get_writer(
+                        input_df, "append", snowpark_table_name
+                    ).saveAsTable(
+                        table_name=snowpark_table_name,
+                        mode="append",
+                        column_order=_column_order_for_write,
+                    )
+                case "overwrite":
+                    if check_snowflake_table_existence(snowpark_table_name, session):
+                        if check_table_type(snowpark_table_name, session) != "ICEBERG":
+                            raise AnalysisException(
+                                f"Table {snowpark_table_name} is not an iceberg table"
+                            )
+                    else:
+                        create_iceberg_table(
+                            snowpark_table_name=snowpark_table_name,
+                            location=write_op.options.get("location", None),
+                            schema=input_df.schema,
+                            snowpark_session=session,
+                        )
+                    _validate_schema_and_get_writer(
+                        input_df, "truncate", snowpark_table_name
+                    ).saveAsTable(
+                        table_name=snowpark_table_name,
+                        mode="truncate",
+                        column_order=_column_order_for_write,
+                    )
+                case _:
+                    raise SnowparkConnectNotImplementedError(
+                        f"Write mode {write_mode} is not supported"
+                    )
         case _:
             snowpark_table_name = _spark_to_snowflake(write_op.table.table_name)

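Distilled from the new iceberg branch above, a rough per-mode summary (illustrative sketch only, not in the wheel): Spark save mode mapped to the table-creation step and the Snowpark saveAsTable mode used.

    ICEBERG_V1_BEHAVIOR = {
        None: ("create table, error if it exists", "append"),
        "error": ("create table, error if it exists", "append"),
        "errorifexists": ("create table, error if it exists", "append"),
        "append": ("target must already be an ICEBERG table", "append"),
        "ignore": ("create table only if missing", "append"),
        "overwrite": ("create table if missing, else must be ICEBERG", "truncate"),
    }
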
@@ -242,13 +293,46 @@ def map_write(request: proto_base.ExecutePlanRequest):
         write_op.table.save_method
         == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE
     ):
-
-
-
-
-
-
-
+        match write_mode:
+            case "overwrite":
+                if check_snowflake_table_existence(
+                    snowpark_table_name, session
+                ):
+                    if (
+                        check_table_type(snowpark_table_name, session)
+                        != "TABLE"
+                    ):
+                        raise AnalysisException(
+                            f"Table {snowpark_table_name} is not a FDN table"
+                        )
+                    write_mode = "truncate"
+                _validate_schema_and_get_writer(
+                    input_df, write_mode, snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode=write_mode,
+                    column_order=_column_order_for_write,
+                )
+            case "append":
+                if check_table_type(snowpark_table_name, session) != "TABLE":
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} is not a FDN table"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, write_mode, snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode=write_mode,
+                    column_order=_column_order_for_write,
+                )
+            case _:
+                _validate_schema_and_get_writer(
+                    input_df, write_mode, snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode=write_mode,
+                    column_order=_column_order_for_write,
+                )
     elif (
         write_op.table.save_method
         == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO

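The saveAsTable branch above downgrades "overwrite" to Snowpark's "truncate" mode when the target FDN table already exists, and leaves other modes unchanged. A hedged one-function sketch of that decision (hypothetical helper, not part of the package):

    def effective_save_mode(write_mode: str, target_exists_as_fdn_table: bool) -> str:
        # Mirrors the dispatch above: an existing FDN table is truncated instead of replaced.
        if write_mode == "overwrite" and target_exists_as_fdn_table:
            return "truncate"
        return write_mode

    print(effective_save_mode("overwrite", True))   # truncate
    print(effective_save_mode("overwrite", False))  # overwrite
    print(effective_save_mode("append", True))      # append
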
@@ -268,21 +352,6 @@ def map_write(request: proto_base.ExecutePlanRequest):

 def map_write_v2(request: proto_base.ExecutePlanRequest):
     write_op = request.plan.command.write_operation_v2
-    match write_op.mode:
-        case commands_proto.WriteOperationV2.MODE_APPEND:
-            write_mode = "append"
-        case commands_proto.WriteOperationV2.MODE_CREATE:
-            write_mode = "errorifexists"
-        case commands_proto.WriteOperationV2.MODE_OVERWRITE:
-            write_mode = "overwrite"
-        case commands_proto.WriteOperationV2.MODE_REPLACE:
-            write_mode = "overwrite"
-        case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
-            write_mode = "overwrite"
-        case _:
-            raise SnowparkConnectNotImplementedError(
-                f"Write operation {write_op.mode} not implemented."
-            )

     snowpark_table_name = _spark_to_snowflake(write_op.table_name)
     result = map_relation(write_op.input)

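For reference, the removed block collapsed every WriteOperationV2 mode into a single Snowpark save mode before the per-mode dispatch shown below replaced it. A sketch of that old mapping, with the proto enum members written as plain strings (illustrative only, not in the wheel):

    OLD_V2_MODE_TO_SNOWPARK_MODE = {
        "MODE_APPEND": "append",
        "MODE_CREATE": "errorifexists",
        "MODE_OVERWRITE": "overwrite",
        "MODE_REPLACE": "overwrite",
        "MODE_CREATE_OR_REPLACE": "overwrite",
    }
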
@@ -294,55 +363,176 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):
             "Write operation V2 only support table writing now"
         )

-    # For OVERWRITE and APPEND modes, check if table exists first - Spark requires table to exist for these operations
-    if write_op.mode in (
-        commands_proto.WriteOperationV2.MODE_OVERWRITE,
-        commands_proto.WriteOperationV2.MODE_APPEND,
-    ):
-        if not check_snowflake_table_existance(snowpark_table_name, session):
-            raise AnalysisException(
-                f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{write_op.table_name}` cannot be found. "
-                f"Verify the spelling and correctness of the schema and catalog.\n"
-            )
-
     if write_op.provider.lower() == "iceberg":
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        match write_op.mode:
+            case commands_proto.WriteOperationV2.MODE_CREATE:
+                if check_snowflake_table_existence(snowpark_table_name, session):
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} already exists"
+                    )
+                create_iceberg_table(
+                    snowpark_table_name=snowpark_table_name,
+                    location=write_op.table_properties.get("location"),
+                    schema=input_df.schema,
+                    snowpark_session=session,
+                )
+                _validate_schema_and_get_writer(
+                    input_df, "append", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="append",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_APPEND:
+                if not check_snowflake_table_existence(snowpark_table_name, session):
+                    raise AnalysisException(
+                        f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{write_op.table_name}` cannot be found."
+                    )
+                if check_table_type(snowpark_table_name, session) != "ICEBERG":
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} is not an iceberg table"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "append", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="append",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_OVERWRITE | commands_proto.WriteOperationV2.MODE_OVERWRITE_PARTITIONS:
+                # TODO: handle the filter condition for MODE_OVERWRITE
+                if check_snowflake_table_existence(snowpark_table_name, session):
+                    if check_table_type(snowpark_table_name, session) != "ICEBERG":
+                        raise AnalysisException(
+                            f"Table {snowpark_table_name} is not an iceberg table"
+                        )
+                else:
+                    raise AnalysisException(
+                        f"[TABLE_OR_VIEW_NOT_FOUND] Table {snowpark_table_name} does not exist"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "truncate", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="truncate",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_REPLACE:
+                if check_snowflake_table_existence(snowpark_table_name, session):
+                    create_iceberg_table(
+                        snowpark_table_name=snowpark_table_name,
+                        location=write_op.table_properties.get("location"),
+                        schema=input_df.schema,
+                        snowpark_session=session,
+                        mode="replace",
+                    )
+                else:
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} does not exist"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "replace", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="append",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
+                create_iceberg_table(
+                    snowpark_table_name=snowpark_table_name,
+                    location=write_op.table_properties.get("location"),
+                    schema=input_df.schema,
+                    snowpark_session=session,
+                    mode="create_or_replace",
+                )
+                _validate_schema_and_get_writer(
+                    input_df, "create_or_replace", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="append",
+                    column_order=_column_order_for_write,
+                )
+            case _:
+                raise SnowparkConnectNotImplementedError(
+                    f"Write mode {commands_proto.WriteOperationV2.Mode.Name(write_op.mode)} is not supported"
+                )
     else:
-
-
-
-
-
-
-
+        match write_op.mode:
+            case commands_proto.WriteOperationV2.MODE_CREATE:
+                _validate_schema_and_get_writer(
+                    input_df, "errorifexists", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="errorifexists",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_APPEND:
+                if not check_snowflake_table_existence(snowpark_table_name, session):
+                    raise AnalysisException(
+                        f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{write_op.table_name}` cannot be found."
+                    )
+                if check_table_type(snowpark_table_name, session) != "TABLE":
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} is not a FDN table"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "append", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="append",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_OVERWRITE | commands_proto.WriteOperationV2.MODE_OVERWRITE_PARTITIONS:
+                # TODO: handle the filter condition for MODE_OVERWRITE
+                if check_snowflake_table_existence(snowpark_table_name, session):
+                    if check_table_type(snowpark_table_name, session) != "TABLE":
+                        raise AnalysisException(
+                            f"Table {snowpark_table_name} is not a FDN table"
+                        )
+                else:
+                    raise AnalysisException(
+                        f"[TABLE_OR_VIEW_NOT_FOUND] Table {snowpark_table_name} does not exist"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "truncate", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="truncate",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_REPLACE:
+                if not check_snowflake_table_existence(snowpark_table_name, session):
+                    raise AnalysisException(
+                        f"Table {snowpark_table_name} does not exist"
+                    )
+                _validate_schema_and_get_writer(
+                    input_df, "replace", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="overwrite",
+                    column_order=_column_order_for_write,
+                )
+            case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
+                _validate_schema_and_get_writer(
+                    input_df, "create_or_replace", snowpark_table_name
+                ).saveAsTable(
+                    table_name=snowpark_table_name,
+                    mode="overwrite",
+                    column_order=_column_order_for_write,
+                )
+            case _:
+                raise SnowparkConnectNotImplementedError(
+                    f"Write mode {commands_proto.WriteOperationV2.Mode.Name(write_op.mode)} is not supported"
+                )


 def _validate_schema_and_get_writer(
     input_df: snowpark.DataFrame, write_mode: str, snowpark_table_name: str
 ) -> snowpark.DataFrameWriter:
-    if write_mode
+    if write_mode is not None and write_mode.lower() in (
+        "replace",
+        "create_or_replace",
+    ):
         return input_df.write

     table_schema = None

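A quick reference distilled from the non-iceberg (FDN) branch of the new V2 dispatch above: whether the target table must already exist, and which Snowpark saveAsTable mode ends up being used per V2 mode (illustrative sketch only; enum members written as strings):

    FDN_V2_BEHAVIOR = {
        "MODE_CREATE": (False, "errorifexists"),
        "MODE_APPEND": (True, "append"),
        "MODE_OVERWRITE": (True, "truncate"),
        "MODE_OVERWRITE_PARTITIONS": (True, "truncate"),
        "MODE_REPLACE": (True, "overwrite"),
        "MODE_CREATE_OR_REPLACE": (False, "overwrite"),
    }
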
@@ -484,6 +674,7 @@ def create_iceberg_table(
     location: str,
     schema: StructType,
     snowpark_session: snowpark.Session,
+    mode: str = "create",
 ):
     table_schema = [
         f"{spark_to_sf_single_id(unquote_if_quoted(field.name), is_column = True)} {snowpark_to_iceberg_type(field.datatype)}"

@@ -506,8 +697,20 @@ def create_iceberg_table(
         else f"EXTERNAL_VOLUME = '{config_external_volume}'"
     )

+    match mode:
+        case "create":
+            create_sql = "CREATE"
+        case "replace":
+            # There's no replace for iceberg table, so we use create or replace
+            create_sql = "CREATE OR REPLACE"
+        case "create_or_replace":
+            create_sql = "CREATE OR REPLACE"
+        case _:
+            raise SnowparkConnectNotImplementedError(
+                f"Write mode {mode} is not supported for iceberg table"
+            )
     sql = f"""
-
+    {create_sql} ICEBERG TABLE {snowpark_table_name} ({",".join(table_schema)})
     CATALOG = 'SNOWFLAKE'
     {external_volume}
     {base_location};

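An illustrative rendering of the DDL template above for a hypothetical table; the table name, column definitions, external volume, and base location below are made up, and the real statement is assembled from the Snowpark schema and session configuration.

    snowpark_table_name = "MY_DB.MY_SCHEMA.EVENTS"      # hypothetical
    table_schema = ['"ID" long', '"NAME" string']       # hypothetical column list
    external_volume = "EXTERNAL_VOLUME = 'MY_VOLUME'"   # hypothetical
    base_location = "BASE_LOCATION = 'events/'"         # hypothetical
    create_sql = "CREATE OR REPLACE"                    # as selected by the match above

    sql = f"""
    {create_sql} ICEBERG TABLE {snowpark_table_name} ({",".join(table_schema)})
    CATALOG = 'SNOWFLAKE'
    {external_volume}
    {base_location};
    """
    print(sql)
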
@@ -584,7 +787,7 @@ def _truncate_directory(directory_path: Path) -> None:
             shutil.rmtree(file)


-def
+def check_snowflake_table_existence(
     snowpark_table_name: str,
     snowpark_session: snowpark.Session,
 ):

@@ -593,3 +796,19 @@ def check_snowflake_table_existance(
         return True
     except Exception:
         return False
+
+
+def check_table_type(
+    snowpark_table_name: str,
+    snowpark_session: snowpark.Session,
+) -> str:
+    # currently we only support iceberg table and FDN table
+    metadata = snowpark_session.sql(
+        f"SHOW TABLES LIKE '{unquote_if_quoted(snowpark_table_name)}';"
+    ).collect()
+    if metadata is None or len(metadata) == 0:
+        raise AnalysisException(f"Table {snowpark_table_name} does not exist")
+    metadata = metadata[0]
+    if metadata.as_dict().get("is_iceberg") == "Y":
+        return "ICEBERG"
+    return "TABLE"