snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +717 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +309 -26
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/error_utils.py +28 -0
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +224 -15
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +86 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
- snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
- snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +171 -48
- snowflake/snowpark_connect/server.py +528 -473
- snowflake/snowpark_connect/server_common/__init__.py +503 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/type_support.py +130 -0
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +195 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +192 -40
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
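
The expanded file diff below appears to correspond to snowflake/snowpark_connect/relation/write/map_write.py (+929 -319), which maps Spark Connect WriteOperation and WriteOperationV2 commands onto Snowflake writes. For orientation, here is a minimal client-side sketch of the kind of PySpark calls the rewritten path-write branch now handles: the error/errorifexists existence check, overwrite clearing the staged prefix with REMOVE, parquet-only Hive-style partitioning (col=value directories, NULLs written as __HIVE_DEFAULT_PARTITION__), and repartition(n) acting as a partition hint. This is an illustrative sketch only; the connection string, paths, and column names are assumptions, not taken from the package.

# Illustrative client-side usage (hypothetical endpoint and paths; not from the package).
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()  # assumed Spark Connect endpoint
df = spark.createDataFrame(
    [("Sales", "EMEA", 10), ("Sales", None, 7)],
    ["department", "region", "amount"],
)

# errorifexists (the default) now fails fast if the target path already has data.
df.write.mode("errorifexists").parquet("/tmp/out/orders")

# overwrite clears the staged prefix (REMOVE '<stage path>/') before writing.
df.write.mode("overwrite").parquet("/tmp/out/orders")

# Parquet-only Hive-style partitioning: directories like department=Sales/region=EMEA/,
# with NULL partition values written as __HIVE_DEFAULT_PARTITION__.
df.write.partitionBy("department", "region").mode("overwrite").parquet("/tmp/out/by_dept")

# repartition(n) becomes a partition hint; the server may split the write into n files.
df.repartition(4).write.mode("overwrite").csv("/tmp/out/orders_csv")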
|
@@ -2,10 +2,15 @@
|
|
|
2
2
|
# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
|
|
3
3
|
#
|
|
4
4
|
|
|
5
|
+
import copy
|
|
5
6
|
import os
|
|
6
7
|
import shutil
|
|
8
|
+
import uuid
|
|
9
|
+
from contextlib import suppress
|
|
7
10
|
from pathlib import Path
|
|
8
11
|
|
|
12
|
+
import pyarrow as pa
|
|
13
|
+
import pyarrow.parquet as pq
|
|
9
14
|
import pyspark.sql.connect.proto.base_pb2 as proto_base
|
|
10
15
|
import pyspark.sql.connect.proto.commands_pb2 as commands_proto
|
|
11
16
|
from pyspark.errors.exceptions.base import AnalysisException
|
|
@@ -16,7 +21,7 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
|
|
|
16
21
|
unquote_if_quoted,
|
|
17
22
|
)
|
|
18
23
|
from snowflake.snowpark.exceptions import SnowparkSQLException
|
|
19
|
-
from snowflake.snowpark.functions import col, lit, object_construct, sql_expr
|
|
24
|
+
from snowflake.snowpark.functions import col, lit, object_construct, sql_expr, when
|
|
20
25
|
from snowflake.snowpark.types import (
|
|
21
26
|
ArrayType,
|
|
22
27
|
DataType,
|
|
@@ -28,37 +33,57 @@ from snowflake.snowpark.types import (
|
|
|
28
33
|
_NumericType,
|
|
29
34
|
)
|
|
30
35
|
from snowflake.snowpark_connect.config import (
|
|
36
|
+
auto_uppercase_column_identifiers,
|
|
37
|
+
get_parquet_metadata_generation_enabled,
|
|
38
|
+
get_success_file_generation_enabled,
|
|
31
39
|
global_config,
|
|
32
40
|
sessions_config,
|
|
33
41
|
str_to_bool,
|
|
34
42
|
)
|
|
43
|
+
from snowflake.snowpark_connect.constants import SPARK_VERSION
|
|
35
44
|
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
|
|
45
|
+
from snowflake.snowpark_connect.error.error_codes import ErrorCodes
|
|
46
|
+
from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
|
|
36
47
|
from snowflake.snowpark_connect.relation.io_utils import (
|
|
37
48
|
convert_file_prefix_path,
|
|
49
|
+
get_compression_for_source_and_options,
|
|
38
50
|
is_cloud_path,
|
|
39
51
|
)
|
|
40
52
|
from snowflake.snowpark_connect.relation.map_relation import map_relation
|
|
53
|
+
from snowflake.snowpark_connect.relation.read.metadata_utils import (
|
|
54
|
+
without_internal_columns,
|
|
55
|
+
)
|
|
41
56
|
from snowflake.snowpark_connect.relation.read.reader_config import CsvWriterConfig
|
|
42
57
|
from snowflake.snowpark_connect.relation.stage_locator import get_paths_from_stage
|
|
43
58
|
from snowflake.snowpark_connect.relation.utils import (
|
|
44
59
|
generate_spark_compatible_filename,
|
|
45
60
|
random_string,
|
|
46
61
|
)
|
|
47
|
-
from snowflake.snowpark_connect.type_mapping import
|
|
48
|
-
|
|
62
|
+
from snowflake.snowpark_connect.type_mapping import (
|
|
63
|
+
map_pyspark_types_to_pyarrow_types,
|
|
64
|
+
map_snowpark_to_pyspark_types,
|
|
65
|
+
snowpark_to_iceberg_type,
|
|
66
|
+
)
|
|
67
|
+
from snowflake.snowpark_connect.utils.context import get_spark_session_id
|
|
49
68
|
from snowflake.snowpark_connect.utils.identifiers import (
|
|
50
69
|
spark_to_sf_single_id,
|
|
51
70
|
split_fully_qualified_spark_name,
|
|
52
71
|
)
|
|
72
|
+
from snowflake.snowpark_connect.utils.io_utils import get_table_type
|
|
53
73
|
from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
|
|
54
74
|
from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
|
|
55
75
|
from snowflake.snowpark_connect.utils.telemetry import (
|
|
56
76
|
SnowparkConnectNotImplementedError,
|
|
57
77
|
telemetry,
|
|
58
78
|
)
|
|
79
|
+
from snowflake.snowpark_connect.utils.udf_cache import register_cached_sproc
|
|
59
80
|
|
|
60
81
|
_column_order_for_write = "name"
|
|
61
82
|
|
|
83
|
+
# Available values for TARGET_FILE_SIZE
|
|
84
|
+
# reference:https://docs.snowflake.com/en/sql-reference/sql/create-iceberg-table
|
|
85
|
+
TARGET_FILE_SIZE_ACCEPTABLE_VALUES = ("AUTO", "16MB", "32MB", "64MB", "128MB")
|
|
86
|
+
|
|
62
87
|
|
|
63
88
|
# TODO: We will revise/refactor this after changes for all formats are finalized.
|
|
64
89
|
def clean_params(params):
|
|
@@ -109,9 +134,65 @@ def _spark_to_snowflake(multipart_id: str) -> str:
|
|
|
109
134
|
)
|
|
110
135
|
|
|
111
136
|
|
|
137
|
+
def _validate_table_exist_and_of_type(
|
|
138
|
+
snowpark_table_name: str,
|
|
139
|
+
session: snowpark.Session,
|
|
140
|
+
table_type: str,
|
|
141
|
+
table_schema_or_error: DataType | SnowparkSQLException,
|
|
142
|
+
) -> None:
|
|
143
|
+
if not isinstance(table_schema_or_error, DataType):
|
|
144
|
+
exception = AnalysisException(
|
|
145
|
+
f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{snowpark_table_name}` cannot be found."
|
|
146
|
+
)
|
|
147
|
+
attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
|
|
148
|
+
raise exception
|
|
149
|
+
_validate_table_type(snowpark_table_name, session, table_type)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _validate_table_type(
|
|
153
|
+
snowpark_table_name: str,
|
|
154
|
+
session: snowpark.Session,
|
|
155
|
+
table_type: str,
|
|
156
|
+
) -> None:
|
|
157
|
+
actual_type = get_table_type(snowpark_table_name, session)
|
|
158
|
+
if table_type == "iceberg":
|
|
159
|
+
if actual_type not in ("ICEBERG", "TABLE"):
|
|
160
|
+
exception = AnalysisException(
|
|
161
|
+
f"Table {snowpark_table_name} is not an iceberg table"
|
|
162
|
+
)
|
|
163
|
+
attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
|
|
164
|
+
raise exception
|
|
165
|
+
elif table_type == "fdn":
|
|
166
|
+
if actual_type not in ("NORMAL", "TABLE"):
|
|
167
|
+
exception = AnalysisException(
|
|
168
|
+
f"Table {snowpark_table_name} is not a FDN table"
|
|
169
|
+
)
|
|
170
|
+
attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
|
|
171
|
+
raise exception
|
|
172
|
+
else:
|
|
173
|
+
raise ValueError(
|
|
174
|
+
f"Invalid table_type: {table_type}. Must be 'iceberg' or 'fdn'"
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _validate_table_does_not_exist(
|
|
179
|
+
snowpark_table_name: str,
|
|
180
|
+
table_schema_or_error: DataType | SnowparkSQLException,
|
|
181
|
+
) -> None:
|
|
182
|
+
if isinstance(table_schema_or_error, DataType):
|
|
183
|
+
exception = AnalysisException(f"Table {snowpark_table_name} already exists")
|
|
184
|
+
attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
|
|
185
|
+
raise exception
|
|
186
|
+
|
|
187
|
+
|
|
112
188
|
def map_write(request: proto_base.ExecutePlanRequest):
|
|
113
189
|
write_op = request.plan.command.write_operation
|
|
114
190
|
telemetry.report_io_write(write_op.source)
|
|
191
|
+
if write_op.path and write_op.options.get("path"):
|
|
192
|
+
raise AnalysisException(
|
|
193
|
+
"There is a 'path' option set and save() is called with a path parameter. "
|
|
194
|
+
"Either remove the path option, or call save() without the parameter."
|
|
195
|
+
)
|
|
115
196
|
|
|
116
197
|
write_mode = None
|
|
117
198
|
match write_op.mode:
|
|
@@ -125,9 +206,30 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
125
206
|
write_mode = "ignore"
|
|
126
207
|
|
|
127
208
|
result = map_relation(write_op.input)
|
|
128
|
-
input_df
|
|
209
|
+
input_df, snowpark_column_names = handle_column_names(result, write_op.source)
|
|
210
|
+
|
|
211
|
+
# Create updated container with transformed dataframe, then filter METADATA$FILENAME columns
|
|
212
|
+
updated_result = DataFrameContainer.create_with_column_mapping(
|
|
213
|
+
dataframe=input_df,
|
|
214
|
+
spark_column_names=result.column_map.get_spark_columns(),
|
|
215
|
+
snowpark_column_names=snowpark_column_names,
|
|
216
|
+
column_metadata=result.column_map.column_metadata,
|
|
217
|
+
column_qualifiers=result.column_map.get_qualifiers(),
|
|
218
|
+
parent_column_name_map=result.column_map.get_parent_column_name_map(),
|
|
219
|
+
table_name=result.table_name,
|
|
220
|
+
alias=result.alias,
|
|
221
|
+
partition_hint=result.partition_hint,
|
|
222
|
+
)
|
|
223
|
+
updated_result = without_internal_columns(updated_result)
|
|
224
|
+
input_df = updated_result.dataframe
|
|
225
|
+
|
|
129
226
|
session: snowpark.Session = get_or_create_snowpark_session()
|
|
130
227
|
|
|
228
|
+
# Check for partition hint early to determine precedence over single option
|
|
229
|
+
partition_hint = (
|
|
230
|
+
result.partition_hint if hasattr(result, "partition_hint") else None
|
|
231
|
+
)
|
|
232
|
+
|
|
131
233
|
# Snowflake saveAsTable doesn't support format
|
|
132
234
|
if (
|
|
133
235
|
write_op.HasField("table")
|
|
@@ -150,15 +252,59 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
150
252
|
max_file_size = 1073741824
|
|
151
253
|
match write_op.source:
|
|
152
254
|
case "csv" | "parquet" | "json" | "text":
|
|
255
|
+
if write_mode == "ignore":
|
|
256
|
+
exception = SnowparkConnectNotImplementedError(
|
|
257
|
+
f"Write mode {write_mode} is not supported for {write_op.source}"
|
|
258
|
+
)
|
|
259
|
+
attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
|
|
260
|
+
raise exception
|
|
261
|
+
|
|
153
262
|
write_path = get_paths_from_stage(
|
|
154
263
|
[write_op.path],
|
|
155
264
|
session=session,
|
|
156
265
|
)[0]
|
|
266
|
+
|
|
267
|
+
# Handle error/errorifexists mode - check if file exists before writing
|
|
268
|
+
if write_mode in (None, "error", "errorifexists"):
|
|
269
|
+
is_local_path = not is_cloud_path(write_op.path)
|
|
270
|
+
|
|
271
|
+
if is_local_path:
|
|
272
|
+
# Check if local path exists
|
|
273
|
+
if os.path.exists(write_op.path) and (
|
|
274
|
+
os.path.isfile(write_op.path)
|
|
275
|
+
or (os.path.isdir(write_op.path) and os.listdir(write_op.path))
|
|
276
|
+
):
|
|
277
|
+
exception = AnalysisException(
|
|
278
|
+
f"Path {write_op.path} already exists."
|
|
279
|
+
)
|
|
280
|
+
attach_custom_error_code(
|
|
281
|
+
exception, ErrorCodes.INVALID_OPERATION
|
|
282
|
+
)
|
|
283
|
+
raise exception
|
|
284
|
+
else:
|
|
285
|
+
# Check if stage/cloud path exists by listing files
|
|
286
|
+
# If the path does not exist, SnowparkSQLException is suppressed (expected for error mode).
|
|
287
|
+
with suppress(SnowparkSQLException):
|
|
288
|
+
# TODO: Optimize this check by using a more efficient way to check if the path exists.
|
|
289
|
+
list_command = f"LIST '{write_path}/'"
|
|
290
|
+
result = session.sql(list_command).collect()
|
|
291
|
+
if result:
|
|
292
|
+
exception = AnalysisException(
|
|
293
|
+
f"Path {write_op.path} already exists."
|
|
294
|
+
)
|
|
295
|
+
attach_custom_error_code(
|
|
296
|
+
exception, ErrorCodes.INVALID_OPERATION
|
|
297
|
+
)
|
|
298
|
+
raise exception
|
|
299
|
+
|
|
157
300
|
# Generate Spark-compatible filename with proper extension
|
|
158
301
|
extension = write_op.source if write_op.source != "text" else "txt"
|
|
159
302
|
|
|
160
|
-
|
|
161
|
-
|
|
303
|
+
compression = get_compression_for_source_and_options(
|
|
304
|
+
write_op.source, write_op.options, from_read=False
|
|
305
|
+
)
|
|
306
|
+
if compression is not None:
|
|
307
|
+
write_op.options["compression"] = compression
|
|
162
308
|
|
|
163
309
|
# Generate Spark-compatible filename or prefix
|
|
164
310
|
# we need a random prefix to support "append" mode
|
|
@@ -169,27 +315,18 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
169
315
|
)
|
|
170
316
|
|
|
171
317
|
if overwrite:
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
logger.warning(
|
|
178
|
-
f"Skipping REMOVE for root path {write_path} - too broad scope"
|
|
179
|
-
)
|
|
180
|
-
else:
|
|
181
|
-
remove_command = f"REMOVE {write_path}/"
|
|
182
|
-
session.sql(remove_command).collect()
|
|
183
|
-
logger.info(f"Successfully cleared directory: {write_path}")
|
|
184
|
-
except Exception as e:
|
|
185
|
-
logger.warning(f"Could not clear directory {write_path}: {e}")
|
|
318
|
+
# Trailing slash is required as calling remove with just write_path would remove everything in the
|
|
319
|
+
# stage path with the same prefix.
|
|
320
|
+
remove_command = f"REMOVE '{write_path}/'"
|
|
321
|
+
session.sql(remove_command).collect()
|
|
322
|
+
logger.info(f"Successfully cleared directory: {write_path}")
|
|
186
323
|
|
|
187
|
-
if should_write_to_single_file:
|
|
324
|
+
if should_write_to_single_file and partition_hint is None:
|
|
188
325
|
# Single file: generate complete filename with extension
|
|
189
326
|
spark_filename = generate_spark_compatible_filename(
|
|
190
327
|
task_id=0,
|
|
191
328
|
attempt_number=0,
|
|
192
|
-
compression=
|
|
329
|
+
compression=compression,
|
|
193
330
|
format_ext=extension,
|
|
194
331
|
)
|
|
195
332
|
temp_file_prefix_on_stage = f"{write_path}/{spark_filename}"
|
|
@@ -198,15 +335,11 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
198
335
|
spark_filename_prefix = generate_spark_compatible_filename(
|
|
199
336
|
task_id=0,
|
|
200
337
|
attempt_number=0,
|
|
201
|
-
compression=
|
|
338
|
+
compression=None,
|
|
202
339
|
format_ext="", # No extension for prefix
|
|
203
340
|
)
|
|
204
341
|
temp_file_prefix_on_stage = f"{write_path}/{spark_filename_prefix}"
|
|
205
342
|
|
|
206
|
-
default_compression = "NONE" if write_op.source != "parquet" else "snappy"
|
|
207
|
-
compression = write_op.options.get(
|
|
208
|
-
"compression", default_compression
|
|
209
|
-
).upper()
|
|
210
343
|
parameters = {
|
|
211
344
|
"location": temp_file_prefix_on_stage,
|
|
212
345
|
"file_format_type": write_op.source
|
|
@@ -215,13 +348,10 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
215
348
|
"format_type_options": {
|
|
216
349
|
"COMPRESSION": compression,
|
|
217
350
|
},
|
|
218
|
-
"overwrite": overwrite,
|
|
219
351
|
}
|
|
220
|
-
#
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
# Check for partition hint early to determine precedence over single option
|
|
224
|
-
partition_hint = result.partition_hint
|
|
352
|
+
# Download from the base write path to ensure we fetch whatever Snowflake produced.
|
|
353
|
+
# Using the base avoids coupling to exact filenames/prefixes.
|
|
354
|
+
download_stage_path = write_path
|
|
225
355
|
|
|
226
356
|
# Apply max_file_size for both single and multi-file scenarios
|
|
227
357
|
# This helps control when Snowflake splits files into multiple parts
|
|
@@ -234,16 +364,26 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
234
364
|
get_param_from_options(parameters, write_op.options, write_op.source)
|
|
235
365
|
if write_op.partitioning_columns:
|
|
236
366
|
if write_op.source != "parquet":
|
|
237
|
-
|
|
367
|
+
exception = SnowparkConnectNotImplementedError(
|
|
238
368
|
"Partitioning is only supported for parquet format"
|
|
239
369
|
)
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
raise SnowparkConnectNotImplementedError(
|
|
243
|
-
"Multiple partitioning columns are not yet supported"
|
|
370
|
+
attach_custom_error_code(
|
|
371
|
+
exception, ErrorCodes.UNSUPPORTED_OPERATION
|
|
244
372
|
)
|
|
245
|
-
|
|
246
|
-
|
|
373
|
+
raise exception
|
|
374
|
+
# Build Spark-style directory structure: col1=value1/col2=value2/...
|
|
375
|
+
# Example produced expression (Snowflake SQL):
|
|
376
|
+
# 'department=' || TO_VARCHAR("department") || '/' || 'region=' || TO_VARCHAR("region")
|
|
377
|
+
partitioning_column_names = list(write_op.partitioning_columns)
|
|
378
|
+
partition_expr_parts: list[str] = []
|
|
379
|
+
for col_name in partitioning_column_names:
|
|
380
|
+
quoted = f'"{col_name}"'
|
|
381
|
+
segment = f"'{col_name}=' || COALESCE(TO_VARCHAR({quoted}), '__HIVE_DEFAULT_PARTITION__')"
|
|
382
|
+
partition_expr_parts.append(segment)
|
|
383
|
+
parameters["partition_by"] = " || '/' || ".join(partition_expr_parts)
|
|
384
|
+
# When using PARTITION BY, Snowflake writes into subdirectories under the base path.
|
|
385
|
+
# Download from the base write path to preserve partition directories locally.
|
|
386
|
+
download_stage_path = write_path
|
|
247
387
|
|
|
248
388
|
# If a partition hint is present (from DataFrame.repartition(n)), optionally split the
|
|
249
389
|
# write into n COPY INTO calls by assigning a synthetic partition id. Controlled by config.
|
|
@@ -267,15 +407,20 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
267
407
|
# Execute multiple COPY INTO operations, one per target file.
|
|
268
408
|
# Since we write per-partition with distinct prefixes, download from the base write path.
|
|
269
409
|
download_stage_path = write_path
|
|
410
|
+
|
|
411
|
+
# We need to create a new set of parameters with single=True
|
|
412
|
+
shared_uuid = str(uuid.uuid4())
|
|
413
|
+
part_params = copy.deepcopy(dict(parameters))
|
|
414
|
+
part_params["single"] = True
|
|
270
415
|
for part_idx in range(partition_hint):
|
|
271
|
-
part_params = dict(parameters)
|
|
272
416
|
# Preserve Spark-like filename prefix per partition so downloaded basenames
|
|
273
417
|
# match the expected Spark pattern (with possible Snowflake counters appended).
|
|
274
418
|
per_part_prefix = generate_spark_compatible_filename(
|
|
275
419
|
task_id=part_idx,
|
|
276
420
|
attempt_number=0,
|
|
277
|
-
compression=
|
|
278
|
-
format_ext=
|
|
421
|
+
compression=compression,
|
|
422
|
+
format_ext=extension,
|
|
423
|
+
shared_uuid=shared_uuid,
|
|
279
424
|
)
|
|
280
425
|
part_params["location"] = f"{write_path}/{per_part_prefix}"
|
|
281
426
|
(
|
|
@@ -285,13 +430,25 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
285
430
|
)
|
|
286
431
|
else:
|
|
287
432
|
rewritten_df.write.copy_into_location(**parameters)
|
|
288
|
-
|
|
433
|
+
|
|
434
|
+
is_local_path = not is_cloud_path(write_op.path)
|
|
435
|
+
if is_local_path:
|
|
289
436
|
store_files_locally(
|
|
290
437
|
download_stage_path,
|
|
291
438
|
write_op.path,
|
|
292
439
|
overwrite,
|
|
293
440
|
session,
|
|
294
441
|
)
|
|
442
|
+
|
|
443
|
+
_generate_metadata_files(
|
|
444
|
+
write_op.source,
|
|
445
|
+
write_op.path,
|
|
446
|
+
download_stage_path,
|
|
447
|
+
input_df.schema,
|
|
448
|
+
session,
|
|
449
|
+
parameters,
|
|
450
|
+
is_local_path,
|
|
451
|
+
)
|
|
295
452
|
case "jdbc":
|
|
296
453
|
from snowflake.snowpark_connect.relation.write.map_write_jdbc import (
|
|
297
454
|
map_write_jdbc,
|
|
@@ -308,48 +465,75 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
308
465
|
else write_op.table.table_name
|
|
309
466
|
)
|
|
310
467
|
snowpark_table_name = _spark_to_snowflake(table_name)
|
|
468
|
+
partition_cols = (
|
|
469
|
+
write_op.partitioning_columns if write_op.partitioning_columns else None
|
|
470
|
+
)
|
|
311
471
|
|
|
312
472
|
match write_mode:
|
|
313
473
|
case None | "error" | "errorifexists":
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
474
|
+
table_schema_or_error = _get_table_schema_or_error(
|
|
475
|
+
snowpark_table_name, session
|
|
476
|
+
)
|
|
477
|
+
_validate_table_does_not_exist(
|
|
478
|
+
snowpark_table_name, table_schema_or_error
|
|
479
|
+
)
|
|
318
480
|
create_iceberg_table(
|
|
319
481
|
snowpark_table_name=snowpark_table_name,
|
|
320
482
|
location=write_op.options.get("location", None),
|
|
321
483
|
schema=input_df.schema,
|
|
322
484
|
snowpark_session=session,
|
|
485
|
+
partition_by=partition_cols,
|
|
486
|
+
target_file_size=write_op.options.get(
|
|
487
|
+
"write.target-file-size", None
|
|
488
|
+
),
|
|
323
489
|
)
|
|
324
490
|
_validate_schema_and_get_writer(
|
|
325
|
-
input_df, "append", snowpark_table_name
|
|
491
|
+
input_df, "append", snowpark_table_name, table_schema_or_error
|
|
326
492
|
).saveAsTable(
|
|
327
493
|
table_name=snowpark_table_name,
|
|
328
494
|
mode="append",
|
|
329
495
|
column_order=_column_order_for_write,
|
|
330
496
|
)
|
|
331
497
|
case "append":
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
498
|
+
table_schema_or_error = _get_table_schema_or_error(
|
|
499
|
+
snowpark_table_name, session
|
|
500
|
+
)
|
|
501
|
+
if isinstance(table_schema_or_error, DataType): # Table exists
|
|
502
|
+
_validate_table_type(snowpark_table_name, session, "iceberg")
|
|
503
|
+
else:
|
|
504
|
+
create_iceberg_table(
|
|
505
|
+
snowpark_table_name=snowpark_table_name,
|
|
506
|
+
location=write_op.options.get("location", None),
|
|
507
|
+
schema=input_df.schema,
|
|
508
|
+
snowpark_session=session,
|
|
509
|
+
partition_by=partition_cols,
|
|
510
|
+
target_file_size=write_op.options.get(
|
|
511
|
+
"write.target-file-size", None
|
|
512
|
+
),
|
|
513
|
+
)
|
|
337
514
|
_validate_schema_and_get_writer(
|
|
338
|
-
input_df, "append", snowpark_table_name
|
|
515
|
+
input_df, "append", snowpark_table_name, table_schema_or_error
|
|
339
516
|
).saveAsTable(
|
|
340
517
|
table_name=snowpark_table_name,
|
|
341
518
|
mode="append",
|
|
342
519
|
column_order=_column_order_for_write,
|
|
343
520
|
)
|
|
344
521
|
case "ignore":
|
|
345
|
-
|
|
522
|
+
table_schema_or_error = _get_table_schema_or_error(
|
|
346
523
|
snowpark_table_name, session
|
|
347
|
-
)
|
|
524
|
+
)
|
|
525
|
+
if not isinstance(
|
|
526
|
+
table_schema_or_error, DataType
|
|
527
|
+
): # Table not exists
|
|
348
528
|
create_iceberg_table(
|
|
349
529
|
snowpark_table_name=snowpark_table_name,
|
|
350
530
|
location=write_op.options.get("location", None),
|
|
351
531
|
schema=input_df.schema,
|
|
352
532
|
snowpark_session=session,
|
|
533
|
+
partition_by=partition_cols,
|
|
534
|
+
target_file_size=write_op.options.get(
|
|
535
|
+
"write.target-file-size", None
|
|
536
|
+
),
|
|
353
537
|
)
|
|
354
538
|
_validate_schema_and_get_writer(
|
|
355
539
|
input_df, "append", snowpark_table_name
|
|
@@ -359,67 +543,108 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
359
543
|
column_order=_column_order_for_write,
|
|
360
544
|
)
|
|
361
545
|
case "overwrite":
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
546
|
+
table_schema_or_error = _get_table_schema_or_error(
|
|
547
|
+
snowpark_table_name, session
|
|
548
|
+
)
|
|
549
|
+
if isinstance(table_schema_or_error, DataType): # Table exists
|
|
550
|
+
_validate_table_type(snowpark_table_name, session, "iceberg")
|
|
551
|
+
create_iceberg_table(
|
|
552
|
+
snowpark_table_name=snowpark_table_name,
|
|
553
|
+
location=write_op.options.get("location", None),
|
|
554
|
+
schema=input_df.schema,
|
|
555
|
+
snowpark_session=session,
|
|
556
|
+
mode="replace",
|
|
557
|
+
partition_by=partition_cols,
|
|
558
|
+
target_file_size=write_op.options.get(
|
|
559
|
+
"write.target-file-size", None
|
|
560
|
+
),
|
|
561
|
+
)
|
|
369
562
|
else:
|
|
370
563
|
create_iceberg_table(
|
|
371
564
|
snowpark_table_name=snowpark_table_name,
|
|
372
565
|
location=write_op.options.get("location", None),
|
|
373
566
|
schema=input_df.schema,
|
|
374
567
|
snowpark_session=session,
|
|
568
|
+
mode="create",
|
|
569
|
+
partition_by=partition_cols,
|
|
570
|
+
target_file_size=write_op.options.get(
|
|
571
|
+
"write.target-file-size", None
|
|
572
|
+
),
|
|
375
573
|
)
|
|
376
|
-
|
|
377
|
-
input_df, "truncate", snowpark_table_name
|
|
378
|
-
).saveAsTable(
|
|
574
|
+
_get_writer_for_table_creation(input_df).saveAsTable(
|
|
379
575
|
table_name=snowpark_table_name,
|
|
380
|
-
mode="
|
|
576
|
+
mode="append",
|
|
381
577
|
column_order=_column_order_for_write,
|
|
382
578
|
)
|
|
383
579
|
case _:
|
|
384
|
-
|
|
580
|
+
exception = SnowparkConnectNotImplementedError(
|
|
385
581
|
f"Write mode {write_mode} is not supported"
|
|
386
582
|
)
|
|
583
|
+
attach_custom_error_code(
|
|
584
|
+
exception, ErrorCodes.UNSUPPORTED_OPERATION
|
|
585
|
+
)
|
|
586
|
+
raise exception
|
|
387
587
|
case _:
|
|
388
588
|
snowpark_table_name = _spark_to_snowflake(write_op.table.table_name)
|
|
589
|
+
save_method = write_op.table.save_method
|
|
590
|
+
|
|
591
|
+
if (
|
|
592
|
+
write_op.source == "snowflake"
|
|
593
|
+
and write_op.table.save_method
|
|
594
|
+
== commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_UNSPECIFIED
|
|
595
|
+
):
|
|
596
|
+
save_method = (
|
|
597
|
+
commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE
|
|
598
|
+
)
|
|
599
|
+
if len(write_op.table.table_name) == 0:
|
|
600
|
+
dbtable_name = write_op.options.get("dbtable", "")
|
|
601
|
+
if len(dbtable_name) == 0:
|
|
602
|
+
exception = SnowparkConnectNotImplementedError(
|
|
603
|
+
"Save command is not supported without a table name"
|
|
604
|
+
)
|
|
605
|
+
attach_custom_error_code(
|
|
606
|
+
exception, ErrorCodes.UNSUPPORTED_OPERATION
|
|
607
|
+
)
|
|
608
|
+
raise exception
|
|
609
|
+
else:
|
|
610
|
+
snowpark_table_name = _spark_to_snowflake(dbtable_name)
|
|
389
611
|
|
|
390
612
|
if (
|
|
391
|
-
|
|
613
|
+
save_method
|
|
392
614
|
== commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE
|
|
393
615
|
):
|
|
394
616
|
match write_mode:
|
|
395
617
|
case "overwrite":
|
|
396
|
-
|
|
618
|
+
table_schema_or_error = _get_table_schema_or_error(
|
|
397
619
|
snowpark_table_name, session
|
|
398
|
-
)
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
# ):
|
|
404
|
-
# raise AnalysisException(
|
|
405
|
-
# f"Table {snowpark_table_name} is not a FDN table"
|
|
406
|
-
# )
|
|
407
|
-
write_mode = "truncate"
|
|
620
|
+
)
|
|
621
|
+
if isinstance(table_schema_or_error, DataType): # Table exists
|
|
622
|
+
_validate_table_type(snowpark_table_name, session, "fdn")
|
|
623
|
+
|
|
624
|
+
write_mode = "overwrite"
|
|
408
625
|
_validate_schema_and_get_writer(
|
|
409
|
-
input_df,
|
|
626
|
+
input_df,
|
|
627
|
+
write_mode,
|
|
628
|
+
snowpark_table_name,
|
|
629
|
+
table_schema_or_error,
|
|
410
630
|
).saveAsTable(
|
|
411
631
|
table_name=snowpark_table_name,
|
|
412
632
|
mode=write_mode,
|
|
633
|
+
copy_grants=True,
|
|
413
634
|
column_order=_column_order_for_write,
|
|
414
635
|
)
|
|
415
636
|
case "append":
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
637
|
+
table_schema_or_error = _get_table_schema_or_error(
|
|
638
|
+
snowpark_table_name, session
|
|
639
|
+
)
|
|
640
|
+
if isinstance(table_schema_or_error, DataType): # Table exists
|
|
641
|
+
_validate_table_type(snowpark_table_name, session, "fdn")
|
|
642
|
+
|
|
421
643
|
_validate_schema_and_get_writer(
|
|
422
|
-
input_df,
|
|
644
|
+
input_df,
|
|
645
|
+
write_mode,
|
|
646
|
+
snowpark_table_name,
|
|
647
|
+
table_schema_or_error,
|
|
423
648
|
).saveAsTable(
|
|
424
649
|
table_name=snowpark_table_name,
|
|
425
650
|
mode=write_mode,
|
|
@@ -434,7 +659,7 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
434
659
|
column_order=_column_order_for_write,
|
|
435
660
|
)
|
|
436
661
|
elif (
|
|
437
|
-
|
|
662
|
+
save_method
|
|
438
663
|
== commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO
|
|
439
664
|
):
|
|
440
665
|
_validate_schema_and_get_writer(
|
|
@@ -445,9 +670,11 @@ def map_write(request: proto_base.ExecutePlanRequest):
|
|
|
445
670
|
column_order=_column_order_for_write,
|
|
446
671
|
)
|
|
447
672
|
else:
|
|
448
|
-
|
|
449
|
-
f"Save command not supported: {
|
|
673
|
+
exception = SnowparkConnectNotImplementedError(
|
|
674
|
+
f"Save command not supported: {save_method}"
|
|
450
675
|
)
|
|
676
|
+
attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
|
|
677
|
+
raise exception
|
|
451
678
|
|
|
452
679
|
|
|
453
680
|
def map_write_v2(request: proto_base.ExecutePlanRequest):
|
|
@@ -455,212 +682,252 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):
|
|
|
455
682
|
|
|
456
683
|
snowpark_table_name = _spark_to_snowflake(write_op.table_name)
|
|
457
684
|
result = map_relation(write_op.input)
|
|
458
|
-
input_df
|
|
685
|
+
input_df, snowpark_column_names = handle_column_names(result, "table")
|
|
686
|
+
|
|
687
|
+
# Create updated container with transformed dataframe, then filter METADATA$FILENAME columns
|
|
688
|
+
updated_result = DataFrameContainer.create_with_column_mapping(
|
|
689
|
+
dataframe=input_df,
|
|
690
|
+
spark_column_names=result.column_map.get_spark_columns(),
|
|
691
|
+
snowpark_column_names=snowpark_column_names,
|
|
692
|
+
column_metadata=result.column_map.column_metadata,
|
|
693
|
+
column_qualifiers=result.column_map.get_qualifiers(),
|
|
694
|
+
parent_column_name_map=result.column_map.get_parent_column_name_map(),
|
|
695
|
+
table_name=result.table_name,
|
|
696
|
+
alias=result.alias,
|
|
697
|
+
partition_hint=result.partition_hint,
|
|
698
|
+
)
|
|
699
|
+
updated_result = without_internal_columns(updated_result)
|
|
700
|
+
input_df = updated_result.dataframe
|
|
701
|
+
|
|
459
702
|
session: snowpark.Session = get_or_create_snowpark_session()
|
|
460
703
|
|
|
461
704
|
if write_op.table_name is None or write_op.table_name == "":
|
|
462
|
-
|
|
705
|
+
exception = SnowparkConnectNotImplementedError(
|
|
463
706
|
"Write operation V2 only support table writing now"
|
|
464
707
|
)
|
|
708
|
+
attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
|
|
709
|
+
raise exception
|
|
710
|
+
|
|
711
|
+
is_iceberg = write_op.provider.lower() == "iceberg"
|
|
712
|
+
table_type = "iceberg" if is_iceberg else "fdn"
|
|
713
|
+
partition_cols = (
|
|
714
|
+
[
|
|
715
|
+
i.unresolved_attribute.unparsed_identifier
|
|
716
|
+
for i in write_op.partitioning_columns
|
|
717
|
+
]
|
|
718
|
+
if write_op.partitioning_columns
|
|
719
|
+
else None
|
|
720
|
+
)
|
|
465
721
|
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
722
|
+
match write_op.mode:
|
|
723
|
+
case commands_proto.WriteOperationV2.MODE_CREATE:
|
|
724
|
+
table_schema_or_error = _get_table_schema_or_error(
|
|
725
|
+
snowpark_table_name, session
|
|
726
|
+
)
|
|
727
|
+
_validate_table_does_not_exist(snowpark_table_name, table_schema_or_error)
|
|
728
|
+
|
|
729
|
+
if is_iceberg:
|
|
473
730
|
create_iceberg_table(
|
|
474
731
|
snowpark_table_name=snowpark_table_name,
|
|
475
732
|
location=write_op.table_properties.get("location"),
|
|
476
733
|
schema=input_df.schema,
|
|
477
734
|
snowpark_session=session,
|
|
735
|
+
partition_by=partition_cols,
|
|
736
|
+
target_file_size=write_op.table_properties.get(
|
|
737
|
+
"write.target-file-size", None
|
|
738
|
+
),
|
|
478
739
|
)
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
740
|
+
_get_writer_for_table_creation(input_df).saveAsTable(
|
|
741
|
+
table_name=snowpark_table_name,
|
|
742
|
+
mode="append" if is_iceberg else "errorifexists",
|
|
743
|
+
column_order=_column_order_for_write,
|
|
744
|
+
)
|
|
745
|
+
|
|
746
|
+
case commands_proto.WriteOperationV2.MODE_APPEND:
|
|
747
|
+
table_schema_or_error = _get_table_schema_or_error(
|
|
748
|
+
snowpark_table_name, session
|
|
749
|
+
)
|
|
750
|
+
_validate_table_exist_and_of_type(
|
|
751
|
+
snowpark_table_name, session, table_type, table_schema_or_error
|
|
752
|
+
)
|
|
753
|
+
_validate_schema_and_get_writer(
|
|
754
|
+
input_df, "append", snowpark_table_name, table_schema_or_error
|
|
755
|
+
).saveAsTable(
|
|
756
|
+
table_name=snowpark_table_name,
|
|
757
|
+
mode="append",
|
|
758
|
+
column_order=_column_order_for_write,
|
|
759
|
+
)
|
|
760
|
+
|
|
761
|
+
case commands_proto.WriteOperationV2.MODE_OVERWRITE | commands_proto.WriteOperationV2.MODE_OVERWRITE_PARTITIONS:
|
|
762
|
+
# TODO: handle the filter condition for MODE_OVERWRITE
|
|
763
|
+
table_schema_or_error = _get_table_schema_or_error(
|
|
764
|
+
snowpark_table_name, session
|
|
765
|
+
)
|
|
766
|
+
_validate_table_exist_and_of_type(
|
|
767
|
+
snowpark_table_name, session, table_type, table_schema_or_error
|
|
768
|
+
)
|
|
769
|
+
|
|
770
|
+
if is_iceberg:
|
|
771
|
+
create_iceberg_table(
|
|
772
|
+
snowpark_table_name=snowpark_table_name,
|
|
773
|
+
location=write_op.options.get("location", None),
|
|
774
|
+
schema=input_df.schema,
|
|
775
|
+
snowpark_session=session,
|
|
776
|
+
mode="replace",
|
|
777
|
+
partition_by=partition_cols,
|
|
778
|
+
target_file_size=write_op.table_properties.get(
|
|
779
|
+
"write.target-file-size", None
|
|
780
|
+
),
|
|
502
781
|
)
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
# raise AnalysisException(
|
|
509
|
-
# f"Table {snowpark_table_name} is not an iceberg table"
|
|
510
|
-
# )
|
|
511
|
-
pass
|
|
512
|
-
else:
|
|
513
|
-
raise AnalysisException(
|
|
514
|
-
f"[TABLE_OR_VIEW_NOT_FOUND] Table {snowpark_table_name} does not exist"
|
|
515
|
-
)
|
|
516
|
-
_validate_schema_and_get_writer(
|
|
517
|
-
input_df, "truncate", snowpark_table_name
|
|
518
|
-
).saveAsTable(
|
|
519
|
-
table_name=snowpark_table_name,
|
|
520
|
-
mode="truncate",
|
|
521
|
-
column_order=_column_order_for_write,
|
|
782
|
+
writer = _get_writer_for_table_creation(input_df)
|
|
783
|
+
save_mode = "append"
|
|
784
|
+
else:
|
|
785
|
+
writer = _validate_schema_and_get_writer(
|
|
786
|
+
input_df, "overwrite", snowpark_table_name, table_schema_or_error
|
|
522
787
|
)
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
788
|
+
save_mode = "overwrite"
|
|
789
|
+
|
|
790
|
+
writer.saveAsTable(
|
|
791
|
+
table_name=snowpark_table_name,
|
|
792
|
+
mode=save_mode,
|
|
793
|
+
column_order=_column_order_for_write,
|
|
794
|
+
)
|
|
795
|
+
|
|
796
|
+
case commands_proto.WriteOperationV2.MODE_REPLACE:
|
|
797
|
+
table_schema_or_error = _get_table_schema_or_error(
|
|
798
|
+
snowpark_table_name, session
|
|
799
|
+
)
|
|
800
|
+
_validate_table_exist_and_of_type(
|
|
801
|
+
snowpark_table_name, session, table_type, table_schema_or_error
|
|
802
|
+
)
|
|
803
|
+
|
|
804
|
+
if is_iceberg:
|
|
805
|
+
create_iceberg_table(
|
|
806
|
+
snowpark_table_name=snowpark_table_name,
|
|
807
|
+
location=write_op.table_properties.get("location"),
|
|
808
|
+
schema=input_df.schema,
|
|
809
|
+
snowpark_session=session,
|
|
810
|
+
mode="replace",
|
|
811
|
+
partition_by=partition_cols,
|
|
812
|
+
target_file_size=write_op.table_properties.get(
|
|
813
|
+
"write.target-file-size", None
|
|
814
|
+
),
|
|
542
815
|
)
|
|
543
|
-
|
|
816
|
+
save_mode = "append"
|
|
817
|
+
else:
|
|
818
|
+
save_mode = "overwrite"
|
|
819
|
+
|
|
820
|
+
_validate_schema_and_get_writer(
|
|
821
|
+
input_df, "replace", snowpark_table_name, table_schema_or_error
|
|
822
|
+
).saveAsTable(
|
|
823
|
+
table_name=snowpark_table_name,
|
|
824
|
+
mode=save_mode,
|
|
825
|
+
column_order=_column_order_for_write,
|
|
826
|
+
)
|
|
827
|
+
|
|
828
|
+
case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
|
|
829
|
+
if is_iceberg:
|
|
544
830
|
create_iceberg_table(
|
|
545
831
|
snowpark_table_name=snowpark_table_name,
|
|
546
832
|
location=write_op.table_properties.get("location"),
|
|
547
833
|
schema=input_df.schema,
|
|
548
834
|
snowpark_session=session,
|
|
549
835
|
mode="create_or_replace",
|
|
836
|
+
partition_by=partition_cols,
|
|
837
|
+
target_file_size=write_op.table_properties.get(
|
|
838
|
+
"write.target-file-size", None
|
|
839
|
+
),
|
|
550
840
|
)
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
# TODO: SNOW-2299414 Fix the implementation of table type check
|
|
593
|
-
# if check_table_type(snowpark_table_name, session) != "TABLE":
|
|
594
|
-
# raise AnalysisException(
|
|
595
|
-
# f"Table {snowpark_table_name} is not a FDN table"
|
|
596
|
-
# )
|
|
597
|
-
pass
|
|
598
|
-
else:
|
|
599
|
-
raise AnalysisException(
|
|
600
|
-
f"[TABLE_OR_VIEW_NOT_FOUND] Table {snowpark_table_name} does not exist"
|
|
601
|
-
)
|
|
602
|
-
_validate_schema_and_get_writer(
|
|
603
|
-
input_df, "truncate", snowpark_table_name
|
|
604
|
-
).saveAsTable(
|
|
605
|
-
table_name=snowpark_table_name,
|
|
606
|
-
mode="truncate",
|
|
607
|
-
column_order=_column_order_for_write,
|
|
608
|
-
)
|
|
609
|
-
case commands_proto.WriteOperationV2.MODE_REPLACE:
|
|
610
|
-
if not check_snowflake_table_existence(snowpark_table_name, session):
|
|
611
|
-
raise AnalysisException(
|
|
612
|
-
f"Table {snowpark_table_name} does not exist"
|
|
613
|
-
)
|
|
614
|
-
_validate_schema_and_get_writer(
|
|
615
|
-
input_df, "replace", snowpark_table_name
|
|
616
|
-
).saveAsTable(
|
|
617
|
-
table_name=snowpark_table_name,
|
|
618
|
-
mode="overwrite",
|
|
619
|
-
column_order=_column_order_for_write,
|
|
620
|
-
)
|
|
621
|
-
case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
|
|
622
|
-
_validate_schema_and_get_writer(
|
|
623
|
-
input_df, "create_or_replace", snowpark_table_name
|
|
624
|
-
).saveAsTable(
|
|
625
|
-
table_name=snowpark_table_name,
|
|
626
|
-
mode="overwrite",
|
|
627
|
-
column_order=_column_order_for_write,
|
|
628
|
-
)
|
|
629
|
-
case _:
|
|
630
|
-
raise SnowparkConnectNotImplementedError(
|
|
631
|
-
f"Write mode {commands_proto.WriteOperationV2.Mode.Name(write_op.mode)} is not supported"
|
|
632
|
-
)
|
|
841
|
+
save_mode = "append"
|
|
842
|
+
else:
|
|
843
|
+
save_mode = "overwrite"
|
|
844
|
+
|
|
845
|
+
_validate_schema_and_get_writer(
|
|
846
|
+
input_df, "create_or_replace", snowpark_table_name
|
|
847
|
+
).saveAsTable(
|
|
848
|
+
table_name=snowpark_table_name,
|
|
849
|
+
mode=save_mode,
|
|
850
|
+
column_order=_column_order_for_write,
|
|
851
|
+
)
|
|
852
|
+
|
|
853
|
+
case _:
|
|
854
|
+
exception = SnowparkConnectNotImplementedError(
|
|
855
|
+
f"Write mode {commands_proto.WriteOperationV2.Mode.Name(write_op.mode)} is not supported"
|
|
856
|
+
)
|
|
857
|
+
attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
|
|
858
|
+
raise exception
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
def _get_table_schema_or_error(
|
|
862
|
+
snowpark_table_name: str, snowpark_session: snowpark.Session
|
|
863
|
+
) -> DataType | SnowparkSQLException:
|
|
864
|
+
try:
|
|
865
|
+
return snowpark_session.table(snowpark_table_name).schema
|
|
866
|
+
except SnowparkSQLException as e:
|
|
867
|
+
return e
|
|
868
|
+
|
|
869
|
+
|
|
870
|
+
def _get_writer_for_table_creation(df: snowpark.DataFrame) -> snowpark.DataFrameWriter:
|
|
871
|
+
# When creating a new table, if case sensitivity is not enabled, we need to rename the columns
|
|
872
|
+
# to upper case so they are case-insensitive in Snowflake.
|
|
873
|
+
if auto_uppercase_column_identifiers():
|
|
874
|
+
for field in df.schema.fields:
|
|
875
|
+
col_name = field.name
|
|
876
|
+
# Uppercasing is fine, regardless of whether the original name was quoted or not.
|
|
877
|
+
# In Snowflake these are equivalent "COL" == COL == col == coL
|
|
878
|
+
uppercased_name = col_name.upper()
|
|
879
|
+
if col_name != uppercased_name:
|
|
880
|
+
df = df.withColumnRenamed(col_name, uppercased_name)
|
|
881
|
+
return df.write
|
|
633
882
|
|
|
634
883
|
|
|
635
884
|
def _validate_schema_and_get_writer(
|
|
636
|
-
input_df: snowpark.DataFrame,
|
|
885
|
+
input_df: snowpark.DataFrame,
|
|
886
|
+
write_mode: str,
|
|
887
|
+
snowpark_table_name: str,
|
|
888
|
+
table_schema_or_error: DataType | SnowparkSQLException | None = None,
|
|
637
889
|
) -> snowpark.DataFrameWriter:
|
|
638
890
|
if write_mode is not None and write_mode.lower() in (
|
|
639
891
|
"replace",
|
|
640
892
|
"create_or_replace",
|
|
893
|
+
"overwrite",
|
|
641
894
|
):
|
|
642
|
-
return input_df
|
|
895
|
+
return _get_writer_for_table_creation(input_df)
|
|
643
896
|
|
|
644
897
|
table_schema = None
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
898
|
+
if table_schema_or_error is not None:
|
|
899
|
+
if isinstance(table_schema_or_error, SnowparkSQLException):
|
|
900
|
+
msg = table_schema_or_error.message
|
|
901
|
+
if "SQL compilation error" in msg and "does not exist" in msg:
|
|
902
|
+
pass
|
|
903
|
+
else:
|
|
904
|
+
attach_custom_error_code(
|
|
905
|
+
table_schema_or_error, ErrorCodes.INTERNAL_ERROR
|
|
906
|
+
)
|
|
907
|
+
raise table_schema_or_error
|
|
908
|
+
elif isinstance(table_schema_or_error, DataType):
|
|
909
|
+
table_schema = table_schema_or_error
|
|
910
|
+
else:
|
|
911
|
+
try:
|
|
912
|
+
table_schema = (
|
|
913
|
+
get_or_create_snowpark_session().table(snowpark_table_name).schema
|
|
914
|
+
)
|
|
915
|
+
except SnowparkSQLException as e:
|
|
916
|
+
msg = e.message
|
|
917
|
+
if "SQL compilation error" in msg and "does not exist" in msg:
|
|
918
|
+
pass
|
|
919
|
+
else:
|
|
920
|
+
attach_custom_error_code(e, ErrorCodes.INTERNAL_ERROR)
|
|
921
|
+
raise e
|
|
655
922
|
|
|
656
923
|
if table_schema is None:
|
|
657
924
|
# If table does not exist, we can skip the schema validation
|
|
658
|
-
return input_df
|
|
925
|
+
return _get_writer_for_table_creation(input_df)
|
|
659
926
|
|
|
660
927
|
_validate_schema_for_append(table_schema, input_df.schema, snowpark_table_name)
|
|
661
928
|
|
|
662
929
|
# if table exists and case sensitivity is not enabled, we need to rename the columns to match existing table schema
|
|
663
|
-
if
|
|
930
|
+
if auto_uppercase_column_identifiers():
|
|
664
931
|
|
|
665
932
|
for field in input_df.schema.fields:
|
|
666
933
|
# Find the matching field in the table schema (case-insensitive)
|
|
@@ -670,8 +937,8 @@ def _validate_schema_and_get_writer(
                 (
                     f
                     for f in table_schema.fields
-                    if unquote_if_quoted(f.name).
-                    == unquote_if_quoted(col_name).
+                    if unquote_if_quoted(f.name).upper()
+                    == unquote_if_quoted(col_name).upper()
                 ),
                 None,
             )
@@ -706,21 +973,25 @@ def _validate_schema_for_append(
         case (StructType() as table_struct, StructType() as data_struct):
 
             def _comparable_col_name(col: str) -> str:
-                name = col if
+                name = col.upper() if auto_uppercase_column_identifiers() else col
                 if compare_structs:
                     return name
                 else:
                     return unquote_if_quoted(name)
 
             def invalid_struct_schema():
-
+                exception = AnalysisException(
                     f"Cannot resolve columns for the existing table {snowpark_table_name} ({table_schema.simple_string()}) with the data schema ({data_schema.simple_string()})."
                 )
+                attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+                raise exception
 
             if len(table_struct.fields) != len(data_struct.fields):
-
+                exception = AnalysisException(
                     f"The column number of the existing table {snowpark_table_name} ({table_schema.simple_string()}) doesn't match the data schema ({data_schema.simple_string()}).)"
                 )
+                attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+                raise exception
 
             table_field_names = {
                 _comparable_col_name(field.name) for field in table_struct.fields
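`_comparable_col_name` above normalizes names before the existing-table and incoming schemas are compared: optional uppercasing (when identifiers are auto-uppercased) followed by unquoting when whole structs are not being compared. The same normalization can be exercised on plain strings; `unquote_if_quoted` below is a simplified re-implementation for illustration only:

```python
# Sketch only: stand-alone version of the name normalization used when an
# append target already exists; the real unquote_if_quoted lives in this package.
def unquote_if_quoted(name: str) -> str:
    return name[1:-1] if len(name) >= 2 and name[0] == name[-1] == '"' else name


def comparable_col_name(name: str, auto_uppercase: bool = True) -> str:
    name = name.upper() if auto_uppercase else name
    return unquote_if_quoted(name)


table_cols = {comparable_col_name(c) for c in ['"ID"', '"ORDER_DATE"']}
data_cols = {comparable_col_name(c) for c in ["id", "order_date"]}
print(table_cols == data_cols)  # True -> the append passes the column-name check
```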
@@ -783,9 +1054,24 @@ def _validate_schema_for_append(
         case (DateType(), _) if isinstance(data_schema, (DateType, TimestampType)):
             return
         case (_, _):
-
+            exception = AnalysisException(
                 f"[INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_SAFELY_CAST] Cannot write incompatible data for the table {snowpark_table_name}: Cannot safely cast {data_schema.simple_string()} to {table_schema.simple_string()}"
             )
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
+
+
+def _validate_target_file_size(target_file_size: str | None):
+    # validate target file size is in the acceptable values
+    if target_file_size is None:
+        return
+
+    if target_file_size not in TARGET_FILE_SIZE_ACCEPTABLE_VALUES:
+        exception = AnalysisException(
+            f"Invalid value '{target_file_size}' for TARGET_FILE_SIZE. Allowed values: {', '.join(TARGET_FILE_SIZE_ACCEPTABLE_VALUES)}."
+        )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_CONFIG_VALUE)
+        raise exception
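`_validate_target_file_size` is a plain allow-list check; the accepted values come from `TARGET_FILE_SIZE_ACCEPTABLE_VALUES`, which is defined elsewhere in the package. A self-contained sketch with made-up values, using a plain `ValueError` instead of the package's `AnalysisException` and error-code helper:

```python
# Sketch only: the allowed values below are hypothetical; the real set is
# TARGET_FILE_SIZE_ACCEPTABLE_VALUES, imported by the module shown in the diff.
ACCEPTABLE_VALUES = ("AUTO", "16MB", "32MB", "64MB")  # hypothetical


def validate_target_file_size(target_file_size: str | None) -> None:
    if target_file_size is None:
        return  # option not supplied: nothing to validate
    if target_file_size not in ACCEPTABLE_VALUES:
        raise ValueError(
            f"Invalid value '{target_file_size}' for TARGET_FILE_SIZE. "
            f"Allowed values: {', '.join(ACCEPTABLE_VALUES)}."
        )


validate_target_file_size("32MB")  # passes
validate_target_file_size(None)    # passes
# validate_target_file_size("1GB") would raise ValueError
```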
 
 
 def create_iceberg_table(
@@ -794,6 +1080,8 @@ def create_iceberg_table(
     schema: StructType,
     snowpark_session: snowpark.Session,
     mode: str = "create",
+    partition_by: list[str] = None,
+    target_file_size: str | None = None,
 ):
     table_schema = [
         f"{spark_to_sf_single_id(unquote_if_quoted(field.name), is_column = True)} {snowpark_to_iceberg_type(field.datatype)}"
@@ -807,7 +1095,7 @@ def create_iceberg_table(
     )
     base_location = f"BASE_LOCATION = '{location}'"
 
-    config_external_volume = sessions_config.get(
+    config_external_volume = sessions_config.get(get_spark_session_id(), {}).get(
         "snowpark.connect.iceberg.external_volume", None
     )
     external_volume = (
@@ -815,24 +1103,38 @@ def create_iceberg_table(
         if config_external_volume is None or config_external_volume == ""
        else f"EXTERNAL_VOLUME = '{config_external_volume}'"
     )
+    copy_grants = ""
+    partition_by_sql = (
+        f"PARTITION BY ({','.join([f'{spark_to_sf_single_id(unquote_if_quoted(p), is_column = True)}' for p in partition_by])})"
+        if partition_by
+        else ""
+    )
 
+    _validate_target_file_size(target_file_size)
+    target_file_size_sql = (
+        f"TARGET_FILE_SIZE = '{target_file_size}'" if target_file_size else ""
+    )
     match mode:
         case "create":
             create_sql = "CREATE"
-        case "replace":
+        case "replace" | "create_or_replace":
             # There's no replace for iceberg table, so we use create or replace
-
-        case "create_or_replace":
+            copy_grants = "COPY GRANTS"
             create_sql = "CREATE OR REPLACE"
         case _:
-
+            exception = SnowparkConnectNotImplementedError(
                 f"Write mode {mode} is not supported for iceberg table"
             )
+            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+            raise exception
     sql = f"""
     {create_sql} ICEBERG TABLE {snowpark_table_name} ({",".join(table_schema)})
+    {partition_by_sql}
     CATALOG = 'SNOWFLAKE'
     {external_volume}
-    {base_location}
+    {base_location}
+    {target_file_size_sql}
+    {copy_grants};
     """
     snowpark_session.sql(sql).collect()
 
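With `partition_by` and `target_file_size` in the signature, the generated DDL now composes optional PARTITION BY, TARGET_FILE_SIZE and COPY GRANTS clauses around the existing CATALOG / EXTERNAL_VOLUME / BASE_LOCATION ones. The sketch below only builds the statement text for hypothetical inputs; identifier quoting and type mapping are deliberately simplified and nothing is executed:

```python
# Sketch only: shapes a statement like the f-string above for hypothetical inputs.
def build_iceberg_ddl(
    table_name: str,
    columns: dict[str, str],
    location: str,
    mode: str = "create",
    partition_by: list[str] | None = None,
    target_file_size: str | None = None,
    external_volume: str | None = None,
) -> str:
    create_sql = "CREATE" if mode == "create" else "CREATE OR REPLACE"
    copy_grants = "" if mode == "create" else "COPY GRANTS"
    cols = ",".join(f"{name} {col_type}" for name, col_type in columns.items())
    partition_by_sql = f"PARTITION BY ({','.join(partition_by)})" if partition_by else ""
    target_file_size_sql = f"TARGET_FILE_SIZE = '{target_file_size}'" if target_file_size else ""
    external_volume_sql = f"EXTERNAL_VOLUME = '{external_volume}'" if external_volume else ""
    return f"""
    {create_sql} ICEBERG TABLE {table_name} ({cols})
    {partition_by_sql}
    CATALOG = 'SNOWFLAKE'
    {external_volume_sql}
    BASE_LOCATION = '{location}'
    {target_file_size_sql}
    {copy_grants};
    """


print(build_iceberg_ddl(
    "SALES",
    {"ID": "LONG", "REGION": "STRING"},
    location="sales/",
    mode="create_or_replace",
    partition_by=["REGION"],
    target_file_size="64MB",  # hypothetical value
))
```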
@@ -843,39 +1145,333 @@ def rewrite_df(input_df: snowpark.DataFrame, source: str) -> snowpark.DataFrame:
     json: construct the dataframe to 1 column in json format
         1. Append columns which represents the column name
         2. Use object_construct to aggregate the dataframe into 1 column
-
+    csv:
+        Use "" to replace empty string
     """
-
-
-
-
-
-
-
-
-
-
-
-
+    match source:
+        case "json":
+            rand_salt = random_string(10, "_")
+            rewritten_df = input_df.with_columns(
+                [co + rand_salt for co in input_df.columns],
+                [lit(unquote_if_quoted(co)) for co in input_df.columns],
+            )
+            construct_key_values = []
+            for co in input_df.columns:
+                construct_key_values.append(col(co + rand_salt))
+                construct_key_values.append(col(co))
+            return rewritten_df.select(object_construct(*construct_key_values))
+        case "csv":
+            new_cols = []
+            for co in input_df.columns:
+                if isinstance(input_df.schema[co].datatype, StringType):
+                    new_col = col(co)
+                    new_col = when(
+                        new_col.isNotNull() & (new_col == ""), lit('""')
+                    ).otherwise(new_col)
+                    new_cols.append(new_col.alias(co))
+                else:
+                    new_cols.append(col(co))
+            return input_df.select(new_cols)
+        case _:
+            return input_df
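For the JSON path above, every value column is paired with a literal column carrying its (unquoted) name, and `object_construct` folds the pairs into a single OBJECT column per row. A minimal Snowpark sketch of the same idea, assuming an already-created `snowflake.snowpark.Session` named `session`; the production code additionally salts the helper column names so they cannot collide with existing ones:

```python
# Sketch only: assumes `session` is an existing snowflake.snowpark.Session.
from snowflake.snowpark.functions import col, lit, object_construct

df = session.create_dataframe([(1, "west"), (2, "east")], schema=["ID", "REGION"])

# Interleave literal key names with the value columns, then collapse each row
# into one OBJECT column, mirroring the "json" branch above.
key_values = []
for c in df.columns:
    key_values.append(lit(c))   # key: the column name as a literal
    key_values.append(col(c))   # value: the column itself
json_df = df.select(object_construct(*key_values).alias("ROW_JSON"))
json_df.show()  # one OBJECT per row, e.g. {"ID": 1, "REGION": "west"}
```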
 
 
 def handle_column_names(
     container: DataFrameContainer, source: str
-) -> snowpark.DataFrame:
+) -> tuple[snowpark.DataFrame, list[str]]:
     """
     Handle column names before write so they match spark schema.
+
+    Returns:
+        A tuple of (dataframe, snowpark_column_names) where snowpark_column_names
+        are the resulting column names after any renaming.
     """
     df = container.dataframe
+    column_map = container.column_map
+
     if source == "jdbc":
         # don't change column names for jdbc sources as we directly use spark column names for writing to the destination tables.
-        return df
-    column_map = container.column_map
+        return df, column_map.get_snowpark_columns()
 
+    snowpark_column_names = []
     for column in column_map.columns:
-
-
+        new_name = quote_name_without_upper_casing(column.spark_name)
+        df = df.withColumnRenamed(column.snowpark_name, new_name)
+        snowpark_column_names.append(new_name)
+
+    return df, snowpark_column_names
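`handle_column_names` now reports the post-rename column names to its caller as well. The bookkeeping is simple: each Snowpark-internal name is renamed to the Spark-facing name, quoted without uppercasing so the original case is preserved in the output. A sketch with a simplified stand-in for `quote_name_without_upper_casing` and made-up internal names:

```python
# Sketch only: quote_exact is a simplified stand-in for Snowpark's
# quote_name_without_upper_casing; the internal names below are made up.
def quote_exact(name: str) -> str:
    return '"' + name.replace('"', '""') + '"'


def plan_spark_renames(column_pairs: list[tuple[str, str]]) -> tuple[dict[str, str], list[str]]:
    """column_pairs holds (snowpark_name, spark_name) pairs from the column map."""
    renames: dict[str, str] = {}
    resulting_names: list[str] = []
    for snowpark_name, spark_name in column_pairs:
        new_name = quote_exact(spark_name)
        renames[snowpark_name] = new_name
        resulting_names.append(new_name)
    return renames, resulting_names


renames, names = plan_spark_renames([("ORDERID_X1", "orderId"), ("TOTAL_X2", "Total")])
print(renames)  # {'ORDERID_X1': '"orderId"', 'TOTAL_X2': '"Total"'}
print(names)    # ['"orderId"', '"Total"']
```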
+
+
+def _generate_metadata_files(
+    source: str,
+    write_path: str,
+    stage_path: str,
+    schema: StructType,
+    session: snowpark.Session,
+    parameters: dict,
+    is_local_path: bool,
+) -> None:
+    """
+    Generate marker and metadata files after write completes.
+
+    Handles _SUCCESS marker files and Parquet _common_metadata generation
+    for both local and cloud/stage paths.
+
+    Args:
+        source: Write format (csv, parquet, json, etc.)
+        write_path: Original write path (local or cloud)
+        stage_path: Stage path where files were written
+        schema: DataFrame schema
+        session: Snowpark session
+        parameters: Write parameters
+        is_local_path: Whether writing to local filesystem
+    """
+    generate_success = get_success_file_generation_enabled()
+    generate_parquet_metadata = (
+        source == "parquet" and get_parquet_metadata_generation_enabled()
+    )
+
+    if is_local_path:
+        # Local path: write files directly
+        if generate_success:
+            _write_success_file_locally(write_path)
+        if generate_parquet_metadata:
+            _write_parquet_metadata_files_locally(write_path, schema)
+    else:
+        # Cloud/stage path: upload via stage operations
+        if generate_success:
+            _write_success_file_to_stage(stage_path, session, parameters)
+        if generate_parquet_metadata:
+            _upload_common_metadata_to_stage(stage_path, schema, session)
+
+
+def _write_success_file_locally(directory_path: str) -> None:
+    """
+    Write a _SUCCESS marker file to a local directory.
+    """
+    try:
+        success_file = Path(directory_path) / "_SUCCESS"
+        success_file.touch()
+        logger.debug(f"Created _SUCCESS file at {directory_path}")
+    except Exception as e:
+        logger.warning(f"Failed to create _SUCCESS file at {directory_path}: {e}")
+
+
+def _write_success_file_to_stage(
+    stage_path: str,
+    session: snowpark.Session,
+    parameters: dict,
+) -> None:
+    """
+    Write a _SUCCESS marker file to a stage location.
+    """
+    try:
+        # Create a dummy dataframe with one row containing "SUCCESS"
+        success_df = session.create_dataframe([["SUCCESS"]]).to_df(["STATUS"])
+        success_params = copy.deepcopy(parameters)
+
+        success_params.pop("partition_by", None)
+
+        success_params["location"] = f"{stage_path}/_SUCCESS"
+        success_params["single"] = True
+        success_params["header"] = True
+
+        # Set CSV format with explicit no compression for _SUCCESS file
+        success_params["file_format_type"] = "csv"
+        success_params["format_type_options"] = {
+            "COMPRESSION": "NONE",
+        }
+
+        success_df.write.copy_into_location(**success_params)
+
+        logger.debug(f"Created _SUCCESS file at {stage_path}")
+    except Exception as e:
+        logger.warning(f"Failed to create _SUCCESS file at {stage_path}: {e}")
+
+
+def _get_metadata_upload_sproc() -> str:
+    """
+    Get the cached metadata upload stored procedure.
+
+    Returns:
+        Fully qualified name of the cached stored procedure
+    """
+    sproc_body = """import base64
+import tempfile
+import os
+
+def upload_file(session, file_content_b64: str, file_name: str, target_stage: str):
+    import base64
+    import tempfile
+    import os
+
+    # Decode base64 content
+    file_content = base64.b64decode(file_content_b64)
+
+    # Create temp directory and write file with exact name
+    temp_dir = tempfile.mkdtemp()
+    tmp_file_path = os.path.join(temp_dir, file_name)
+
+    with open(tmp_file_path, 'wb') as f:
+        f.write(file_content)
+
+    try:
+        # Use session.file.put() - works for both internal and external stages in sproc context
+        result = session.file.put(
+            tmp_file_path,
+            target_stage,
+            auto_compress=False,
+            overwrite=True
+        )
+
+        # Extract status from result
+        if result and len(result) > 0:
+            status = result[0].status if hasattr(result[0], 'status') else str(result[0])
+        else:
+            status = "uploaded"
+
+        return "Uploaded " + file_name + " Status: " + status
+    finally:
+        # Clean up temp files
+        try:
+            os.unlink(tmp_file_path)
+            os.rmdir(temp_dir)
+        except (OSError, IOError):
+            pass"""
+
+    # Use the cached sproc system for better performance and schema/database change handling
+    return register_cached_sproc(
+        sproc_body=sproc_body,
+        handler_name="upload_file",
+        input_arg_types=["STRING", "STRING", "STRING"],
+        return_type="STRING",
+        runtime_version="3.11",
+        packages=["snowflake-snowpark-python"],
+    )
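`register_cached_sproc` is this package's caching wrapper around stored-procedure registration. The same upload flow can be approximated with the public Snowpark API, assuming an active `session` and a stage that may be internal or external; names such as `@MY_STAGE/out/` are placeholders:

```python
# Sketch only: approximates the cached-sproc upload path with the public
# Snowpark API; `session` is assumed to exist and @MY_STAGE/out/ is a placeholder.
import base64
import os
import tempfile

from snowflake.snowpark import Session
from snowflake.snowpark.types import StringType


def upload_file(session: Session, file_content_b64: str, file_name: str, target_stage: str) -> str:
    temp_dir = tempfile.mkdtemp()
    tmp_file_path = os.path.join(temp_dir, file_name)
    with open(tmp_file_path, "wb") as f:
        f.write(base64.b64decode(file_content_b64))
    # Inside a stored procedure, session.file.put can also target external stages.
    session.file.put(tmp_file_path, target_stage, auto_compress=False, overwrite=True)
    return f"Uploaded {file_name}"


upload_sproc = session.sproc.register(
    upload_file,
    return_type=StringType(),
    input_types=[StringType(), StringType(), StringType()],
    packages=["snowflake-snowpark-python"],
)
payload = base64.b64encode(b"example bytes").decode("utf-8")
print(session.call(upload_sproc.name, payload, "_common_metadata", "@MY_STAGE/out/"))
```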
+
+
+def _upload_file_to_stage_via_sproc(
+    local_file_path: Path, stage_path: str, session: snowpark.Session
+) -> None:
+    """
+    Upload a file to a stage using the reusable stored procedure. We cannot directly use session.file.put() as it doesn't support external stages.
+
+    Args:
+        local_file_path: Local file to upload
+        stage_path: Target stage path (e.g., @STAGE_NAME/path)
+        session: Snowpark session
+    """
+    import base64
+
+    sproc_name = _get_metadata_upload_sproc()
+
+    with open(local_file_path, "rb") as f:
+        file_content = f.read()
+
+    file_content_b64 = base64.b64encode(file_content).decode("utf-8")
+    file_name = "_common_metadata"
+    session.call(sproc_name, file_content_b64, file_name, stage_path)
+
+    logger.debug(f"Uploaded {file_name} to {stage_path} via stored procedure")
+
+
+def _upload_common_metadata_to_stage(
+    stage_path: str, snowpark_schema: StructType, session: snowpark.Session
+) -> None:
+    """
+    Generate and upload _common_metadata file to a stage.
+
+    Converts Snowpark → PySpark → Spark JSON, creates PyArrow schema with Spark metadata,
+    then uploads to stage via temporary stored procedure (supports internal and external stages).
+
+    Args:
+        stage_path: Stage path where to upload _common_metadata (e.g., @STAGE/path)
+        snowpark_schema: DataFrame schema (already in memory)
+        session: Snowpark session for uploading
+    """
+    try:
+        import tempfile
+
+        spark_only_schema = _create_spark_schema_from_snowpark(snowpark_schema)
+
+        with tempfile.NamedTemporaryFile(
+            suffix="_common_metadata", delete=False
+        ) as tmp_file:
+            tmp_path = Path(tmp_file.name)
+            pq.write_metadata(spark_only_schema, tmp_path)
+            _upload_file_to_stage_via_sproc(tmp_path, stage_path, session)
+            tmp_path.unlink()
+
+        logger.debug(f"Created _common_metadata at {stage_path}")
+
+    except ImportError:
+        logger.warning(
+            "PyArrow is required to generate Parquet metadata files. "
+            "Install with: pip install pyarrow"
+        )
+    except Exception as e:
+        logger.warning(f"Failed to create _common_metadata file: {e}")
+
+
+def _create_spark_schema_from_snowpark(snowpark_schema: StructType) -> pa.Schema:
+    """
+    Create PyArrow schema with Spark metadata from Snowpark schema.
+    """
+    # Unquote field names (Snowpark may have quoted names like "ab")
+    unquoted_fields = []
+    for field in snowpark_schema.fields:
+        unquoted_name = unquote_if_quoted(field.name)
+        unquoted_fields.append(
+            snowpark.types.StructField(
+                unquoted_name, field.datatype, field.nullable, _is_column=False
+            )
         )
-
+    unquoted_snowpark_schema = snowpark.types.StructType(
+        unquoted_fields, structured=snowpark_schema.structured
+    )
+    pyspark_schema = map_snowpark_to_pyspark_types(unquoted_snowpark_schema)
+    spark_schema_json = pyspark_schema.json()
+
+    spark_metadata = {
+        b"org.apache.spark.version": SPARK_VERSION.encode("utf-8"),
+        b"org.apache.spark.sql.parquet.row.metadata": spark_schema_json.encode("utf-8"),
+    }
+
+    # Convert PySpark to PyArrow for the physical schema structure
+    # NOTE: Spark reads schema from the JSON metadata above, NOT from the Parquet schema!
+    # However, correct Parquet types are needed as fallback if JSON parsing fails,
+    # and for compatibility with non-Spark tools (PyArrow, Dask, Presto, etc.)
+    arrow_fields = []
+    for field in pyspark_schema.fields:
+        pa_type = map_pyspark_types_to_pyarrow_types(field.dataType)
+        arrow_fields.append(pa.field(field.name, pa_type, nullable=field.nullable))
+
+    return pa.schema(arrow_fields, metadata=spark_metadata)
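What makes the resulting `_common_metadata` file useful to Spark is the schema JSON stored under the `org.apache.spark.sql.parquet.row.metadata` footer key; the Arrow-level types are only a fallback for other readers. A runnable PyArrow-only sketch of the footer that `pq.write_metadata` produces, with a hand-written schema JSON in place of the package's Snowpark-to-PySpark mapping (the version string is illustrative):

```python
# Sketch only: hand-writes the Spark schema JSON instead of deriving it from a
# Snowpark schema; the metadata keys match the ones used above.
import json

import pyarrow as pa
import pyarrow.parquet as pq

spark_schema_json = json.dumps({
    "type": "struct",
    "fields": [
        {"name": "ID", "type": "long", "nullable": True, "metadata": {}},
        {"name": "REGION", "type": "string", "nullable": True, "metadata": {}},
    ],
})
schema = pa.schema(
    [pa.field("ID", pa.int64()), pa.field("REGION", pa.string())],
    metadata={
        b"org.apache.spark.version": b"3.5.6",  # illustrative version string
        b"org.apache.spark.sql.parquet.row.metadata": spark_schema_json.encode("utf-8"),
    },
)
pq.write_metadata(schema, "_common_metadata")  # footer-only file, no row groups
print(pq.read_schema("_common_metadata").metadata)
```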
+
+
+def _write_parquet_metadata_files_locally(
+    write_path: str, snowpark_schema: StructType
+) -> None:
+    """
+    Generate _common_metadata file for local Parquet datasets.
+
+    Only generates _common_metadata (not _metadata) for consistency with cloud paths,
+    where downloading all files for row group statistics would be inefficient.
+    """
+    try:
+        local_path = Path(write_path)
+        spark_only_schema = _create_spark_schema_from_snowpark(snowpark_schema)
+        pq.write_metadata(spark_only_schema, local_path / "_common_metadata")
+
+        logger.debug(f"Created _common_metadata at {write_path}")
+
+    except ImportError:
+        logger.warning(
+            "PyArrow is required to generate Parquet metadata files. "
+            "Install with: pip install pyarrow"
+        )
+    except Exception as e:
+        logger.warning(f"Failed to create _common_metadata file: {e}")
 
 
 def store_files_locally(
@@ -889,14 +1485,56 @@ def store_files_locally(
     )
     if overwrite and os.path.isdir(target_path):
         _truncate_directory(real_path)
-
+    # Per Snowflake docs: "The command does not preserve stage directory structure when transferring files to your client machine"
+    # https://docs.snowflake.com/en/sql-reference/sql/get
+    # Preserve directory structure under stage_path by listing files and
+    # downloading each into its corresponding local subdirectory when partition subdirs exist.
+    # Otherwise, fall back to a direct GET which flattens.
+
+    # TODO(SNOW-2326973): This can be parallelized further. Its not done here because it only affects
+    # write to local storage.
+
+    ls_dataframe = session.sql(f"LS {stage_path}")
+    ls_iterator = ls_dataframe.toLocalIterator()
+
+    # Build a normalized base prefix from stage_path to compute relatives
+    # Example: stage_path='@MY_STAGE/prefix' -> base_prefix='my_stage/prefix/'
+    base_prefix = stage_path.lstrip("@").rstrip("/") + "/"
+    base_prefix_lower = base_prefix.lower()
+
+    # Group by parent directory under the base prefix, then issue a GET per directory.
+    # This gives a small parallelism advantage if we have many files per partition directory.
+    parent_dirs: set[str] = set()
+    for row in ls_iterator:
+        name: str = row[0]
+        name_lower = name.lower()
+        rel_start = name_lower.find(base_prefix_lower)
+        relative = name[rel_start + len(base_prefix) :] if rel_start != -1 else name
+        parent_dir = os.path.dirname(relative)
+        if parent_dir and parent_dir != ".":
+            parent_dirs.add(parent_dir)
+
+    # If no parent directories were discovered (non-partitioned unload prefix), use direct GET.
+    if not parent_dirs:
+        snowpark.file_operation.FileOperation(session).get(stage_path, str(real_path))
+        return
+
+    file_op = snowpark.file_operation.FileOperation(session)
+    for parent_dir in sorted(parent_dirs):
+        local_dir = real_path / parent_dir
+        os.makedirs(local_dir, exist_ok=True)
+
+        src_dir = f"@{base_prefix}{parent_dir}"
+        file_op.get(src_dir, str(local_dir))
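Because a plain GET flattens the stage layout, the code above first lists the stage, derives each file's parent directory relative to the unload prefix, and then issues one GET per directory so partition subfolders survive on disk. The path bookkeeping can be exercised without a connection:

```python
# Sketch only: runs the prefix/parent-directory bookkeeping above on a
# fabricated LS listing; no Snowflake connection is involved.
import os

stage_path = "@MY_STAGE/out"           # hypothetical unload prefix
listing = [                            # names as an "LS" query might report them
    "my_stage/out/region=east/part-0001.parquet",
    "my_stage/out/region=west/part-0002.parquet",
    "my_stage/out/part-0003.parquet",  # file directly under the prefix
]

base_prefix = stage_path.lstrip("@").rstrip("/") + "/"
base_prefix_lower = base_prefix.lower()

parent_dirs: set[str] = set()
for name in listing:
    rel_start = name.lower().find(base_prefix_lower)
    relative = name[rel_start + len(base_prefix):] if rel_start != -1 else name
    parent_dir = os.path.dirname(relative)
    if parent_dir and parent_dir != ".":
        parent_dirs.add(parent_dir)

print(sorted(parent_dirs))  # ['region=east', 'region=west']
# One GET per discovered directory preserves the partition layout; an empty set
# means a single flat GET of the whole prefix is enough.
```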
 
 
 def _truncate_directory(directory_path: Path) -> None:
     if not directory_path.exists():
-
+        exception = FileNotFoundError(
             f"The specified directory {directory_path} does not exist."
         )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
     # Iterate over all the files and directories in the specified directory
     for file in directory_path.iterdir():
         # Check if it is a file or directory and remove it
@@ -904,31 +1542,3 @@ def _truncate_directory(directory_path: Path) -> None:
             file.unlink()
         elif file.is_dir():
             shutil.rmtree(file)
-
-
-def check_snowflake_table_existence(
-    snowpark_table_name: str,
-    snowpark_session: snowpark.Session,
-):
-    try:
-        snowpark_session.sql(f"SELECT 1 FROM {snowpark_table_name} LIMIT 1").collect()
-        return True
-    except Exception:
-        return False
-
-
-# TODO: SNOW-2299414 Fix the implementation of table type check
-# def check_table_type(
-#     snowpark_table_name: str,
-#     snowpark_session: snowpark.Session,
-# ) -> str:
-#     # currently we only support iceberg table and FDN table
-#     metadata = snowpark_session.sql(
-#         f"SHOW TABLES LIKE '{unquote_if_quoted(snowpark_table_name)}';"
-#     ).collect()
-#     if metadata is None or len(metadata) == 0:
-#         raise AnalysisException(f"Table {snowpark_table_name} does not exist")
-#     metadata = metadata[0]
-#     if metadata.as_dict().get("is_iceberg") == "Y":
-#         return "ICEBERG"
-#     return "TABLE"