snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. snowflake/snowpark_connect/__init__.py +1 -0
  2. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  3. snowflake/snowpark_connect/client/__init__.py +15 -0
  4. snowflake/snowpark_connect/client/error_utils.py +30 -0
  5. snowflake/snowpark_connect/client/exceptions.py +36 -0
  6. snowflake/snowpark_connect/client/query_results.py +90 -0
  7. snowflake/snowpark_connect/client/server.py +717 -0
  8. snowflake/snowpark_connect/client/utils/__init__.py +10 -0
  9. snowflake/snowpark_connect/client/utils/session.py +85 -0
  10. snowflake/snowpark_connect/column_name_handler.py +404 -243
  11. snowflake/snowpark_connect/column_qualifier.py +43 -0
  12. snowflake/snowpark_connect/config.py +309 -26
  13. snowflake/snowpark_connect/constants.py +2 -0
  14. snowflake/snowpark_connect/dataframe_container.py +102 -8
  15. snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
  16. snowflake/snowpark_connect/error/error_codes.py +50 -0
  17. snowflake/snowpark_connect/error/error_utils.py +172 -23
  18. snowflake/snowpark_connect/error/exceptions.py +13 -4
  19. snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
  20. snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
  21. snowflake/snowpark_connect/execute_plan/utils.py +5 -1
  22. snowflake/snowpark_connect/expression/error_utils.py +28 -0
  23. snowflake/snowpark_connect/expression/function_defaults.py +9 -2
  24. snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
  25. snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
  26. snowflake/snowpark_connect/expression/literal.py +37 -13
  27. snowflake/snowpark_connect/expression/map_cast.py +224 -15
  28. snowflake/snowpark_connect/expression/map_expression.py +80 -27
  29. snowflake/snowpark_connect/expression/map_extension.py +322 -12
  30. snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
  31. snowflake/snowpark_connect/expression/map_udf.py +86 -20
  32. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
  33. snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
  34. snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
  35. snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
  36. snowflake/snowpark_connect/expression/map_window_function.py +18 -3
  37. snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
  39. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
  43. snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
  44. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
  45. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
  46. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  47. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  48. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
  49. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
  50. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
  51. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
  52. snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
  53. snowflake/snowpark_connect/relation/io_utils.py +110 -10
  54. snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
  55. snowflake/snowpark_connect/relation/map_catalog.py +5 -1
  56. snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
  57. snowflake/snowpark_connect/relation/map_extension.py +263 -29
  58. snowflake/snowpark_connect/relation/map_join.py +683 -442
  59. snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
  60. snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
  61. snowflake/snowpark_connect/relation/map_relation.py +48 -19
  62. snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
  63. snowflake/snowpark_connect/relation/map_show_string.py +13 -6
  64. snowflake/snowpark_connect/relation/map_sql.py +1233 -222
  65. snowflake/snowpark_connect/relation/map_stats.py +48 -9
  66. snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
  67. snowflake/snowpark_connect/relation/map_udtf.py +14 -4
  68. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
  69. snowflake/snowpark_connect/relation/read/map_read.py +134 -43
  70. snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
  71. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
  72. snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
  73. snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
  74. snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
  75. snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
  76. snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
  77. snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
  78. snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
  79. snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
  80. snowflake/snowpark_connect/relation/read/utils.py +50 -5
  81. snowflake/snowpark_connect/relation/stage_locator.py +91 -55
  82. snowflake/snowpark_connect/relation/utils.py +128 -5
  83. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
  84. snowflake/snowpark_connect/relation/write/map_write.py +929 -319
  85. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
  86. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  87. snowflake/snowpark_connect/resources_initializer.py +171 -48
  88. snowflake/snowpark_connect/server.py +528 -473
  89. snowflake/snowpark_connect/server_common/__init__.py +503 -0
  90. snowflake/snowpark_connect/snowflake_session.py +65 -0
  91. snowflake/snowpark_connect/start_server.py +53 -5
  92. snowflake/snowpark_connect/type_mapping.py +349 -27
  93. snowflake/snowpark_connect/type_support.py +130 -0
  94. snowflake/snowpark_connect/typed_column.py +9 -7
  95. snowflake/snowpark_connect/utils/artifacts.py +9 -8
  96. snowflake/snowpark_connect/utils/cache.py +49 -27
  97. snowflake/snowpark_connect/utils/concurrent.py +36 -1
  98. snowflake/snowpark_connect/utils/context.py +195 -37
  99. snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
  100. snowflake/snowpark_connect/utils/env_utils.py +5 -1
  101. snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
  102. snowflake/snowpark_connect/utils/identifiers.py +137 -3
  103. snowflake/snowpark_connect/utils/io_utils.py +57 -1
  104. snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
  105. snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
  106. snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
  107. snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
  108. snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
  109. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
  110. snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
  111. snowflake/snowpark_connect/utils/profiling.py +25 -8
  112. snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
  113. snowflake/snowpark_connect/utils/sequence.py +21 -0
  114. snowflake/snowpark_connect/utils/session.py +64 -28
  115. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
  116. snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
  117. snowflake/snowpark_connect/utils/telemetry.py +192 -40
  118. snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
  119. snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
  120. snowflake/snowpark_connect/utils/udf_cache.py +117 -41
  121. snowflake/snowpark_connect/utils/udf_helper.py +39 -37
  122. snowflake/snowpark_connect/utils/udf_utils.py +133 -14
  123. snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
  124. snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
  125. snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
  126. snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
  127. snowflake/snowpark_connect/version.py +1 -1
  128. snowflake/snowpark_decoder/dp_session.py +6 -2
  129. snowflake/snowpark_decoder/spark_decoder.py +12 -0
  130. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
  131. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
  132. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
  133. snowflake/snowpark_connect/hidden_column.py +0 -39
  134. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  135. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  136. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  137. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  138. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  139. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  140. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  141. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  142. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  143. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  144. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  145. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  146. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  147. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  148. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  149. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  150. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  151. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  152. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  153. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  154. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  155. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  156. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  157. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  158. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  159. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  160. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  161. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  162. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  163. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  164. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  165. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  166. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  167. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  168. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  169. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  170. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  171. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  172. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  173. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  174. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  175. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  176. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  177. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  178. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  179. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  180. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  181. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  182. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  183. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  184. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  185. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  186. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  187. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  188. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  189. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  190. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  191. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  192. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  193. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  194. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
  195. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
  196. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
  197. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
  198. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
  199. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
  200. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
@@ -2,10 +2,15 @@
  # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
  #

+ import copy
  import os
  import shutil
+ import uuid
+ from contextlib import suppress
  from pathlib import Path

+ import pyarrow as pa
+ import pyarrow.parquet as pq
  import pyspark.sql.connect.proto.base_pb2 as proto_base
  import pyspark.sql.connect.proto.commands_pb2 as commands_proto
  from pyspark.errors.exceptions.base import AnalysisException
@@ -16,7 +21,7 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
  unquote_if_quoted,
  )
  from snowflake.snowpark.exceptions import SnowparkSQLException
- from snowflake.snowpark.functions import col, lit, object_construct, sql_expr
+ from snowflake.snowpark.functions import col, lit, object_construct, sql_expr, when
  from snowflake.snowpark.types import (
  ArrayType,
  DataType,
@@ -28,37 +33,57 @@ from snowflake.snowpark.types import (
  _NumericType,
  )
  from snowflake.snowpark_connect.config import (
+ auto_uppercase_column_identifiers,
+ get_parquet_metadata_generation_enabled,
+ get_success_file_generation_enabled,
  global_config,
  sessions_config,
  str_to_bool,
  )
+ from snowflake.snowpark_connect.constants import SPARK_VERSION
  from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
  from snowflake.snowpark_connect.relation.io_utils import (
  convert_file_prefix_path,
+ get_compression_for_source_and_options,
  is_cloud_path,
  )
  from snowflake.snowpark_connect.relation.map_relation import map_relation
+ from snowflake.snowpark_connect.relation.read.metadata_utils import (
+ without_internal_columns,
+ )
  from snowflake.snowpark_connect.relation.read.reader_config import CsvWriterConfig
  from snowflake.snowpark_connect.relation.stage_locator import get_paths_from_stage
  from snowflake.snowpark_connect.relation.utils import (
  generate_spark_compatible_filename,
  random_string,
  )
- from snowflake.snowpark_connect.type_mapping import snowpark_to_iceberg_type
- from snowflake.snowpark_connect.utils.context import get_session_id
+ from snowflake.snowpark_connect.type_mapping import (
+ map_pyspark_types_to_pyarrow_types,
+ map_snowpark_to_pyspark_types,
+ snowpark_to_iceberg_type,
+ )
+ from snowflake.snowpark_connect.utils.context import get_spark_session_id
  from snowflake.snowpark_connect.utils.identifiers import (
  spark_to_sf_single_id,
  split_fully_qualified_spark_name,
  )
+ from snowflake.snowpark_connect.utils.io_utils import get_table_type
  from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
  from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
  from snowflake.snowpark_connect.utils.telemetry import (
  SnowparkConnectNotImplementedError,
  telemetry,
  )
+ from snowflake.snowpark_connect.utils.udf_cache import register_cached_sproc

  _column_order_for_write = "name"

+ # Available values for TARGET_FILE_SIZE
+ # reference:https://docs.snowflake.com/en/sql-reference/sql/create-iceberg-table
+ TARGET_FILE_SIZE_ACCEPTABLE_VALUES = ("AUTO", "16MB", "32MB", "64MB", "128MB")
+

  # TODO: We will revise/refactor this after changes for all formats are finalized.
  def clean_params(params):
@@ -109,9 +134,65 @@ def _spark_to_snowflake(multipart_id: str) -> str:
  )


+ def _validate_table_exist_and_of_type(
+ snowpark_table_name: str,
+ session: snowpark.Session,
+ table_type: str,
+ table_schema_or_error: DataType | SnowparkSQLException,
+ ) -> None:
+ if not isinstance(table_schema_or_error, DataType):
+ exception = AnalysisException(
+ f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{snowpark_table_name}` cannot be found."
+ )
+ attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+ raise exception
+ _validate_table_type(snowpark_table_name, session, table_type)
+
+
+ def _validate_table_type(
+ snowpark_table_name: str,
+ session: snowpark.Session,
+ table_type: str,
+ ) -> None:
+ actual_type = get_table_type(snowpark_table_name, session)
+ if table_type == "iceberg":
+ if actual_type not in ("ICEBERG", "TABLE"):
+ exception = AnalysisException(
+ f"Table {snowpark_table_name} is not an iceberg table"
+ )
+ attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+ raise exception
+ elif table_type == "fdn":
+ if actual_type not in ("NORMAL", "TABLE"):
+ exception = AnalysisException(
+ f"Table {snowpark_table_name} is not a FDN table"
+ )
+ attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+ raise exception
+ else:
+ raise ValueError(
+ f"Invalid table_type: {table_type}. Must be 'iceberg' or 'fdn'"
+ )
+
+
+ def _validate_table_does_not_exist(
+ snowpark_table_name: str,
+ table_schema_or_error: DataType | SnowparkSQLException,
+ ) -> None:
+ if isinstance(table_schema_or_error, DataType):
+ exception = AnalysisException(f"Table {snowpark_table_name} already exists")
+ attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+ raise exception
+
+
  def map_write(request: proto_base.ExecutePlanRequest):
  write_op = request.plan.command.write_operation
  telemetry.report_io_write(write_op.source)
+ if write_op.path and write_op.options.get("path"):
+ raise AnalysisException(
+ "There is a 'path' option set and save() is called with a path parameter. "
+ "Either remove the path option, or call save() without the parameter."
+ )

  write_mode = None
  match write_op.mode:
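A minimal client-side sketch of the 'path' conflict guard added in the hunk above (assumes `spark` is a PySpark Connect session pointed at a Snowpark Connect server of this release; dataframe and paths are illustrative):

    # Setting the 'path' option and also passing a path to save() is now rejected with
    # AnalysisException: "There is a 'path' option set and save() is called with a path parameter. ..."
    df = spark.createDataFrame([(1, "a")], ["id", "val"])
    df.write.format("parquet").option("path", "/tmp/out_a").save("/tmp/out_b")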
@@ -125,9 +206,30 @@ def map_write(request: proto_base.ExecutePlanRequest):
  write_mode = "ignore"

  result = map_relation(write_op.input)
- input_df: snowpark.DataFrame = handle_column_names(result, write_op.source)
+ input_df, snowpark_column_names = handle_column_names(result, write_op.source)
+
+ # Create updated container with transformed dataframe, then filter METADATA$FILENAME columns
+ updated_result = DataFrameContainer.create_with_column_mapping(
+ dataframe=input_df,
+ spark_column_names=result.column_map.get_spark_columns(),
+ snowpark_column_names=snowpark_column_names,
+ column_metadata=result.column_map.column_metadata,
+ column_qualifiers=result.column_map.get_qualifiers(),
+ parent_column_name_map=result.column_map.get_parent_column_name_map(),
+ table_name=result.table_name,
+ alias=result.alias,
+ partition_hint=result.partition_hint,
+ )
+ updated_result = without_internal_columns(updated_result)
+ input_df = updated_result.dataframe
+
  session: snowpark.Session = get_or_create_snowpark_session()

+ # Check for partition hint early to determine precedence over single option
+ partition_hint = (
+ result.partition_hint if hasattr(result, "partition_hint") else None
+ )
+
  # Snowflake saveAsTable doesn't support format
  if (
  write_op.HasField("table")
@@ -150,15 +252,59 @@ def map_write(request: proto_base.ExecutePlanRequest):
  max_file_size = 1073741824
  match write_op.source:
  case "csv" | "parquet" | "json" | "text":
+ if write_mode == "ignore":
+ exception = SnowparkConnectNotImplementedError(
+ f"Write mode {write_mode} is not supported for {write_op.source}"
+ )
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+ raise exception
+
  write_path = get_paths_from_stage(
  [write_op.path],
  session=session,
  )[0]
+
+ # Handle error/errorifexists mode - check if file exists before writing
+ if write_mode in (None, "error", "errorifexists"):
+ is_local_path = not is_cloud_path(write_op.path)
+
+ if is_local_path:
+ # Check if local path exists
+ if os.path.exists(write_op.path) and (
+ os.path.isfile(write_op.path)
+ or (os.path.isdir(write_op.path) and os.listdir(write_op.path))
+ ):
+ exception = AnalysisException(
+ f"Path {write_op.path} already exists."
+ )
+ attach_custom_error_code(
+ exception, ErrorCodes.INVALID_OPERATION
+ )
+ raise exception
+ else:
+ # Check if stage/cloud path exists by listing files
+ # If the path does not exist, SnowparkSQLException is suppressed (expected for error mode).
+ with suppress(SnowparkSQLException):
+ # TODO: Optimize this check by using a more efficient way to check if the path exists.
+ list_command = f"LIST '{write_path}/'"
+ result = session.sql(list_command).collect()
+ if result:
+ exception = AnalysisException(
+ f"Path {write_op.path} already exists."
+ )
+ attach_custom_error_code(
+ exception, ErrorCodes.INVALID_OPERATION
+ )
+ raise exception
+
  # Generate Spark-compatible filename with proper extension
  extension = write_op.source if write_op.source != "text" else "txt"

- # Get compression from options for proper filename generation
- compression_option = write_op.options.get("compression", "none")
+ compression = get_compression_for_source_and_options(
+ write_op.source, write_op.options, from_read=False
+ )
+ if compression is not None:
+ write_op.options["compression"] = compression

  # Generate Spark-compatible filename or prefix
  # we need a random prefix to support "append" mode
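A brief sketch of the default error-mode behaviour added above (paths are illustrative; `spark` is assumed to be a connected session):

    # First write succeeds; repeating it against the same non-empty target now fails up front
    # with AnalysisException: "Path /tmp/out already exists."
    df.write.mode("errorifexists").parquet("/tmp/out")
    df.write.mode("errorifexists").parquet("/tmp/out")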
@@ -169,27 +315,18 @@ def map_write(request: proto_base.ExecutePlanRequest):
  )

  if overwrite:
- try:
- path_after_stage = (
- write_path.split("/", 1)[1] if "/" in write_path else ""
- )
- if not path_after_stage or path_after_stage == "/":
- logger.warning(
- f"Skipping REMOVE for root path {write_path} - too broad scope"
- )
- else:
- remove_command = f"REMOVE {write_path}/"
- session.sql(remove_command).collect()
- logger.info(f"Successfully cleared directory: {write_path}")
- except Exception as e:
- logger.warning(f"Could not clear directory {write_path}: {e}")
+ # Trailing slash is required as calling remove with just write_path would remove everything in the
+ # stage path with the same prefix.
+ remove_command = f"REMOVE '{write_path}/'"
+ session.sql(remove_command).collect()
+ logger.info(f"Successfully cleared directory: {write_path}")

- if should_write_to_single_file:
+ if should_write_to_single_file and partition_hint is None:
  # Single file: generate complete filename with extension
  spark_filename = generate_spark_compatible_filename(
  task_id=0,
  attempt_number=0,
- compression=compression_option,
+ compression=compression,
  format_ext=extension,
  )
  temp_file_prefix_on_stage = f"{write_path}/{spark_filename}"
@@ -198,15 +335,11 @@ def map_write(request: proto_base.ExecutePlanRequest):
  spark_filename_prefix = generate_spark_compatible_filename(
  task_id=0,
  attempt_number=0,
- compression=compression_option,
+ compression=None,
  format_ext="", # No extension for prefix
  )
  temp_file_prefix_on_stage = f"{write_path}/{spark_filename_prefix}"

- default_compression = "NONE" if write_op.source != "parquet" else "snappy"
- compression = write_op.options.get(
- "compression", default_compression
- ).upper()
  parameters = {
  "location": temp_file_prefix_on_stage,
  "file_format_type": write_op.source
@@ -215,13 +348,10 @@ def map_write(request: proto_base.ExecutePlanRequest):
  "format_type_options": {
  "COMPRESSION": compression,
  },
- "overwrite": overwrite,
  }
- # By default, download from the same prefix we wrote to.
- download_stage_path = temp_file_prefix_on_stage
-
- # Check for partition hint early to determine precedence over single option
- partition_hint = result.partition_hint
+ # Download from the base write path to ensure we fetch whatever Snowflake produced.
+ # Using the base avoids coupling to exact filenames/prefixes.
+ download_stage_path = write_path

  # Apply max_file_size for both single and multi-file scenarios
  # This helps control when Snowflake splits files into multiple parts
@@ -234,16 +364,26 @@ def map_write(request: proto_base.ExecutePlanRequest):
  get_param_from_options(parameters, write_op.options, write_op.source)
  if write_op.partitioning_columns:
  if write_op.source != "parquet":
- raise SnowparkConnectNotImplementedError(
+ exception = SnowparkConnectNotImplementedError(
  "Partitioning is only supported for parquet format"
  )
- partitioning_columns = [f'"{c}"' for c in write_op.partitioning_columns]
- if len(partitioning_columns) > 1:
- raise SnowparkConnectNotImplementedError(
- "Multiple partitioning columns are not yet supported"
+ attach_custom_error_code(
+ exception, ErrorCodes.UNSUPPORTED_OPERATION
  )
- else:
- parameters["partition_by"] = partitioning_columns[0]
+ raise exception
+ # Build Spark-style directory structure: col1=value1/col2=value2/...
+ # Example produced expression (Snowflake SQL):
+ # 'department=' || TO_VARCHAR("department") || '/' || 'region=' || TO_VARCHAR("region")
+ partitioning_column_names = list(write_op.partitioning_columns)
+ partition_expr_parts: list[str] = []
+ for col_name in partitioning_column_names:
+ quoted = f'"{col_name}"'
+ segment = f"'{col_name}=' || COALESCE(TO_VARCHAR({quoted}), '__HIVE_DEFAULT_PARTITION__')"
+ partition_expr_parts.append(segment)
+ parameters["partition_by"] = " || '/' || ".join(partition_expr_parts)
+ # When using PARTITION BY, Snowflake writes into subdirectories under the base path.
+ # Download from the base write path to preserve partition directories locally.
+ download_stage_path = write_path

  # If a partition hint is present (from DataFrame.repartition(n)), optionally split the
  # write into n COPY INTO calls by assigning a synthetic partition id. Controlled by config.
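For context, a hedged usage sketch of the multi-column partitioning enabled in the hunk above (column and path names are illustrative):

    # Multiple partition columns are now accepted for parquet writes; NULL values
    # fall back to the Hive default partition name via the COALESCE shown above.
    df.write.partitionBy("department", "region").parquet("/tmp/partitioned_out")
    # Expected Spark/Hive-style layout under the target path, e.g.:
    #   department=Sales/region=US/part-00000-....snappy.parquet
    #   department=__HIVE_DEFAULT_PARTITION__/region=EU/part-00000-....snappy.parquet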
@@ -267,15 +407,20 @@ def map_write(request: proto_base.ExecutePlanRequest):
  # Execute multiple COPY INTO operations, one per target file.
  # Since we write per-partition with distinct prefixes, download from the base write path.
  download_stage_path = write_path
+
+ # We need to create a new set of parameters with single=True
+ shared_uuid = str(uuid.uuid4())
+ part_params = copy.deepcopy(dict(parameters))
+ part_params["single"] = True
  for part_idx in range(partition_hint):
- part_params = dict(parameters)
  # Preserve Spark-like filename prefix per partition so downloaded basenames
  # match the expected Spark pattern (with possible Snowflake counters appended).
  per_part_prefix = generate_spark_compatible_filename(
  task_id=part_idx,
  attempt_number=0,
- compression=compression_option,
- format_ext="", # prefix only; Snowflake appends extension/counters
+ compression=compression,
+ format_ext=extension,
+ shared_uuid=shared_uuid,
  )
  part_params["location"] = f"{write_path}/{per_part_prefix}"
  (
@@ -285,13 +430,25 @@ def map_write(request: proto_base.ExecutePlanRequest):
  )
  else:
  rewritten_df.write.copy_into_location(**parameters)
- if not is_cloud_path(write_op.path):
+
+ is_local_path = not is_cloud_path(write_op.path)
+ if is_local_path:
  store_files_locally(
  download_stage_path,
  write_op.path,
  overwrite,
  session,
  )
+
+ _generate_metadata_files(
+ write_op.source,
+ write_op.path,
+ download_stage_path,
+ input_df.schema,
+ session,
+ parameters,
+ is_local_path,
+ )
  case "jdbc":
  from snowflake.snowpark_connect.relation.write.map_write_jdbc import (
  map_write_jdbc,
@@ -308,48 +465,75 @@ def map_write(request: proto_base.ExecutePlanRequest):
  else write_op.table.table_name
  )
  snowpark_table_name = _spark_to_snowflake(table_name)
+ partition_cols = (
+ write_op.partitioning_columns if write_op.partitioning_columns else None
+ )

  match write_mode:
  case None | "error" | "errorifexists":
- if check_snowflake_table_existence(snowpark_table_name, session):
- raise AnalysisException(
- f"Table {snowpark_table_name} already exists"
- )
+ table_schema_or_error = _get_table_schema_or_error(
+ snowpark_table_name, session
+ )
+ _validate_table_does_not_exist(
+ snowpark_table_name, table_schema_or_error
+ )
  create_iceberg_table(
  snowpark_table_name=snowpark_table_name,
  location=write_op.options.get("location", None),
  schema=input_df.schema,
  snowpark_session=session,
+ partition_by=partition_cols,
+ target_file_size=write_op.options.get(
+ "write.target-file-size", None
+ ),
  )
  _validate_schema_and_get_writer(
- input_df, "append", snowpark_table_name
+ input_df, "append", snowpark_table_name, table_schema_or_error
  ).saveAsTable(
  table_name=snowpark_table_name,
  mode="append",
  column_order=_column_order_for_write,
  )
  case "append":
- # TODO: SNOW-2299414 Fix the implementation of table type check
- # if check_table_type(snowpark_table_name, session) != "ICEBERG":
- # raise AnalysisException(
- # f"Table {snowpark_table_name} is not an iceberg table"
- # )
+ table_schema_or_error = _get_table_schema_or_error(
+ snowpark_table_name, session
+ )
+ if isinstance(table_schema_or_error, DataType): # Table exists
+ _validate_table_type(snowpark_table_name, session, "iceberg")
+ else:
+ create_iceberg_table(
+ snowpark_table_name=snowpark_table_name,
+ location=write_op.options.get("location", None),
+ schema=input_df.schema,
+ snowpark_session=session,
+ partition_by=partition_cols,
+ target_file_size=write_op.options.get(
+ "write.target-file-size", None
+ ),
+ )
  _validate_schema_and_get_writer(
- input_df, "append", snowpark_table_name
+ input_df, "append", snowpark_table_name, table_schema_or_error
  ).saveAsTable(
  table_name=snowpark_table_name,
  mode="append",
  column_order=_column_order_for_write,
  )
  case "ignore":
- if not check_snowflake_table_existence(
+ table_schema_or_error = _get_table_schema_or_error(
  snowpark_table_name, session
- ):
+ )
+ if not isinstance(
+ table_schema_or_error, DataType
+ ): # Table not exists
  create_iceberg_table(
  snowpark_table_name=snowpark_table_name,
  location=write_op.options.get("location", None),
  schema=input_df.schema,
  snowpark_session=session,
+ partition_by=partition_cols,
+ target_file_size=write_op.options.get(
+ "write.target-file-size", None
+ ),
  )
  _validate_schema_and_get_writer(
  input_df, "append", snowpark_table_name
@@ -359,67 +543,108 @@ def map_write(request: proto_base.ExecutePlanRequest):
  column_order=_column_order_for_write,
  )
  case "overwrite":
- if check_snowflake_table_existence(snowpark_table_name, session):
- # TODO: SNOW-2299414 Fix the implementation of table type check
- # if check_table_type(snowpark_table_name, session) != "ICEBERG":
- # raise AnalysisException(
- # f"Table {snowpark_table_name} is not an iceberg table"
- # )
- pass
+ table_schema_or_error = _get_table_schema_or_error(
+ snowpark_table_name, session
+ )
+ if isinstance(table_schema_or_error, DataType): # Table exists
+ _validate_table_type(snowpark_table_name, session, "iceberg")
+ create_iceberg_table(
+ snowpark_table_name=snowpark_table_name,
+ location=write_op.options.get("location", None),
+ schema=input_df.schema,
+ snowpark_session=session,
+ mode="replace",
+ partition_by=partition_cols,
+ target_file_size=write_op.options.get(
+ "write.target-file-size", None
+ ),
+ )
  else:
  create_iceberg_table(
  snowpark_table_name=snowpark_table_name,
  location=write_op.options.get("location", None),
  schema=input_df.schema,
  snowpark_session=session,
+ mode="create",
+ partition_by=partition_cols,
+ target_file_size=write_op.options.get(
+ "write.target-file-size", None
+ ),
  )
- _validate_schema_and_get_writer(
- input_df, "truncate", snowpark_table_name
- ).saveAsTable(
+ _get_writer_for_table_creation(input_df).saveAsTable(
  table_name=snowpark_table_name,
- mode="truncate",
+ mode="append",
  column_order=_column_order_for_write,
  )
  case _:
- raise SnowparkConnectNotImplementedError(
+ exception = SnowparkConnectNotImplementedError(
  f"Write mode {write_mode} is not supported"
  )
+ attach_custom_error_code(
+ exception, ErrorCodes.UNSUPPORTED_OPERATION
+ )
+ raise exception
  case _:
  snowpark_table_name = _spark_to_snowflake(write_op.table.table_name)
+ save_method = write_op.table.save_method
+
+ if (
+ write_op.source == "snowflake"
+ and write_op.table.save_method
+ == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_UNSPECIFIED
+ ):
+ save_method = (
+ commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE
+ )
+ if len(write_op.table.table_name) == 0:
+ dbtable_name = write_op.options.get("dbtable", "")
+ if len(dbtable_name) == 0:
+ exception = SnowparkConnectNotImplementedError(
+ "Save command is not supported without a table name"
+ )
+ attach_custom_error_code(
+ exception, ErrorCodes.UNSUPPORTED_OPERATION
+ )
+ raise exception
+ else:
+ snowpark_table_name = _spark_to_snowflake(dbtable_name)

  if (
- write_op.table.save_method
+ save_method
  == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE
  ):
  match write_mode:
  case "overwrite":
- if check_snowflake_table_existence(
+ table_schema_or_error = _get_table_schema_or_error(
  snowpark_table_name, session
- ):
- # TODO: SNOW-2299414 Fix the implementation of table type check
- # if (
- # check_table_type(snowpark_table_name, session)
- # != "TABLE"
- # ):
- # raise AnalysisException(
- # f"Table {snowpark_table_name} is not a FDN table"
- # )
- write_mode = "truncate"
+ )
+ if isinstance(table_schema_or_error, DataType): # Table exists
+ _validate_table_type(snowpark_table_name, session, "fdn")
+
+ write_mode = "overwrite"
  _validate_schema_and_get_writer(
- input_df, write_mode, snowpark_table_name
+ input_df,
+ write_mode,
+ snowpark_table_name,
+ table_schema_or_error,
  ).saveAsTable(
  table_name=snowpark_table_name,
  mode=write_mode,
+ copy_grants=True,
  column_order=_column_order_for_write,
  )
  case "append":
- # TODO: SNOW-2299414 Fix the implementation of table type check
- # if check_table_type(snowpark_table_name, session) != "TABLE":
- # raise AnalysisException(
- # f"Table {snowpark_table_name} is not a FDN table"
- # )
+ table_schema_or_error = _get_table_schema_or_error(
+ snowpark_table_name, session
+ )
+ if isinstance(table_schema_or_error, DataType): # Table exists
+ _validate_table_type(snowpark_table_name, session, "fdn")
+
  _validate_schema_and_get_writer(
- input_df, write_mode, snowpark_table_name
+ input_df,
+ write_mode,
+ snowpark_table_name,
+ table_schema_or_error,
  ).saveAsTable(
  table_name=snowpark_table_name,
  mode=write_mode,
@@ -434,7 +659,7 @@ def map_write(request: proto_base.ExecutePlanRequest):
  column_order=_column_order_for_write,
  )
  elif (
- write_op.table.save_method
+ save_method
  == commands_proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO
  ):
  _validate_schema_and_get_writer(
@@ -445,9 +670,11 @@ def map_write(request: proto_base.ExecutePlanRequest):
  column_order=_column_order_for_write,
  )
  else:
- raise SnowparkConnectNotImplementedError(
- f"Save command not supported: {write_op.table.save_method}"
+ exception = SnowparkConnectNotImplementedError(
+ f"Save command not supported: {save_method}"
  )
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+ raise exception


  def map_write_v2(request: proto_base.ExecutePlanRequest):
@@ -455,212 +682,252 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):

  snowpark_table_name = _spark_to_snowflake(write_op.table_name)
  result = map_relation(write_op.input)
- input_df: snowpark.DataFrame = handle_column_names(result, "table")
+ input_df, snowpark_column_names = handle_column_names(result, "table")
+
+ # Create updated container with transformed dataframe, then filter METADATA$FILENAME columns
+ updated_result = DataFrameContainer.create_with_column_mapping(
+ dataframe=input_df,
+ spark_column_names=result.column_map.get_spark_columns(),
+ snowpark_column_names=snowpark_column_names,
+ column_metadata=result.column_map.column_metadata,
+ column_qualifiers=result.column_map.get_qualifiers(),
+ parent_column_name_map=result.column_map.get_parent_column_name_map(),
+ table_name=result.table_name,
+ alias=result.alias,
+ partition_hint=result.partition_hint,
+ )
+ updated_result = without_internal_columns(updated_result)
+ input_df = updated_result.dataframe
+
  session: snowpark.Session = get_or_create_snowpark_session()

  if write_op.table_name is None or write_op.table_name == "":
- raise SnowparkConnectNotImplementedError(
+ exception = SnowparkConnectNotImplementedError(
  "Write operation V2 only support table writing now"
  )
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+ raise exception
+
+ is_iceberg = write_op.provider.lower() == "iceberg"
+ table_type = "iceberg" if is_iceberg else "fdn"
+ partition_cols = (
+ [
+ i.unresolved_attribute.unparsed_identifier
+ for i in write_op.partitioning_columns
+ ]
+ if write_op.partitioning_columns
+ else None
+ )

- if write_op.provider.lower() == "iceberg":
- match write_op.mode:
- case commands_proto.WriteOperationV2.MODE_CREATE:
- if check_snowflake_table_existence(snowpark_table_name, session):
- raise AnalysisException(
- f"Table {snowpark_table_name} already exists"
- )
+ match write_op.mode:
+ case commands_proto.WriteOperationV2.MODE_CREATE:
+ table_schema_or_error = _get_table_schema_or_error(
+ snowpark_table_name, session
+ )
+ _validate_table_does_not_exist(snowpark_table_name, table_schema_or_error)
+
+ if is_iceberg:
  create_iceberg_table(
  snowpark_table_name=snowpark_table_name,
  location=write_op.table_properties.get("location"),
  schema=input_df.schema,
  snowpark_session=session,
+ partition_by=partition_cols,
+ target_file_size=write_op.table_properties.get(
+ "write.target-file-size", None
+ ),
  )
- _validate_schema_and_get_writer(
- input_df, "append", snowpark_table_name
- ).saveAsTable(
- table_name=snowpark_table_name,
- mode="append",
- column_order=_column_order_for_write,
- )
- case commands_proto.WriteOperationV2.MODE_APPEND:
- if not check_snowflake_table_existence(snowpark_table_name, session):
- raise AnalysisException(
- f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{write_op.table_name}` cannot be found."
- )
- # TODO: SNOW-2299414 Fix the implementation of table type check
- # if check_table_type(snowpark_table_name, session) != "ICEBERG":
- # raise AnalysisException(
- # f"Table {snowpark_table_name} is not an iceberg table"
- # )
- _validate_schema_and_get_writer(
- input_df, "append", snowpark_table_name
- ).saveAsTable(
- table_name=snowpark_table_name,
- mode="append",
- column_order=_column_order_for_write,
+ _get_writer_for_table_creation(input_df).saveAsTable(
+ table_name=snowpark_table_name,
+ mode="append" if is_iceberg else "errorifexists",
+ column_order=_column_order_for_write,
+ )
+
+ case commands_proto.WriteOperationV2.MODE_APPEND:
+ table_schema_or_error = _get_table_schema_or_error(
+ snowpark_table_name, session
+ )
+ _validate_table_exist_and_of_type(
+ snowpark_table_name, session, table_type, table_schema_or_error
+ )
+ _validate_schema_and_get_writer(
+ input_df, "append", snowpark_table_name, table_schema_or_error
+ ).saveAsTable(
+ table_name=snowpark_table_name,
+ mode="append",
+ column_order=_column_order_for_write,
+ )
+
+ case commands_proto.WriteOperationV2.MODE_OVERWRITE | commands_proto.WriteOperationV2.MODE_OVERWRITE_PARTITIONS:
+ # TODO: handle the filter condition for MODE_OVERWRITE
+ table_schema_or_error = _get_table_schema_or_error(
+ snowpark_table_name, session
+ )
+ _validate_table_exist_and_of_type(
+ snowpark_table_name, session, table_type, table_schema_or_error
+ )
+
+ if is_iceberg:
+ create_iceberg_table(
+ snowpark_table_name=snowpark_table_name,
+ location=write_op.options.get("location", None),
+ schema=input_df.schema,
+ snowpark_session=session,
+ mode="replace",
+ partition_by=partition_cols,
+ target_file_size=write_op.table_properties.get(
+ "write.target-file-size", None
+ ),
  )
- case commands_proto.WriteOperationV2.MODE_OVERWRITE | commands_proto.WriteOperationV2.MODE_OVERWRITE_PARTITIONS:
- # TODO: handle the filter condition for MODE_OVERWRITE
- if check_snowflake_table_existence(snowpark_table_name, session):
- # TODO: SNOW-2299414 Fix the implementation of table type check
- # if check_table_type(snowpark_table_name, session) != "ICEBERG":
- # raise AnalysisException(
- # f"Table {snowpark_table_name} is not an iceberg table"
- # )
- pass
- else:
- raise AnalysisException(
- f"[TABLE_OR_VIEW_NOT_FOUND] Table {snowpark_table_name} does not exist"
- )
- _validate_schema_and_get_writer(
- input_df, "truncate", snowpark_table_name
- ).saveAsTable(
- table_name=snowpark_table_name,
- mode="truncate",
- column_order=_column_order_for_write,
+ writer = _get_writer_for_table_creation(input_df)
+ save_mode = "append"
+ else:
+ writer = _validate_schema_and_get_writer(
+ input_df, "overwrite", snowpark_table_name, table_schema_or_error
  )
- case commands_proto.WriteOperationV2.MODE_REPLACE:
- if check_snowflake_table_existence(snowpark_table_name, session):
- create_iceberg_table(
- snowpark_table_name=snowpark_table_name,
- location=write_op.table_properties.get("location"),
- schema=input_df.schema,
- snowpark_session=session,
- mode="replace",
- )
- else:
- raise AnalysisException(
- f"Table {snowpark_table_name} does not exist"
- )
- _validate_schema_and_get_writer(
- input_df, "replace", snowpark_table_name
- ).saveAsTable(
- table_name=snowpark_table_name,
- mode="append",
- column_order=_column_order_for_write,
+ save_mode = "overwrite"
+
+ writer.saveAsTable(
+ table_name=snowpark_table_name,
+ mode=save_mode,
+ column_order=_column_order_for_write,
+ )
+
+ case commands_proto.WriteOperationV2.MODE_REPLACE:
+ table_schema_or_error = _get_table_schema_or_error(
+ snowpark_table_name, session
+ )
+ _validate_table_exist_and_of_type(
+ snowpark_table_name, session, table_type, table_schema_or_error
+ )
+
+ if is_iceberg:
+ create_iceberg_table(
+ snowpark_table_name=snowpark_table_name,
+ location=write_op.table_properties.get("location"),
+ schema=input_df.schema,
+ snowpark_session=session,
+ mode="replace",
+ partition_by=partition_cols,
+ target_file_size=write_op.table_properties.get(
+ "write.target-file-size", None
+ ),
  )
- case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
+ save_mode = "append"
+ else:
+ save_mode = "overwrite"
+
+ _validate_schema_and_get_writer(
+ input_df, "replace", snowpark_table_name, table_schema_or_error
+ ).saveAsTable(
+ table_name=snowpark_table_name,
+ mode=save_mode,
+ column_order=_column_order_for_write,
+ )
+
+ case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
+ if is_iceberg:
  create_iceberg_table(
  snowpark_table_name=snowpark_table_name,
  location=write_op.table_properties.get("location"),
  schema=input_df.schema,
  snowpark_session=session,
  mode="create_or_replace",
+ partition_by=partition_cols,
+ target_file_size=write_op.table_properties.get(
+ "write.target-file-size", None
+ ),
  )
- _validate_schema_and_get_writer(
- input_df, "create_or_replace", snowpark_table_name
- ).saveAsTable(
- table_name=snowpark_table_name,
- mode="append",
- column_order=_column_order_for_write,
- )
- case _:
- raise SnowparkConnectNotImplementedError(
- f"Write mode {commands_proto.WriteOperationV2.Mode.Name(write_op.mode)} is not supported"
- )
- else:
- match write_op.mode:
- case commands_proto.WriteOperationV2.MODE_CREATE:
- _validate_schema_and_get_writer(
- input_df, "errorifexists", snowpark_table_name
- ).saveAsTable(
- table_name=snowpark_table_name,
- mode="errorifexists",
- column_order=_column_order_for_write,
- )
- case commands_proto.WriteOperationV2.MODE_APPEND:
- if not check_snowflake_table_existence(snowpark_table_name, session):
- raise AnalysisException(
- f"[TABLE_OR_VIEW_NOT_FOUND] The table or view `{write_op.table_name}` cannot be found."
- )
- # TODO: SNOW-2299414 Fix the implementation of table type check
- # if check_table_type(snowpark_table_name, session) != "TABLE":
- # raise AnalysisException(
- # f"Table {snowpark_table_name} is not a FDN table"
- # )
- _validate_schema_and_get_writer(
- input_df, "append", snowpark_table_name
- ).saveAsTable(
- table_name=snowpark_table_name,
- mode="append",
- column_order=_column_order_for_write,
- )
- case commands_proto.WriteOperationV2.MODE_OVERWRITE | commands_proto.WriteOperationV2.MODE_OVERWRITE_PARTITIONS:
- # TODO: handle the filter condition for MODE_OVERWRITE
- if check_snowflake_table_existence(snowpark_table_name, session):
- # TODO: SNOW-2299414 Fix the implementation of table type check
- # if check_table_type(snowpark_table_name, session) != "TABLE":
- # raise AnalysisException(
- # f"Table {snowpark_table_name} is not a FDN table"
- # )
- pass
- else:
- raise AnalysisException(
- f"[TABLE_OR_VIEW_NOT_FOUND] Table {snowpark_table_name} does not exist"
- )
- _validate_schema_and_get_writer(
- input_df, "truncate", snowpark_table_name
- ).saveAsTable(
- table_name=snowpark_table_name,
- mode="truncate",
- column_order=_column_order_for_write,
- )
- case commands_proto.WriteOperationV2.MODE_REPLACE:
- if not check_snowflake_table_existence(snowpark_table_name, session):
- raise AnalysisException(
- f"Table {snowpark_table_name} does not exist"
- )
- _validate_schema_and_get_writer(
- input_df, "replace", snowpark_table_name
- ).saveAsTable(
- table_name=snowpark_table_name,
- mode="overwrite",
- column_order=_column_order_for_write,
- )
- case commands_proto.WriteOperationV2.MODE_CREATE_OR_REPLACE:
- _validate_schema_and_get_writer(
- input_df, "create_or_replace", snowpark_table_name
- ).saveAsTable(
- table_name=snowpark_table_name,
- mode="overwrite",
- column_order=_column_order_for_write,
- )
- case _:
- raise SnowparkConnectNotImplementedError(
- f"Write mode {commands_proto.WriteOperationV2.Mode.Name(write_op.mode)} is not supported"
- )
+ save_mode = "append"
+ else:
+ save_mode = "overwrite"
+
+ _validate_schema_and_get_writer(
+ input_df, "create_or_replace", snowpark_table_name
+ ).saveAsTable(
+ table_name=snowpark_table_name,
+ mode=save_mode,
+ column_order=_column_order_for_write,
+ )
+
+ case _:
+ exception = SnowparkConnectNotImplementedError(
+ f"Write mode {commands_proto.WriteOperationV2.Mode.Name(write_op.mode)} is not supported"
+ )
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+ raise exception
+
+
+ def _get_table_schema_or_error(
+ snowpark_table_name: str, snowpark_session: snowpark.Session
+ ) -> DataType | SnowparkSQLException:
+ try:
+ return snowpark_session.table(snowpark_table_name).schema
+ except SnowparkSQLException as e:
+ return e
+
+
+ def _get_writer_for_table_creation(df: snowpark.DataFrame) -> snowpark.DataFrameWriter:
+ # When creating a new table, if case sensitivity is not enabled, we need to rename the columns
+ # to upper case so they are case-insensitive in Snowflake.
+ if auto_uppercase_column_identifiers():
+ for field in df.schema.fields:
+ col_name = field.name
+ # Uppercasing is fine, regardless of whether the original name was quoted or not.
+ # In Snowflake these are equivalent "COL" == COL == col == coL
+ uppercased_name = col_name.upper()
+ if col_name != uppercased_name:
+ df = df.withColumnRenamed(col_name, uppercased_name)
+ return df.write


  def _validate_schema_and_get_writer(
- input_df: snowpark.DataFrame, write_mode: str, snowpark_table_name: str
+ input_df: snowpark.DataFrame,
+ write_mode: str,
+ snowpark_table_name: str,
+ table_schema_or_error: DataType | SnowparkSQLException | None = None,
  ) -> snowpark.DataFrameWriter:
  if write_mode is not None and write_mode.lower() in (
  "replace",
  "create_or_replace",
+ "overwrite",
  ):
- return input_df.write
+ return _get_writer_for_table_creation(input_df)

  table_schema = None
- try:
- table_schema = (
- get_or_create_snowpark_session().table(snowpark_table_name).schema
- )
- except SnowparkSQLException as e:
- msg = e.message
- if "SQL compilation error" in msg and "does not exist" in msg:
- pass
- else:
- raise e
+ if table_schema_or_error is not None:
+ if isinstance(table_schema_or_error, SnowparkSQLException):
+ msg = table_schema_or_error.message
+ if "SQL compilation error" in msg and "does not exist" in msg:
+ pass
+ else:
+ attach_custom_error_code(
+ table_schema_or_error, ErrorCodes.INTERNAL_ERROR
+ )
+ raise table_schema_or_error
+ elif isinstance(table_schema_or_error, DataType):
+ table_schema = table_schema_or_error
+ else:
+ try:
+ table_schema = (
+ get_or_create_snowpark_session().table(snowpark_table_name).schema
+ )
+ except SnowparkSQLException as e:
+ msg = e.message
+ if "SQL compilation error" in msg and "does not exist" in msg:
+ pass
+ else:
+ attach_custom_error_code(e, ErrorCodes.INTERNAL_ERROR)
+ raise e

  if table_schema is None:
  # If table does not exist, we can skip the schema validation
- return input_df.write
+ return _get_writer_for_table_creation(input_df)

  _validate_schema_for_append(table_schema, input_df.schema, snowpark_table_name)

  # if table exists and case sensitivity is not enabled, we need to rename the columns to match existing table schema
- if not global_config.spark_sql_caseSensitive:
+ if auto_uppercase_column_identifiers():

  for field in input_df.schema.fields:
  # Find the matching field in the table schema (case-insensitive)
@@ -670,8 +937,8 @@ def _validate_schema_and_get_writer(
670
937
  (
671
938
  f
672
939
  for f in table_schema.fields
673
- if unquote_if_quoted(f.name).lower()
674
- == unquote_if_quoted(col_name).lower()
940
+ if unquote_if_quoted(f.name).upper()
941
+ == unquote_if_quoted(col_name).upper()
675
942
  ),
676
943
  None,
677
944
  )
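The lookup above is intentionally case-insensitive: both names are unquoted and uppercased before comparison. A rough, self-contained stand-in for that check (the unquote helper below is a simplification, not the package's unquote_if_quoted):

    def unquote(name: str) -> str:
        # Simplified: strip one pair of surrounding double quotes if present.
        if len(name) >= 2 and name.startswith('"') and name.endswith('"'):
            return name[1:-1]
        return name

    def same_column(a: str, b: str) -> bool:
        return unquote(a).upper() == unquote(b).upper()

    print(same_column('"order_id"', "ORDER_ID"))  # True
    print(same_column('"order_id"', "ORDER_TS"))  # False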
@@ -706,21 +973,25 @@ def _validate_schema_for_append(
          case (StructType() as table_struct, StructType() as data_struct):

              def _comparable_col_name(col: str) -> str:
-                 name = col if global_config.spark_sql_caseSensitive else col.lower()
+                 name = col.upper() if auto_uppercase_column_identifiers() else col
                  if compare_structs:
                      return name
                  else:
                      return unquote_if_quoted(name)

              def invalid_struct_schema():
-                 raise AnalysisException(
+                 exception = AnalysisException(
                      f"Cannot resolve columns for the existing table {snowpark_table_name} ({table_schema.simple_string()}) with the data schema ({data_schema.simple_string()})."
                  )
+                 attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+                 raise exception

              if len(table_struct.fields) != len(data_struct.fields):
-                 raise AnalysisException(
+                 exception = AnalysisException(
                      f"The column number of the existing table {snowpark_table_name} ({table_schema.simple_string()}) doesn't match the data schema ({data_schema.simple_string()}).)"
                  )
+                 attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+                 raise exception

              table_field_names = {
                  _comparable_col_name(field.name) for field in table_struct.fields
@@ -783,9 +1054,24 @@ def _validate_schema_for_append(
          case (DateType(), _) if isinstance(data_schema, (DateType, TimestampType)):
              return
          case (_, _):
-             raise AnalysisException(
+             exception = AnalysisException(
                  f"[INCOMPATIBLE_DATA_FOR_TABLE.CANNOT_SAFELY_CAST] Cannot write incompatible data for the table {snowpark_table_name}: Cannot safely cast {data_schema.simple_string()} to {table_schema.simple_string()}"
              )
+             attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+             raise exception
+
+
+ def _validate_target_file_size(target_file_size: str | None):
+     # Validate that the target file size is one of the acceptable values.
+     if target_file_size is None:
+         return
+
+     if target_file_size not in TARGET_FILE_SIZE_ACCEPTABLE_VALUES:
+         exception = AnalysisException(
+             f"Invalid value '{target_file_size}' for TARGET_FILE_SIZE. Allowed values: {', '.join(TARGET_FILE_SIZE_ACCEPTABLE_VALUES)}."
+         )
+         attach_custom_error_code(exception, ErrorCodes.INVALID_CONFIG_VALUE)
+         raise exception


  def create_iceberg_table(
@@ -794,6 +1080,8 @@ def create_iceberg_table(
      schema: StructType,
      snowpark_session: snowpark.Session,
      mode: str = "create",
+     partition_by: list[str] = None,
+     target_file_size: str | None = None,
  ):
      table_schema = [
          f"{spark_to_sf_single_id(unquote_if_quoted(field.name), is_column = True)} {snowpark_to_iceberg_type(field.datatype)}"
@@ -807,7 +1095,7 @@ def create_iceberg_table(
      )
      base_location = f"BASE_LOCATION = '{location}'"

-     config_external_volume = sessions_config.get(get_session_id(), {}).get(
+     config_external_volume = sessions_config.get(get_spark_session_id(), {}).get(
          "snowpark.connect.iceberg.external_volume", None
      )
      external_volume = (
@@ -815,24 +1103,38 @@ def create_iceberg_table(
          if config_external_volume is None or config_external_volume == ""
          else f"EXTERNAL_VOLUME = '{config_external_volume}'"
      )
+     copy_grants = ""
+     partition_by_sql = (
+         f"PARTITION BY ({','.join([f'{spark_to_sf_single_id(unquote_if_quoted(p), is_column = True)}' for p in partition_by])})"
+         if partition_by
+         else ""
+     )

+     _validate_target_file_size(target_file_size)
+     target_file_size_sql = (
+         f"TARGET_FILE_SIZE = '{target_file_size}'" if target_file_size else ""
+     )
      match mode:
          case "create":
              create_sql = "CREATE"
-         case "replace":
+         case "replace" | "create_or_replace":
              # There's no replace for iceberg table, so we use create or replace
-             create_sql = "CREATE OR REPLACE"
-         case "create_or_replace":
+             copy_grants = "COPY GRANTS"
              create_sql = "CREATE OR REPLACE"
          case _:
-             raise SnowparkConnectNotImplementedError(
+             exception = SnowparkConnectNotImplementedError(
                  f"Write mode {mode} is not supported for iceberg table"
              )
+             attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+             raise exception
      sql = f"""
      {create_sql} ICEBERG TABLE {snowpark_table_name} ({",".join(table_schema)})
+     {partition_by_sql}
      CATALOG = 'SNOWFLAKE'
      {external_volume}
-     {base_location};
+     {base_location}
+     {target_file_size_sql}
+     {copy_grants};
      """
      snowpark_session.sql(sql).collect()
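For a sense of what this function assembles, here is the approximate shape of the generated DDL for hypothetical inputs; the table name, column types, volume, location, and TARGET_FILE_SIZE value below are made up for illustration:

    create_sql = "CREATE OR REPLACE"
    table_schema = ["ID LONG", "NAME STRING"]            # illustrative type mapping
    partition_by_sql = "PARTITION BY (REGION)"
    external_volume = "EXTERNAL_VOLUME = 'my_volume'"
    base_location = "BASE_LOCATION = 'my_table_location'"
    target_file_size_sql = "TARGET_FILE_SIZE = '64MB'"   # assuming '64MB' is an accepted value
    copy_grants = "COPY GRANTS"

    sql = f"""
    {create_sql} ICEBERG TABLE my_db.my_schema.my_table ({",".join(table_schema)})
    {partition_by_sql}
    CATALOG = 'SNOWFLAKE'
    {external_volume}
    {base_location}
    {target_file_size_sql}
    {copy_grants};
    """
    print(sql)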

@@ -843,39 +1145,333 @@ def rewrite_df(input_df: snowpark.DataFrame, source: str) -> snowpark.DataFrame:
      json: construct the dataframe to 1 column in json format
      1. Append columns which represents the column name
      2. Use object_construct to aggregate the dataframe into 1 column
-
+     csv:
+         Replace empty strings with "" so they are preserved in the CSV output
      """
-     if source != "json":
-         return input_df
-     rand_salt = random_string(10, "_")
-     rewritten_df = input_df.with_columns(
-         [co + rand_salt for co in input_df.columns],
-         [lit(unquote_if_quoted(co)) for co in input_df.columns],
-     )
-     construct_key_values = []
-     for co in input_df.columns:
-         construct_key_values.append(col(co + rand_salt))
-         construct_key_values.append(col(co))
-     return rewritten_df.select(object_construct(*construct_key_values))
+     match source:
+         case "json":
+             rand_salt = random_string(10, "_")
+             rewritten_df = input_df.with_columns(
+                 [co + rand_salt for co in input_df.columns],
+                 [lit(unquote_if_quoted(co)) for co in input_df.columns],
+             )
+             construct_key_values = []
+             for co in input_df.columns:
+                 construct_key_values.append(col(co + rand_salt))
+                 construct_key_values.append(col(co))
+             return rewritten_df.select(object_construct(*construct_key_values))
+         case "csv":
+             new_cols = []
+             for co in input_df.columns:
+                 if isinstance(input_df.schema[co].datatype, StringType):
+                     new_col = col(co)
+                     new_col = when(
+                         new_col.isNotNull() & (new_col == ""), lit('""')
+                     ).otherwise(new_col)
+                     new_cols.append(new_col.alias(co))
+                 else:
+                     new_cols.append(col(co))
+             return input_df.select(new_cols)
+         case _:
+             return input_df
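The json branch ends up calling object_construct with alternating name/value arguments, one pair per column; the salt only keeps the helper key columns from colliding with real column names. A plain-Python sketch of that interleaving (strings stand in for Snowpark Column objects):

    columns = ['"ID"', '"NAME"']
    rand_salt = "_x7pqk2mnab"  # illustrative; the real code uses random_string(10, "_")

    # The salted names are extra columns holding each column's literal name, so
    # object_construct receives: key, value, key, value, ...
    construct_args = []
    for co in columns:
        construct_args.append(co + rand_salt)  # column holding the name
        construct_args.append(co)              # column holding the row's value
    print(construct_args)
    # ['"ID"_x7pqk2mnab', '"ID"', '"NAME"_x7pqk2mnab', '"NAME"']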


  def handle_column_names(
      container: DataFrameContainer, source: str
- ) -> snowpark.DataFrame:
+ ) -> tuple[snowpark.DataFrame, list[str]]:
      """
      Handle column names before write so they match spark schema.
+
+     Returns:
+         A tuple of (dataframe, snowpark_column_names) where snowpark_column_names
+         are the resulting column names after any renaming.
      """
      df = container.dataframe
+     column_map = container.column_map
+
      if source == "jdbc":
          # don't change column names for jdbc sources as we directly use spark column names for writing to the destination tables.
-         return df
-     column_map = container.column_map
+         return df, column_map.get_snowpark_columns()

+     snowpark_column_names = []
      for column in column_map.columns:
-         df = df.withColumnRenamed(
-             column.snowpark_name, quote_name_without_upper_casing(column.spark_name)
+         new_name = quote_name_without_upper_casing(column.spark_name)
+         df = df.withColumnRenamed(column.snowpark_name, new_name)
+         snowpark_column_names.append(new_name)
+
+     return df, snowpark_column_names
+
+
+ def _generate_metadata_files(
+     source: str,
+     write_path: str,
+     stage_path: str,
+     schema: StructType,
+     session: snowpark.Session,
+     parameters: dict,
+     is_local_path: bool,
+ ) -> None:
+     """
+     Generate marker and metadata files after write completes.
+
+     Handles _SUCCESS marker files and Parquet _common_metadata generation
+     for both local and cloud/stage paths.
+
+     Args:
+         source: Write format (csv, parquet, json, etc.)
+         write_path: Original write path (local or cloud)
+         stage_path: Stage path where files were written
+         schema: DataFrame schema
+         session: Snowpark session
+         parameters: Write parameters
+         is_local_path: Whether writing to local filesystem
+     """
+     generate_success = get_success_file_generation_enabled()
+     generate_parquet_metadata = (
+         source == "parquet" and get_parquet_metadata_generation_enabled()
+     )
+
+     if is_local_path:
+         # Local path: write files directly
+         if generate_success:
+             _write_success_file_locally(write_path)
+         if generate_parquet_metadata:
+             _write_parquet_metadata_files_locally(write_path, schema)
+     else:
+         # Cloud/stage path: upload via stage operations
+         if generate_success:
+             _write_success_file_to_stage(stage_path, session, parameters)
+         if generate_parquet_metadata:
+             _upload_common_metadata_to_stage(stage_path, schema, session)
+
+
+ def _write_success_file_locally(directory_path: str) -> None:
+     """
+     Write a _SUCCESS marker file to a local directory.
+     """
+     try:
+         success_file = Path(directory_path) / "_SUCCESS"
+         success_file.touch()
+         logger.debug(f"Created _SUCCESS file at {directory_path}")
+     except Exception as e:
+         logger.warning(f"Failed to create _SUCCESS file at {directory_path}: {e}")
+
+
+ def _write_success_file_to_stage(
+     stage_path: str,
+     session: snowpark.Session,
+     parameters: dict,
+ ) -> None:
+     """
+     Write a _SUCCESS marker file to a stage location.
+     """
+     try:
+         # Create a dummy dataframe with one row containing "SUCCESS"
+         success_df = session.create_dataframe([["SUCCESS"]]).to_df(["STATUS"])
+         success_params = copy.deepcopy(parameters)
+
+         success_params.pop("partition_by", None)
+
+         success_params["location"] = f"{stage_path}/_SUCCESS"
+         success_params["single"] = True
+         success_params["header"] = True
+
+         # Set CSV format with explicit no compression for _SUCCESS file
+         success_params["file_format_type"] = "csv"
+         success_params["format_type_options"] = {
+             "COMPRESSION": "NONE",
+         }
+
+         success_df.write.copy_into_location(**success_params)
+
+         logger.debug(f"Created _SUCCESS file at {stage_path}")
+     except Exception as e:
+         logger.warning(f"Failed to create _SUCCESS file at {stage_path}: {e}")
+
+
+ def _get_metadata_upload_sproc() -> str:
+     """
+     Get the cached metadata upload stored procedure.
+
+     Returns:
+         Fully qualified name of the cached stored procedure
+     """
+     sproc_body = """import base64
+ import tempfile
+ import os
+
+ def upload_file(session, file_content_b64: str, file_name: str, target_stage: str):
+     import base64
+     import tempfile
+     import os
+
+     # Decode base64 content
+     file_content = base64.b64decode(file_content_b64)
+
+     # Create temp directory and write file with exact name
+     temp_dir = tempfile.mkdtemp()
+     tmp_file_path = os.path.join(temp_dir, file_name)
+
+     with open(tmp_file_path, 'wb') as f:
+         f.write(file_content)
+
+     try:
+         # Use session.file.put() - works for both internal and external stages in sproc context
+         result = session.file.put(
+             tmp_file_path,
+             target_stage,
+             auto_compress=False,
+             overwrite=True
+         )
+
+         # Extract status from result
+         if result and len(result) > 0:
+             status = result[0].status if hasattr(result[0], 'status') else str(result[0])
+         else:
+             status = "uploaded"
+
+         return "Uploaded " + file_name + " Status: " + status
+     finally:
+         # Clean up temp files
+         try:
+             os.unlink(tmp_file_path)
+             os.rmdir(temp_dir)
+         except (OSError, IOError):
+             pass"""
+
+     # Use the cached sproc system for better performance and schema/database change handling
+     return register_cached_sproc(
+         sproc_body=sproc_body,
+         handler_name="upload_file",
+         input_arg_types=["STRING", "STRING", "STRING"],
+         return_type="STRING",
+         runtime_version="3.11",
+         packages=["snowflake-snowpark-python"],
+     )
+
+
+ def _upload_file_to_stage_via_sproc(
+     local_file_path: Path, stage_path: str, session: snowpark.Session
+ ) -> None:
+     """
+     Upload a file to a stage using the reusable stored procedure. We cannot directly use session.file.put() as it doesn't support external stages.
+
+     Args:
+         local_file_path: Local file to upload
+         stage_path: Target stage path (e.g., @STAGE_NAME/path)
+         session: Snowpark session
+     """
+     import base64
+
+     sproc_name = _get_metadata_upload_sproc()
+
+     with open(local_file_path, "rb") as f:
+         file_content = f.read()
+
+     file_content_b64 = base64.b64encode(file_content).decode("utf-8")
+     file_name = "_common_metadata"
+     session.call(sproc_name, file_content_b64, file_name, stage_path)
+
+     logger.debug(f"Uploaded {file_name} to {stage_path} via stored procedure")
1376
+
1377
+
1378
+ def _upload_common_metadata_to_stage(
1379
+ stage_path: str, snowpark_schema: StructType, session: snowpark.Session
1380
+ ) -> None:
1381
+ """
1382
+ Generate and upload _common_metadata file to a stage.
1383
+
1384
+ Converts Snowpark → PySpark → Spark JSON, creates PyArrow schema with Spark metadata,
1385
+ then uploads to stage via temporary stored procedure (supports internal and external stages).
1386
+
1387
+ Args:
1388
+ stage_path: Stage path where to upload _common_metadata (e.g., @STAGE/path)
1389
+ snowpark_schema: DataFrame schema (already in memory)
1390
+ session: Snowpark session for uploading
1391
+ """
1392
+ try:
1393
+ import tempfile
1394
+
1395
+ spark_only_schema = _create_spark_schema_from_snowpark(snowpark_schema)
1396
+
1397
+ with tempfile.NamedTemporaryFile(
1398
+ suffix="_common_metadata", delete=False
1399
+ ) as tmp_file:
1400
+ tmp_path = Path(tmp_file.name)
1401
+ pq.write_metadata(spark_only_schema, tmp_path)
1402
+ _upload_file_to_stage_via_sproc(tmp_path, stage_path, session)
1403
+ tmp_path.unlink()
1404
+
1405
+ logger.debug(f"Created _common_metadata at {stage_path}")
1406
+
1407
+ except ImportError:
1408
+ logger.warning(
1409
+ "PyArrow is required to generate Parquet metadata files. "
1410
+ "Install with: pip install pyarrow"
1411
+ )
1412
+ except Exception as e:
1413
+ logger.warning(f"Failed to create _common_metadata file: {e}")
1414
+
1415
+
1416
+ def _create_spark_schema_from_snowpark(snowpark_schema: StructType) -> pa.Schema:
1417
+ """
1418
+ Create PyArrow schema with Spark metadata from Snowpark schema.
1419
+ """
1420
+ # Unquote field names (Snowpark may have quoted names like "ab")
1421
+ unquoted_fields = []
1422
+ for field in snowpark_schema.fields:
1423
+ unquoted_name = unquote_if_quoted(field.name)
1424
+ unquoted_fields.append(
1425
+ snowpark.types.StructField(
1426
+ unquoted_name, field.datatype, field.nullable, _is_column=False
1427
+ )
877
1428
  )
878
- return df
1429
+ unquoted_snowpark_schema = snowpark.types.StructType(
1430
+ unquoted_fields, structured=snowpark_schema.structured
1431
+ )
1432
+ pyspark_schema = map_snowpark_to_pyspark_types(unquoted_snowpark_schema)
1433
+ spark_schema_json = pyspark_schema.json()
1434
+
1435
+ spark_metadata = {
1436
+ b"org.apache.spark.version": SPARK_VERSION.encode("utf-8"),
1437
+ b"org.apache.spark.sql.parquet.row.metadata": spark_schema_json.encode("utf-8"),
1438
+ }
1439
+
1440
+ # Convert PySpark to PyArrow for the physical schema structure
1441
+ # NOTE: Spark reads schema from the JSON metadata above, NOT from the Parquet schema!
1442
+ # However, correct Parquet types are needed as fallback if JSON parsing fails,
1443
+ # and for compatibility with non-Spark tools (PyArrow, Dask, Presto, etc.)
1444
+ arrow_fields = []
1445
+ for field in pyspark_schema.fields:
1446
+ pa_type = map_pyspark_types_to_pyarrow_types(field.dataType)
1447
+ arrow_fields.append(pa.field(field.name, pa_type, nullable=field.nullable))
1448
+
1449
+ return pa.schema(arrow_fields, metadata=spark_metadata)
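The key detail above is that Spark recovers the logical schema from the org.apache.spark.sql.parquet.row.metadata entry rather than from the physical Parquet types. A standalone sketch of producing such a schema-only _common_metadata file with PyArrow; the two-column schema JSON and the version string are hand-written for illustration:

    import json

    import pyarrow as pa
    import pyarrow.parquet as pq

    # Hand-written Spark schema JSON for a two-column example.
    spark_schema_json = json.dumps({
        "type": "struct",
        "fields": [
            {"name": "id", "type": "long", "nullable": True, "metadata": {}},
            {"name": "name", "type": "string", "nullable": True, "metadata": {}},
        ],
    })

    schema = pa.schema(
        [pa.field("id", pa.int64()), pa.field("name", pa.string())],
        metadata={
            b"org.apache.spark.version": b"3.5.6",
            b"org.apache.spark.sql.parquet.row.metadata": spark_schema_json.encode("utf-8"),
        },
    )
    pq.write_metadata(schema, "_common_metadata")  # footer-only file, no row groups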
+
+
+ def _write_parquet_metadata_files_locally(
+     write_path: str, snowpark_schema: StructType
+ ) -> None:
+     """
+     Generate _common_metadata file for local Parquet datasets.
+
+     Only generates _common_metadata (not _metadata) for consistency with cloud paths,
+     where downloading all files for row group statistics would be inefficient.
+     """
+     try:
+         local_path = Path(write_path)
+         spark_only_schema = _create_spark_schema_from_snowpark(snowpark_schema)
+         pq.write_metadata(spark_only_schema, local_path / "_common_metadata")
+
+         logger.debug(f"Created _common_metadata at {write_path}")
+
+     except ImportError:
+         logger.warning(
+             "PyArrow is required to generate Parquet metadata files. "
+             "Install with: pip install pyarrow"
+         )
+     except Exception as e:
+         logger.warning(f"Failed to create _common_metadata file: {e}")


  def store_files_locally(
@@ -889,14 +1485,56 @@ def store_files_locally(
      )
      if overwrite and os.path.isdir(target_path):
          _truncate_directory(real_path)
-     snowpark.file_operation.FileOperation(session).get(stage_path, str(real_path))
+     # Per Snowflake docs: "The command does not preserve stage directory structure when transferring files to your client machine"
+     # https://docs.snowflake.com/en/sql-reference/sql/get
+     # Preserve directory structure under stage_path by listing files and
+     # downloading each into its corresponding local subdirectory when partition subdirs exist.
+     # Otherwise, fall back to a direct GET which flattens.
+
+     # TODO(SNOW-2326973): This can be parallelized further. It's not done here because it only affects
+     # writes to local storage.
+
+     ls_dataframe = session.sql(f"LS {stage_path}")
+     ls_iterator = ls_dataframe.toLocalIterator()
+
+     # Build a normalized base prefix from stage_path to compute relative paths.
+     # Example: stage_path='@MY_STAGE/prefix' -> base_prefix='my_stage/prefix/'
+     base_prefix = stage_path.lstrip("@").rstrip("/") + "/"
+     base_prefix_lower = base_prefix.lower()
+
+     # Group by parent directory under the base prefix, then issue a GET per directory.
+     # This gives a small parallelism advantage if we have many files per partition directory.
+     parent_dirs: set[str] = set()
+     for row in ls_iterator:
+         name: str = row[0]
+         name_lower = name.lower()
+         rel_start = name_lower.find(base_prefix_lower)
+         relative = name[rel_start + len(base_prefix) :] if rel_start != -1 else name
+         parent_dir = os.path.dirname(relative)
+         if parent_dir and parent_dir != ".":
+             parent_dirs.add(parent_dir)
+
+     # If no parent directories were discovered (non-partitioned unload prefix), use direct GET.
+     if not parent_dirs:
+         snowpark.file_operation.FileOperation(session).get(stage_path, str(real_path))
+         return
+
+     file_op = snowpark.file_operation.FileOperation(session)
+     for parent_dir in sorted(parent_dirs):
+         local_dir = real_path / parent_dir
+         os.makedirs(local_dir, exist_ok=True)
+
+         src_dir = f"@{base_prefix}{parent_dir}"
+         file_op.get(src_dir, str(local_dir))
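A self-contained sketch of the directory-grouping step above, with a hard-coded listing in place of the LS output (stage and partition names are made up):

    import os

    stage_path = "@MY_STAGE/unload/run1"
    listed = [
        "my_stage/unload/run1/year=2024/part-0000.parquet",
        "my_stage/unload/run1/year=2024/part-0001.parquet",
        "my_stage/unload/run1/year=2025/part-0000.parquet",
    ]

    base_prefix = stage_path.lstrip("@").rstrip("/") + "/"
    parent_dirs = set()
    for name in listed:
        rel_start = name.lower().find(base_prefix.lower())
        relative = name[rel_start + len(base_prefix):] if rel_start != -1 else name
        parent_dir = os.path.dirname(relative)
        if parent_dir and parent_dir != ".":
            parent_dirs.add(parent_dir)

    print(sorted(parent_dirs))  # ['year=2024', 'year=2025'] -> one GET per directory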


  def _truncate_directory(directory_path: Path) -> None:
      if not directory_path.exists():
-         raise FileNotFoundError(
+         exception = FileNotFoundError(
              f"The specified directory {directory_path} does not exist."
          )
+         attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+         raise exception
      # Iterate over all the files and directories in the specified directory
      for file in directory_path.iterdir():
          # Check if it is a file or directory and remove it
@@ -904,31 +1542,3 @@ def _truncate_directory(directory_path: Path) -> None:
              file.unlink()
          elif file.is_dir():
              shutil.rmtree(file)
-
-
- def check_snowflake_table_existence(
-     snowpark_table_name: str,
-     snowpark_session: snowpark.Session,
- ):
-     try:
-         snowpark_session.sql(f"SELECT 1 FROM {snowpark_table_name} LIMIT 1").collect()
-         return True
-     except Exception:
-         return False
-
-
- # TODO: SNOW-2299414 Fix the implementation of table type check
- # def check_table_type(
- #     snowpark_table_name: str,
- #     snowpark_session: snowpark.Session,
- # ) -> str:
- #     # currently we only support iceberg table and FDN table
- #     metadata = snowpark_session.sql(
- #         f"SHOW TABLES LIKE '{unquote_if_quoted(snowpark_table_name)}';"
- #     ).collect()
- #     if metadata is None or len(metadata) == 0:
- #         raise AnalysisException(f"Table {snowpark_table_name} does not exist")
- #     metadata = metadata[0]
- #     if metadata.as_dict().get("is_iceberg") == "Y":
- #         return "ICEBERG"
- #     return "TABLE"