snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. snowflake/snowpark_connect/__init__.py +1 -0
  2. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  3. snowflake/snowpark_connect/client/__init__.py +15 -0
  4. snowflake/snowpark_connect/client/error_utils.py +30 -0
  5. snowflake/snowpark_connect/client/exceptions.py +36 -0
  6. snowflake/snowpark_connect/client/query_results.py +90 -0
  7. snowflake/snowpark_connect/client/server.py +680 -0
  8. snowflake/snowpark_connect/client/utils/__init__.py +10 -0
  9. snowflake/snowpark_connect/client/utils/session.py +85 -0
  10. snowflake/snowpark_connect/column_name_handler.py +404 -243
  11. snowflake/snowpark_connect/column_qualifier.py +43 -0
  12. snowflake/snowpark_connect/config.py +237 -23
  13. snowflake/snowpark_connect/constants.py +2 -0
  14. snowflake/snowpark_connect/dataframe_container.py +102 -8
  15. snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
  16. snowflake/snowpark_connect/error/error_codes.py +50 -0
  17. snowflake/snowpark_connect/error/error_utils.py +172 -23
  18. snowflake/snowpark_connect/error/exceptions.py +13 -4
  19. snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
  20. snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
  21. snowflake/snowpark_connect/execute_plan/utils.py +5 -1
  22. snowflake/snowpark_connect/expression/function_defaults.py +9 -2
  23. snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
  24. snowflake/snowpark_connect/expression/literal.py +37 -13
  25. snowflake/snowpark_connect/expression/map_cast.py +123 -5
  26. snowflake/snowpark_connect/expression/map_expression.py +80 -27
  27. snowflake/snowpark_connect/expression/map_extension.py +322 -12
  28. snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
  29. snowflake/snowpark_connect/expression/map_udf.py +85 -20
  30. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
  31. snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
  32. snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
  33. snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
  34. snowflake/snowpark_connect/expression/map_window_function.py +18 -3
  35. snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
  36. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
  37. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
  38. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  39. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  40. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
  41. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
  42. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
  43. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
  44. snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
  45. snowflake/snowpark_connect/relation/io_utils.py +110 -10
  46. snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
  47. snowflake/snowpark_connect/relation/map_catalog.py +5 -1
  48. snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
  49. snowflake/snowpark_connect/relation/map_extension.py +263 -29
  50. snowflake/snowpark_connect/relation/map_join.py +683 -442
  51. snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
  52. snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
  53. snowflake/snowpark_connect/relation/map_relation.py +48 -19
  54. snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
  55. snowflake/snowpark_connect/relation/map_show_string.py +13 -6
  56. snowflake/snowpark_connect/relation/map_sql.py +1233 -222
  57. snowflake/snowpark_connect/relation/map_stats.py +48 -9
  58. snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
  59. snowflake/snowpark_connect/relation/map_udtf.py +14 -4
  60. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
  61. snowflake/snowpark_connect/relation/read/map_read.py +134 -43
  62. snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
  63. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
  64. snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
  65. snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
  66. snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
  67. snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
  68. snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
  69. snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
  70. snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
  71. snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
  72. snowflake/snowpark_connect/relation/read/utils.py +50 -5
  73. snowflake/snowpark_connect/relation/stage_locator.py +91 -55
  74. snowflake/snowpark_connect/relation/utils.py +128 -5
  75. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
  76. snowflake/snowpark_connect/relation/write/map_write.py +929 -319
  77. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
  78. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  79. snowflake/snowpark_connect/resources_initializer.py +110 -48
  80. snowflake/snowpark_connect/server.py +546 -456
  81. snowflake/snowpark_connect/server_common/__init__.py +500 -0
  82. snowflake/snowpark_connect/snowflake_session.py +65 -0
  83. snowflake/snowpark_connect/start_server.py +53 -5
  84. snowflake/snowpark_connect/type_mapping.py +349 -27
  85. snowflake/snowpark_connect/typed_column.py +9 -7
  86. snowflake/snowpark_connect/utils/artifacts.py +9 -8
  87. snowflake/snowpark_connect/utils/cache.py +49 -27
  88. snowflake/snowpark_connect/utils/concurrent.py +36 -1
  89. snowflake/snowpark_connect/utils/context.py +187 -37
  90. snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
  91. snowflake/snowpark_connect/utils/env_utils.py +5 -1
  92. snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
  93. snowflake/snowpark_connect/utils/identifiers.py +137 -3
  94. snowflake/snowpark_connect/utils/io_utils.py +57 -1
  95. snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
  96. snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
  97. snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
  98. snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
  99. snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
  100. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
  101. snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
  102. snowflake/snowpark_connect/utils/profiling.py +25 -8
  103. snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
  104. snowflake/snowpark_connect/utils/sequence.py +21 -0
  105. snowflake/snowpark_connect/utils/session.py +64 -28
  106. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
  107. snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
  108. snowflake/snowpark_connect/utils/telemetry.py +163 -22
  109. snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
  110. snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
  111. snowflake/snowpark_connect/utils/udf_cache.py +117 -41
  112. snowflake/snowpark_connect/utils/udf_helper.py +39 -37
  113. snowflake/snowpark_connect/utils/udf_utils.py +133 -14
  114. snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
  115. snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
  116. snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
  117. snowflake/snowpark_connect/version.py +1 -1
  118. snowflake/snowpark_decoder/dp_session.py +6 -2
  119. snowflake/snowpark_decoder/spark_decoder.py +12 -0
  120. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
  121. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
  122. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
  123. snowflake/snowpark_connect/hidden_column.py +0 -39
  124. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  125. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  126. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  127. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  128. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  129. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  130. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  131. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  132. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  133. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  134. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  135. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  136. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  137. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  138. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  139. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  140. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  141. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  142. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  143. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  144. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  145. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  146. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  147. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  148. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  149. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  150. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  151. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  152. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  153. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  154. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  155. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  156. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  157. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  158. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  159. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  160. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  161. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  162. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  163. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  164. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  165. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  166. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  167. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  168. snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
  169. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  170. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  171. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  172. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  173. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  174. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  175. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  176. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  177. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  178. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  179. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  180. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  181. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  182. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  183. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  184. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  185. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  186. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
  187. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
  188. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
  189. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
  190. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
  191. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
  192. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/read/map_read_csv.py

@@ -3,19 +3,30 @@
 #
 
 import copy
+from typing import Any
 
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
+from pyspark.errors.exceptions.base import AnalysisException
 
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
 from snowflake.snowpark.dataframe_reader import DataFrameReader
 from snowflake.snowpark.types import StringType, StructField, StructType
+from snowflake.snowpark_connect.config import global_config, str_to_bool
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.map_read import CsvReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+    get_non_metadata_fields,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
+from snowflake.snowpark_connect.utils.io_utils import cached_file_format
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
@@ -37,33 +48,68 @@ def map_read_csv(
 
     if rel.read.is_streaming is True:
         # TODO: Structured streaming implementation.
-        raise SnowparkConnectNotImplementedError(
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for CSV files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
     else:
-        snowpark_options = options.convert_to_snowpark_args()
+        converted_snowpark_options = options.convert_to_snowpark_args()
+        parse_header = converted_snowpark_options.get("PARSE_HEADER", False)
+        file_format_options = _parse_csv_snowpark_options(converted_snowpark_options)
+        file_format = cached_file_format(session, "csv", file_format_options)
+
+        snowpark_reader_options = dict()
+        snowpark_reader_options["FORMAT_NAME"] = file_format
+        snowpark_reader_options["ENFORCE_EXISTING_FILE_FORMAT"] = True
+        snowpark_reader_options["INFER_SCHEMA"] = converted_snowpark_options.get(
+            "INFER_SCHEMA", False
+        )
+        snowpark_reader_options[
+            "INFER_SCHEMA_OPTIONS"
+        ] = converted_snowpark_options.get("INFER_SCHEMA_OPTIONS", {})
+
+        # Use Try_cast to avoid schema inference errors
+        if snowpark_reader_options.get("INFER_SCHEMA", False):
+            snowpark_reader_options["TRY_CAST"] = True
+
+        apply_metadata_exclusion_pattern(converted_snowpark_options)
+        snowpark_reader_options["PATTERN"] = converted_snowpark_options.get(
+            "PATTERN", None
+        )
+
         raw_options = rel.read.data_source.options
+
         if schema is None or (
-            snowpark_options.get("PARSE_HEADER", False)
-            and raw_options.get("enforceSchema", "True").lower() == "false"
+            parse_header
+            and str(raw_options.get("enforceSchema", "True")).lower() == "false"
         ):  # Schema has to equals to header's format
-            reader = session.read.options(snowpark_options)
+            reader = add_filename_metadata_to_reader(
+                session.read.options(snowpark_reader_options), raw_options
+            )
         else:
-            reader = session.read.options(snowpark_options).schema(schema)
+            reader = add_filename_metadata_to_reader(
+                session.read.options(snowpark_reader_options).schema(schema),
+                raw_options,
+            )
         df = read_data(
             reader,
             schema,
             session,
             paths[0],
-            snowpark_options,
+            file_format_options,
+            snowpark_reader_options,
             raw_options,
+            parse_header,
         )
         if len(paths) > 1:
             # TODO: figure out if this is what Spark does.
             for p in paths[1:]:
                 df = df.union_all(reader.csv(p))
 
-        if schema is None:
+        if schema is None and not str_to_bool(
+            str(raw_options.get("inferSchema", "false"))
+        ):
             df = df.select(
                 [snowpark_fn.col(c).cast("STRING").alias(c) for c in df.schema.names]
             )
@@ -81,62 +127,226 @@ def map_read_csv(
     )
 
 
+_csv_file_format_allowed_options = {
+    "COMPRESSION",
+    "RECORD_DELIMITER",
+    "FIELD_DELIMITER",
+    "MULTI_LINE",
+    "FILE_EXTENSION",
+    "PARSE_HEADER",
+    "SKIP_HEADER",
+    "SKIP_BLANK_LINES",
+    "DATE_FORMAT",
+    "TIME_FORMAT",
+    "TIMESTAMP_FORMAT",
+    "BINARY_FORMAT",
+    "ESCAPE",
+    "ESCAPE_UNENCLOSED_FIELD",
+    "TRIM_SPACE",
+    "FIELD_OPTIONALLY_ENCLOSED_BY",
+    "NULL_IF",
+    "ERROR_ON_COLUMN_COUNT_MISMATCH",
+    "REPLACE_INVALID_CHARACTERS",
+    "EMPTY_FIELD_AS_NULL",
+    "SKIP_BYTE_ORDER_MARK",
+    "ENCODING",
+}
+
+
+def _parse_csv_snowpark_options(snowpark_options: dict[str, Any]) -> dict[str, Any]:
+    file_format_options = dict()
+    for key, value in snowpark_options.items():
+        upper_key = key.upper()
+        if upper_key in _csv_file_format_allowed_options:
+            file_format_options[upper_key] = value
+
+    # This option has to be removed, because we cannot use at the same time predefined file format and parse_header option
+    # Such combination causes snowpark to raise SQL compilation error: Invalid file format "PARSE_HEADER" is only allowed for CSV INFER_SCHEMA and MATCH_BY_COLUMN_NAME
+    parse_header = file_format_options.get("PARSE_HEADER", False)
+    if parse_header:
+        file_format_options["SKIP_HEADER"] = 1
+        del file_format_options["PARSE_HEADER"]
+
+    return file_format_options
+
+
+def _deduplicate_column_names_pyspark_style(
+    column_names: list[str], case_sensitive: bool
+) -> list[str]:
+    """
+    Deduplicate column names following PySpark's behavior in CSVUtils.scala::makeSafeHeader by appending
+    global position index to all occurrences of duplicated names.
+
+    Examples with case_sensitive=False:
+        ['ab', 'AB'] -> ['ab0', 'AB1']
+        ['ab', 'ab'] -> ['ab0', 'ab1']
+        ['a', 'b', 'A', 'c', 'B'] -> ['a0', 'b1', 'A2', 'c', 'B4'] (positions: a=0,2; b=1,4; c=3)
+
+    Examples with case_sensitive=True:
+        ['ab', 'AB'] -> ['ab', 'AB'] (no duplicates, different case)
+        ['ab', 'ab'] -> ['ab0', 'ab1'] (exact duplicates at positions 0, 1)
+        ['a', 'b', 'A', 'c', 'B'] -> ['a', 'b', 'A', 'c', 'B'] (no duplicates)
+
+    Edge cases:
+        ['a0', 'a0'] -> ['a00', 'a01'] (appends position even if name already has digits)
+        ['a', '', 'b'] -> ['a', '_c1', 'b'] (empty names become _c<position>)
+    """
+    seen = set()
+    duplicates = set()
+
+    for name in column_names:
+        # filter out nulls and apply case transformation
+        if not name:
+            continue
+        key = name if case_sensitive else name.lower()
+        if key in seen:
+            duplicates.add(key)
+        else:
+            seen.add(key)
+
+    result = []
+    for index, value in enumerate(column_names):
+        # Empty/null, append _c<index>
+        if value is None or value == "":
+            result.append(f"_c{index}")
+        # Case-insensitive duplicate, append index
+        elif not case_sensitive and value.lower() in duplicates:
+            result.append(f"{value}{index}")
+        # Case-sensitive duplicate, append index
+        elif case_sensitive and value in duplicates:
+            result.append(f"{value}{index}")
+        else:
+            result.append(value)
+
+    return result
+
+
 def get_header_names(
     session: snowpark.Session,
     path: list[str],
-    snowpark_options: dict,
+    file_format_options: dict,
+    snowpark_read_options: dict,
+    raw_options: dict,
+    parse_header: bool,
 ) -> list[str]:
-    snowpark_options_no_header = copy.copy(snowpark_options)
-    snowpark_options_no_header["PARSE_HEADER"] = False
-
-    header_df = session.read.options(snowpark_options_no_header).csv(path).limit(1)
-    header_data = header_df.collect()[0]
-    return [
-        f'"{header_data[i]}"'
-        for i in range(len(header_df.schema.fields))
-        if header_data[i] is not None
+    no_header_file_format_options = copy.copy(file_format_options)
+    no_header_file_format_options["PARSE_HEADER"] = False
+    no_header_file_format_options.pop("SKIP_HEADER", None)
+
+    file_format = cached_file_format(session, "csv", no_header_file_format_options)
+    no_header_snowpark_read_options = copy.copy(snowpark_read_options)
+    no_header_snowpark_read_options["FORMAT_NAME"] = file_format
+    no_header_snowpark_read_options.pop("INFER_SCHEMA", None)
+
+    # If we don't set this, snowpark will try to infer the schema for all rows in the csv file.
+    # Since there's no easy way to just read the header from the csv, we use this approach where we force the df reader to infer the schema for 10 rows and
+    # and we are only interested in the first row to get the header names and discard the inferred schema.
+    no_header_snowpark_read_options["INFER_SCHEMA_OPTIONS"] = {
+        "MAX_RECORDS_PER_FILE": 1,
+    }
+
+    header_df = session.read.options(no_header_snowpark_read_options).csv(path).limit(1)
+    collected_data = header_df.collect()
+
+    if len(collected_data) == 0:
+        error_msg = f"Path does not exist or contains no data: {path}"
+        user_pattern = raw_options.get("pathGlobFilter", None)
+        if user_pattern:
+            error_msg += f" (with pathGlobFilter: {user_pattern})"
+
+        exception = AnalysisException(error_msg)
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
+
+    header_data = collected_data[0]
+    num_columns = len(header_df.schema.fields)
+
+    if not parse_header:
+        # parse_header=False, use default _c0, _c1, _c2... naming for columns
+        return [f'"_c{i}"' for i in range(num_columns)]
+
+    # parse_header=True: Read first row as column names and deduplicate
+    raw_column_names = [
+        header_data[i] if header_data[i] is not None else "" for i in range(num_columns)
     ]
 
+    case_sensitive = global_config.spark_sql_caseSensitive
+    deduplicated_names = _deduplicate_column_names_pyspark_style(
+        raw_column_names, case_sensitive
+    )
+
+    return [f'"{name}"' for name in deduplicated_names]
+
 
 def read_data(
     reader: DataFrameReader,
     schema: snowpark.types.StructType | None,
     session: snowpark.Session,
     path: list[str],
-    snowpark_options: dict,
+    file_format_options: dict,
+    snowpark_read_options: dict,
     raw_options: dict,
+    parse_header: bool,
 ) -> snowpark.DataFrame:
-    df = reader.csv(path)
     filename = path.strip("/").split("/")[-1]
+
     if schema is not None:
-        if len(schema.fields) != len(df.schema.fields):
-            raise Exception(f"csv load from {filename} failed.")
-        if raw_options.get("enforceSchema", "True").lower() == "false":
+        df = reader.csv(path)
+        non_metadata_fields = get_non_metadata_fields(df.schema.fields)
+        if len(schema.fields) != len(non_metadata_fields):
+            exception = Exception(f"csv load from {filename} failed.")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_CAST)
+            raise exception
+        if str(raw_options.get("enforceSchema", "True")).lower() == "false":
             for i in range(len(schema.fields)):
                 if (
-                    schema.fields[i].name != df.schema.fields[i].name
-                    and f'"{schema.fields[i].name}"' != df.schema.fields[i].name
+                    schema.fields[i].name != non_metadata_fields[i].name
+                    and f'"{schema.fields[i].name}"' != non_metadata_fields[i].name
                 ):
-                    raise Exception("CSV header does not conform to the schema")
+                    exception = Exception("CSV header does not conform to the schema")
+                    attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+                    raise exception
         return df
 
-    headers = get_header_names(session, path, snowpark_options)
-
-    # Handle mismatch in column count between header and data
-    if (
-        len(df.schema.fields) == 1
-        and df.schema.fields[0].name.upper() == "C1"
-        and snowpark_options.get("PARSE_HEADER") is True
-        and len(headers) != len(df.schema.fields)
-    ):
-        df = (
-            session.read.options(snowpark_options)
-            .schema(StructType([StructField(h, StringType(), True) for h in headers]))
-            .csv(path)
-        )
-    elif snowpark_options.get("PARSE_HEADER") is False and len(headers) != len(
-        df.schema.fields
-    ):
-        return df.select([df.schema.fields[i].name for i in range(len(headers))])
+    headers = get_header_names(
+        session,
+        path,
+        file_format_options,
+        snowpark_read_options,
+        raw_options,
+        parse_header,
+    )
+
+    # Create schema with the column names and read CSV
+    if len(headers) > 0:
+        if (
+            not str_to_bool(str(raw_options.get("inferSchema", "false")))
+            and schema is None
+        ):
+            inferred_schema = StructType(
+                [StructField(h, StringType(), True) for h in headers]
+            )
+            df = reader.schema(inferred_schema).csv(path)
+        else:
+            df = reader.csv(path)
+        non_metadata_fields = get_non_metadata_fields(df.schema.fields)
+        if len(non_metadata_fields) != len(headers):
+            exception = Exception(
+                f"CSV header: {headers} does not conform to the schema"
+            )
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
+        if any(
+            non_metadata_fields[i].name != headers[i]
+            for i in range(len(non_metadata_fields))
+        ):
+            df = df.select(
+                [
+                    snowpark_fn.col(non_metadata_fields[i].name).alias(headers[i])
+                    for i in range(len(non_metadata_fields))
+                ]
+            )
        return df
 
-    return df
+    # Fallback: no headers, shouldn't reach here
+    return reader.csv(path)
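
For orientation, the CSV reader changes above route Spark reader options through a named Snowflake file format (cached_file_format plus FORMAT_NAME) and make header handling mirror PySpark's CSVUtils.makeSafeHeader: duplicate header names are suffixed with their column position, and headerless reads fall back to _c0, _c1, and so on. A minimal client-side sketch of the behavior this targets; the session object, stage path, and column names are hypothetical and not taken from this diff:

    # Hypothetical read through a Snowpark Connect session exposed as `spark`.
    # A header row of "id,ID,name" is expected to surface as id0, ID1, name
    # when spark.sql.caseSensitive is false, per the dedup helper's docstring.
    df = (
        spark.read
        .option("header", True)        # maps to PARSE_HEADER on the Snowflake side
        .option("inferSchema", False)  # without inference, columns are read as STRING
        .csv("@my_stage/data/people.csv")
    )
    print(df.columns)
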
snowflake/snowpark_connect/relation/read/map_read_jdbc.py

@@ -9,6 +9,8 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.jdbc_read_dbapi import JdbcDataFrameReader
 from snowflake.snowpark_connect.relation.read.utils import (
     Connection,
@@ -28,7 +30,9 @@ def create_connection(jdbc_options: dict[str, str]) -> Connection:
         return jaydebeapi.connect(driver, url, jdbc_options)
     except Exception as e:
         jpype.detachThreadFromJVM()
-        raise Exception(f"Error connecting JDBC datasource: {e}")
+        exception = Exception(f"Error connecting JDBC datasource: {e}")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
 
 def close_connection(conn: Connection) -> None:
@@ -70,17 +74,23 @@ def map_read_jdbc(
         dbtable = None
 
     if not dbtable and not query:
-        raise ValueError("Include dbtable or query is required option")
+        exception = ValueError("Include dbtable or query is required option")
+        attach_custom_error_code(exception, ErrorCodes.INSUFFICIENT_INPUT)
+        raise exception
 
     if query is not None and dbtable is not None:
-        raise ValueError(
+        exception = ValueError(
             "Not allowed to specify dbtable and query options at the same time"
         )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
 
     if query is not None and partition_column is not None:
-        raise ValueError(
+        exception = ValueError(
             "Not allowed to specify partitionColumn and query options at the same time"
         )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
 
     try:
         df = JdbcDataFrameReader(session, jdbc_options).jdbc_read_dbapi(
@@ -105,4 +115,6 @@ def map_read_jdbc(
             snowpark_column_types=[f.datatype for f in df.schema.fields],
         )
     except Exception as e:
-        raise Exception(f"Error accessing JDBC datasource for read: {e}")
+        exception = Exception(f"Error accessing JDBC datasource for read: {e}")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
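
The JDBC read changes keep the existing option validation (exactly one of dbtable or query, and no partitionColumn together with query) and attach error codes to the exceptions it raises. A hedged sketch of the reader options that validation applies to; the URL, table, and credentials are placeholders:

    # Hypothetical JDBC read via the Spark Connect client. Supplying both
    # "dbtable" and "query" would now fail with a ValueError carrying
    # ErrorCodes.INVALID_INPUT rather than a bare ValueError.
    df = (
        spark.read.format("jdbc")
        .option("url", "jdbc:postgresql://host:5432/db")
        .option("dbtable", "public.orders")  # use either dbtable ...
        # .option("query", "SELECT * FROM public.orders")  # ... or query, never both
        .option("user", "reader")
        .option("password", "secret")
        .load()
    )
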