snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. The information is provided for informational purposes only.
Files changed (200)
  1. snowflake/snowpark_connect/__init__.py +1 -0
  2. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  3. snowflake/snowpark_connect/client/__init__.py +15 -0
  4. snowflake/snowpark_connect/client/error_utils.py +30 -0
  5. snowflake/snowpark_connect/client/exceptions.py +36 -0
  6. snowflake/snowpark_connect/client/query_results.py +90 -0
  7. snowflake/snowpark_connect/client/server.py +717 -0
  8. snowflake/snowpark_connect/client/utils/__init__.py +10 -0
  9. snowflake/snowpark_connect/client/utils/session.py +85 -0
  10. snowflake/snowpark_connect/column_name_handler.py +404 -243
  11. snowflake/snowpark_connect/column_qualifier.py +43 -0
  12. snowflake/snowpark_connect/config.py +309 -26
  13. snowflake/snowpark_connect/constants.py +2 -0
  14. snowflake/snowpark_connect/dataframe_container.py +102 -8
  15. snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
  16. snowflake/snowpark_connect/error/error_codes.py +50 -0
  17. snowflake/snowpark_connect/error/error_utils.py +172 -23
  18. snowflake/snowpark_connect/error/exceptions.py +13 -4
  19. snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
  20. snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
  21. snowflake/snowpark_connect/execute_plan/utils.py +5 -1
  22. snowflake/snowpark_connect/expression/error_utils.py +28 -0
  23. snowflake/snowpark_connect/expression/function_defaults.py +9 -2
  24. snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
  25. snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
  26. snowflake/snowpark_connect/expression/literal.py +37 -13
  27. snowflake/snowpark_connect/expression/map_cast.py +224 -15
  28. snowflake/snowpark_connect/expression/map_expression.py +80 -27
  29. snowflake/snowpark_connect/expression/map_extension.py +322 -12
  30. snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
  31. snowflake/snowpark_connect/expression/map_udf.py +86 -20
  32. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
  33. snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
  34. snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
  35. snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
  36. snowflake/snowpark_connect/expression/map_window_function.py +18 -3
  37. snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
  39. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
  43. snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
  44. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
  45. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
  46. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  47. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  48. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
  49. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
  50. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
  51. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
  52. snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
  53. snowflake/snowpark_connect/relation/io_utils.py +110 -10
  54. snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
  55. snowflake/snowpark_connect/relation/map_catalog.py +5 -1
  56. snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
  57. snowflake/snowpark_connect/relation/map_extension.py +263 -29
  58. snowflake/snowpark_connect/relation/map_join.py +683 -442
  59. snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
  60. snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
  61. snowflake/snowpark_connect/relation/map_relation.py +48 -19
  62. snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
  63. snowflake/snowpark_connect/relation/map_show_string.py +13 -6
  64. snowflake/snowpark_connect/relation/map_sql.py +1233 -222
  65. snowflake/snowpark_connect/relation/map_stats.py +48 -9
  66. snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
  67. snowflake/snowpark_connect/relation/map_udtf.py +14 -4
  68. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
  69. snowflake/snowpark_connect/relation/read/map_read.py +134 -43
  70. snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
  71. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
  72. snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
  73. snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
  74. snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
  75. snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
  76. snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
  77. snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
  78. snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
  79. snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
  80. snowflake/snowpark_connect/relation/read/utils.py +50 -5
  81. snowflake/snowpark_connect/relation/stage_locator.py +91 -55
  82. snowflake/snowpark_connect/relation/utils.py +128 -5
  83. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
  84. snowflake/snowpark_connect/relation/write/map_write.py +929 -319
  85. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
  86. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  87. snowflake/snowpark_connect/resources_initializer.py +171 -48
  88. snowflake/snowpark_connect/server.py +528 -473
  89. snowflake/snowpark_connect/server_common/__init__.py +503 -0
  90. snowflake/snowpark_connect/snowflake_session.py +65 -0
  91. snowflake/snowpark_connect/start_server.py +53 -5
  92. snowflake/snowpark_connect/type_mapping.py +349 -27
  93. snowflake/snowpark_connect/type_support.py +130 -0
  94. snowflake/snowpark_connect/typed_column.py +9 -7
  95. snowflake/snowpark_connect/utils/artifacts.py +9 -8
  96. snowflake/snowpark_connect/utils/cache.py +49 -27
  97. snowflake/snowpark_connect/utils/concurrent.py +36 -1
  98. snowflake/snowpark_connect/utils/context.py +195 -37
  99. snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
  100. snowflake/snowpark_connect/utils/env_utils.py +5 -1
  101. snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
  102. snowflake/snowpark_connect/utils/identifiers.py +137 -3
  103. snowflake/snowpark_connect/utils/io_utils.py +57 -1
  104. snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
  105. snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
  106. snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
  107. snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
  108. snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
  109. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
  110. snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
  111. snowflake/snowpark_connect/utils/profiling.py +25 -8
  112. snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
  113. snowflake/snowpark_connect/utils/sequence.py +21 -0
  114. snowflake/snowpark_connect/utils/session.py +64 -28
  115. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
  116. snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
  117. snowflake/snowpark_connect/utils/telemetry.py +192 -40
  118. snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
  119. snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
  120. snowflake/snowpark_connect/utils/udf_cache.py +117 -41
  121. snowflake/snowpark_connect/utils/udf_helper.py +39 -37
  122. snowflake/snowpark_connect/utils/udf_utils.py +133 -14
  123. snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
  124. snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
  125. snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
  126. snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
  127. snowflake/snowpark_connect/version.py +1 -1
  128. snowflake/snowpark_decoder/dp_session.py +6 -2
  129. snowflake/snowpark_decoder/spark_decoder.py +12 -0
  130. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
  131. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
  132. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
  133. snowflake/snowpark_connect/hidden_column.py +0 -39
  134. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  135. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  136. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  137. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  138. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  139. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  140. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  141. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  142. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  143. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  144. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  145. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  146. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  147. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  148. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  149. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  150. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  151. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  152. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  153. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  154. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  155. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  156. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  157. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  158. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  159. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  160. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  161. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  162. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  163. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  164. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  165. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  166. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  167. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  168. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  169. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  170. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  171. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  172. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  173. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  174. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  175. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  176. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  177. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  178. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  179. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  180. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  181. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  182. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  183. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  184. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  185. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  186. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  187. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  188. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  189. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  190. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  191. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  192. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  193. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  194. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
  195. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
  196. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
  197. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
  198. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
  199. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
  200. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/read/map_read_csv.py

@@ -3,19 +3,45 @@
 #
 
 import copy
+from typing import Any
 
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
+from pyspark.errors.exceptions.base import AnalysisException
 
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
 from snowflake.snowpark.dataframe_reader import DataFrameReader
-from snowflake.snowpark.types import StringType, StructField, StructType
+from snowflake.snowpark.types import (
+    DataType,
+    DecimalType,
+    DoubleType,
+    IntegerType,
+    LongType,
+    StringType,
+    StructField,
+    StructType,
+    _FractionalType,
+    _IntegralType,
+)
+from snowflake.snowpark_connect.config import global_config, str_to_bool
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.map_read import CsvReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+    get_non_metadata_fields,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
+from snowflake.snowpark_connect.type_support import (
+    _integral_types_conversion_enabled,
+    emulate_integral_types,
+)
+from snowflake.snowpark_connect.utils.io_utils import cached_file_format
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
@@ -37,33 +63,68 @@ def map_read_csv(
 
     if rel.read.is_streaming is True:
         # TODO: Structured streaming implementation.
-        raise SnowparkConnectNotImplementedError(
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for CSV files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
     else:
-        snowpark_options = options.convert_to_snowpark_args()
+        converted_snowpark_options = options.convert_to_snowpark_args()
+        parse_header = converted_snowpark_options.get("PARSE_HEADER", False)
+        file_format_options = _parse_csv_snowpark_options(converted_snowpark_options)
+        file_format = cached_file_format(session, "csv", file_format_options)
+
+        snowpark_reader_options = dict()
+        snowpark_reader_options["FORMAT_NAME"] = file_format
+        snowpark_reader_options["ENFORCE_EXISTING_FILE_FORMAT"] = True
+        snowpark_reader_options["INFER_SCHEMA"] = converted_snowpark_options.get(
+            "INFER_SCHEMA", False
+        )
+        snowpark_reader_options[
+            "INFER_SCHEMA_OPTIONS"
+        ] = converted_snowpark_options.get("INFER_SCHEMA_OPTIONS", {})
+
+        # Use Try_cast to avoid schema inference errors
+        if snowpark_reader_options.get("INFER_SCHEMA", False):
+            snowpark_reader_options["TRY_CAST"] = True
+
+        apply_metadata_exclusion_pattern(converted_snowpark_options)
+        snowpark_reader_options["PATTERN"] = converted_snowpark_options.get(
+            "PATTERN", None
+        )
+
         raw_options = rel.read.data_source.options
+
         if schema is None or (
-            snowpark_options.get("PARSE_HEADER", False)
-            and raw_options.get("enforceSchema", "True").lower() == "false"
+            parse_header
+            and str(raw_options.get("enforceSchema", "True")).lower() == "false"
         ):  # Schema has to equals to header's format
-            reader = session.read.options(snowpark_options)
+            reader = add_filename_metadata_to_reader(
+                session.read.options(snowpark_reader_options), raw_options
+            )
         else:
-            reader = session.read.options(snowpark_options).schema(schema)
+            reader = add_filename_metadata_to_reader(
+                session.read.options(snowpark_reader_options).schema(schema),
+                raw_options,
+            )
         df = read_data(
             reader,
             schema,
             session,
             paths[0],
-            snowpark_options,
+            file_format_options,
+            snowpark_reader_options,
             raw_options,
+            parse_header,
         )
         if len(paths) > 1:
             # TODO: figure out if this is what Spark does.
             for p in paths[1:]:
                 df = df.union_all(reader.csv(p))
 
-        if schema is None:
+        if schema is None and not str_to_bool(
+            str(raw_options.get("inferSchema", raw_options.get("inferschema", "false")))
+        ):
             df = df.select(
                 [snowpark_fn.col(c).cast("STRING").alias(c) for c in df.schema.names]
             )
@@ -77,66 +138,284 @@ def map_read_csv(
         dataframe=renamed_df,
         spark_column_names=spark_column_names,
         snowpark_column_names=snowpark_column_names,
-        snowpark_column_types=[f.datatype for f in df.schema.fields],
+        snowpark_column_types=[
+            _emulate_integral_types_for_csv(f.datatype) for f in df.schema.fields
+        ],
     )
 
 
+_csv_file_format_allowed_options = {
+    "COMPRESSION",
+    "RECORD_DELIMITER",
+    "FIELD_DELIMITER",
+    "MULTI_LINE",
+    "FILE_EXTENSION",
+    "PARSE_HEADER",
+    "SKIP_HEADER",
+    "SKIP_BLANK_LINES",
+    "DATE_FORMAT",
+    "TIME_FORMAT",
+    "TIMESTAMP_FORMAT",
+    "BINARY_FORMAT",
+    "ESCAPE",
+    "ESCAPE_UNENCLOSED_FIELD",
+    "TRIM_SPACE",
+    "FIELD_OPTIONALLY_ENCLOSED_BY",
+    "NULL_IF",
+    "ERROR_ON_COLUMN_COUNT_MISMATCH",
+    "REPLACE_INVALID_CHARACTERS",
+    "EMPTY_FIELD_AS_NULL",
+    "SKIP_BYTE_ORDER_MARK",
+    "ENCODING",
+}
+
+
+def _parse_csv_snowpark_options(snowpark_options: dict[str, Any]) -> dict[str, Any]:
+    file_format_options = dict()
+    for key, value in snowpark_options.items():
+        upper_key = key.upper()
+        if upper_key in _csv_file_format_allowed_options:
+            file_format_options[upper_key] = value
+
+    # This option has to be removed, because we cannot use at the same time predefined file format and parse_header option
+    # Such combination causes snowpark to raise SQL compilation error: Invalid file format "PARSE_HEADER" is only allowed for CSV INFER_SCHEMA and MATCH_BY_COLUMN_NAME
+    parse_header = file_format_options.get("PARSE_HEADER", False)
+    if parse_header:
+        file_format_options["SKIP_HEADER"] = 1
+        del file_format_options["PARSE_HEADER"]
+
+    return file_format_options
+
+
+def _deduplicate_column_names_pyspark_style(
+    column_names: list[str], case_sensitive: bool
+) -> list[str]:
+    """
+    Deduplicate column names following PySpark's behavior in CSVUtils.scala::makeSafeHeader by appending
+    global position index to all occurrences of duplicated names.
+
+    Examples with case_sensitive=False:
+        ['ab', 'AB'] -> ['ab0', 'AB1']
+        ['ab', 'ab'] -> ['ab0', 'ab1']
+        ['a', 'b', 'A', 'c', 'B'] -> ['a0', 'b1', 'A2', 'c', 'B4']  (positions: a=0,2; b=1,4; c=3)
+
+    Examples with case_sensitive=True:
+        ['ab', 'AB'] -> ['ab', 'AB']  (no duplicates, different case)
+        ['ab', 'ab'] -> ['ab0', 'ab1']  (exact duplicates at positions 0, 1)
+        ['a', 'b', 'A', 'c', 'B'] -> ['a', 'b', 'A', 'c', 'B']  (no duplicates)
+
+    Edge cases:
+        ['a0', 'a0'] -> ['a00', 'a01']  (appends position even if name already has digits)
+        ['a', '', 'b'] -> ['a', '_c1', 'b']  (empty names become _c<position>)
+    """
+    seen = set()
+    duplicates = set()
+
+    for name in column_names:
+        # filter out nulls and apply case transformation
+        if not name:
+            continue
+        key = name if case_sensitive else name.lower()
+        if key in seen:
+            duplicates.add(key)
+        else:
+            seen.add(key)
+
+    result = []
+    for index, value in enumerate(column_names):
+        # Empty/null, append _c<index>
+        if value is None or value == "":
+            result.append(f"_c{index}")
+        # Case-insensitive duplicate, append index
+        elif not case_sensitive and value.lower() in duplicates:
+            result.append(f"{value}{index}")
+        # Case-sensitive duplicate, append index
+        elif case_sensitive and value in duplicates:
+            result.append(f"{value}{index}")
+        else:
+            result.append(value)
+
+    return result
+
+
 def get_header_names(
     session: snowpark.Session,
     path: list[str],
-    snowpark_options: dict,
+    file_format_options: dict,
+    snowpark_read_options: dict,
+    raw_options: dict,
+    parse_header: bool,
 ) -> list[str]:
-    snowpark_options_no_header = copy.copy(snowpark_options)
-    snowpark_options_no_header["PARSE_HEADER"] = False
-
-    header_df = session.read.options(snowpark_options_no_header).csv(path).limit(1)
-    header_data = header_df.collect()[0]
-    return [
-        f'"{header_data[i]}"'
-        for i in range(len(header_df.schema.fields))
-        if header_data[i] is not None
+    no_header_file_format_options = copy.copy(file_format_options)
+    no_header_file_format_options["PARSE_HEADER"] = False
+    no_header_file_format_options.pop("SKIP_HEADER", None)
+
+    file_format = cached_file_format(session, "csv", no_header_file_format_options)
+    no_header_snowpark_read_options = copy.copy(snowpark_read_options)
+    no_header_snowpark_read_options["FORMAT_NAME"] = file_format
+    no_header_snowpark_read_options.pop("INFER_SCHEMA", None)
+
+    # If we don't set this, snowpark will try to infer the schema for all rows in the csv file.
+    # Since there's no easy way to just read the header from the csv, we use this approach where we force the df reader to infer the schema for 10 rows and
+    # and we are only interested in the first row to get the header names and discard the inferred schema.
+    no_header_snowpark_read_options["INFER_SCHEMA_OPTIONS"] = {
+        "MAX_RECORDS_PER_FILE": 1,
+    }
+
+    header_df = session.read.options(no_header_snowpark_read_options).csv(path).limit(1)
+    collected_data = header_df.collect()
+
+    if len(collected_data) == 0:
+        error_msg = f"Path does not exist or contains no data: {path}"
+        user_pattern = raw_options.get("pathGlobFilter", None)
+        if user_pattern:
+            error_msg += f" (with pathGlobFilter: {user_pattern})"
+
+        exception = AnalysisException(error_msg)
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
+
+    header_data = collected_data[0]
+    num_columns = len(header_df.schema.fields)
+
+    if not parse_header:
+        # parse_header=False, use default _c0, _c1, _c2... naming for columns
+        return [f'"_c{i}"' for i in range(num_columns)]
+
+    # parse_header=True: Read first row as column names and deduplicate
+    raw_column_names = [
+        header_data[i] if header_data[i] is not None else "" for i in range(num_columns)
     ]
 
+    case_sensitive = global_config.spark_sql_caseSensitive
+    deduplicated_names = _deduplicate_column_names_pyspark_style(
+        raw_column_names, case_sensitive
+    )
+
+    return [f'"{name}"' for name in deduplicated_names]
+
 
 def read_data(
     reader: DataFrameReader,
     schema: snowpark.types.StructType | None,
     session: snowpark.Session,
     path: list[str],
-    snowpark_options: dict,
+    file_format_options: dict,
+    snowpark_read_options: dict,
     raw_options: dict,
+    parse_header: bool,
 ) -> snowpark.DataFrame:
-    df = reader.csv(path)
     filename = path.strip("/").split("/")[-1]
+
     if schema is not None:
-        if len(schema.fields) != len(df.schema.fields):
-            raise Exception(f"csv load from {filename} failed.")
-        if raw_options.get("enforceSchema", "True").lower() == "false":
+        df = reader.csv(path)
+        non_metadata_fields = get_non_metadata_fields(df.schema.fields)
+        if len(schema.fields) != len(non_metadata_fields):
+            exception = Exception(f"csv load from {filename} failed.")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_CAST)
+            raise exception
+        if str(raw_options.get("enforceSchema", "True")).lower() == "false":
             for i in range(len(schema.fields)):
                 if (
-                    schema.fields[i].name != df.schema.fields[i].name
-                    and f'"{schema.fields[i].name}"' != df.schema.fields[i].name
+                    schema.fields[i].name != non_metadata_fields[i].name
+                    and f'"{schema.fields[i].name}"' != non_metadata_fields[i].name
                 ):
-                    raise Exception("CSV header does not conform to the schema")
+                    exception = Exception("CSV header does not conform to the schema")
+                    attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+                    raise exception
         return df
 
-    headers = get_header_names(session, path, snowpark_options)
-
-    # Handle mismatch in column count between header and data
-    if (
-        len(df.schema.fields) == 1
-        and df.schema.fields[0].name.upper() == "C1"
-        and snowpark_options.get("PARSE_HEADER") is True
-        and len(headers) != len(df.schema.fields)
-    ):
-        df = (
-            session.read.options(snowpark_options)
-            .schema(StructType([StructField(h, StringType(), True) for h in headers]))
-            .csv(path)
-        )
-    elif snowpark_options.get("PARSE_HEADER") is False and len(headers) != len(
-        df.schema.fields
-    ):
-        return df.select([df.schema.fields[i].name for i in range(len(headers))])
+    headers = get_header_names(
+        session,
+        path,
+        file_format_options,
+        snowpark_read_options,
+        raw_options,
+        parse_header,
+    )
+
+    # Create schema with the column names and read CSV
+    if len(headers) > 0:
+        if (
+            not str_to_bool(
+                str(
+                    raw_options.get(
+                        "inferSchema", raw_options.get("inferschema", "false")
+                    )
+                )
+            )
+            and schema is None
+        ):
+            inferred_schema = StructType(
+                [StructField(h, StringType(), True) for h in headers]
+            )
+            df = reader.schema(inferred_schema).csv(path)
+        else:
+            df = reader.csv(path)
+        non_metadata_fields = get_non_metadata_fields(df.schema.fields)
+        if len(non_metadata_fields) != len(headers):
+            exception = Exception(
+                f"CSV header: {headers} does not conform to the schema"
+            )
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
+        if any(
+            non_metadata_fields[i].name != headers[i]
+            for i in range(len(non_metadata_fields))
+        ):
+            df = df.select(
+                [
+                    snowpark_fn.col(non_metadata_fields[i].name).alias(headers[i])
+                    for i in range(len(non_metadata_fields))
+                ]
+            )
+        return df
+
+    # Fallback: no headers, shouldn't reach here
+    return reader.csv(path)
+
+
+def _emulate_integral_types_for_csv(t: DataType) -> DataType:
+    """
+    CSV requires different type handling to match OSS Spark CSV schema inference.
+
+    After applying emulate_integral_types, converts to Spark CSV types:
+    - IntegerType, ShortType, ByteType -> IntegerType
+    - LongType -> LongType
+    - DecimalType with scale > 0 -> DoubleType
+    - DecimalType with precision > 18 -> DecimalType (too big for long)
+    - DecimalType with precision > 9 -> LongType
+    - DecimalType with precision <= 9 -> IntegerType
+    - FloatType, DoubleType -> DoubleType
+    """
+    if not _integral_types_conversion_enabled:
+        return t
+
+    # First apply standard integral type conversion
+    t = emulate_integral_types(t)
+
+    if isinstance(t, LongType):
+        return LongType()
+
+    elif isinstance(t, _IntegralType):
+        # ByteType, ShortType, IntegerType -> IntegerType
+        return IntegerType()
+
+    elif isinstance(t, DecimalType):
+        # DecimalType with scale > 0 means it has decimal places -> DoubleType
+        if t.scale > 0:
+            return DoubleType()
+        # DecimalType with scale = 0 is integral
+        if t.precision > 18:
+            # Too big for long, keep as DecimalType
+            return DecimalType(t.precision, 0)
+        elif t.precision > 9:
+            return LongType()
+        else:
+            return IntegerType()
+
+    elif isinstance(t, _FractionalType):
+        # FloatType, DoubleType -> DoubleType
+        return DoubleType()
 
-    return df
+    return t
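
Editor's note: the reworked CSV path above is driven entirely by standard PySpark reader options. Below is a minimal client-side sketch of the options it inspects; the Spark Connect endpoint and data path are illustrative assumptions, not part of this diff.

# Illustrative only: exercises the reader options consumed by the new map_read_csv path.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()  # assumed endpoint

df = (
    spark.read
    .option("header", "true")           # header row is read and duplicate names are deduplicated PySpark-style
    .option("inferSchema", "false")     # with no schema and no inference, columns are cast to STRING
    .option("enforceSchema", "false")   # header names must match a user-supplied schema, if one is given
    .option("pathGlobFilter", "*.csv")  # echoed in the "Path does not exist or contains no data" error
    .csv("/tmp/example_data/")          # assumed path
)
df.printSchema()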
snowflake/snowpark_connect/relation/read/map_read_jdbc.py

@@ -9,11 +9,14 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.jdbc_read_dbapi import JdbcDataFrameReader
 from snowflake.snowpark_connect.relation.read.utils import (
     Connection,
     rename_columns_as_snowflake_standard,
 )
+from snowflake.snowpark_connect.type_support import emulate_integral_types
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 
 
@@ -28,7 +31,9 @@ def create_connection(jdbc_options: dict[str, str]) -> Connection:
         return jaydebeapi.connect(driver, url, jdbc_options)
     except Exception as e:
         jpype.detachThreadFromJVM()
-        raise Exception(f"Error connecting JDBC datasource: {e}")
+        exception = Exception(f"Error connecting JDBC datasource: {e}")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
 
 def close_connection(conn: Connection) -> None:
@@ -70,17 +75,23 @@ def map_read_jdbc(
         dbtable = None
 
     if not dbtable and not query:
-        raise ValueError("Include dbtable or query is required option")
+        exception = ValueError("Include dbtable or query is required option")
+        attach_custom_error_code(exception, ErrorCodes.INSUFFICIENT_INPUT)
+        raise exception
 
     if query is not None and dbtable is not None:
-        raise ValueError(
+        exception = ValueError(
             "Not allowed to specify dbtable and query options at the same time"
         )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
 
     if query is not None and partition_column is not None:
-        raise ValueError(
+        exception = ValueError(
             "Not allowed to specify partitionColumn and query options at the same time"
         )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
 
     try:
         df = JdbcDataFrameReader(session, jdbc_options).jdbc_read_dbapi(
@@ -102,7 +113,11 @@ def map_read_jdbc(
             dataframe=renamed_df,
             spark_column_names=true_names,
             snowpark_column_names=snowpark_cols,
-            snowpark_column_types=[f.datatype for f in df.schema.fields],
+            snowpark_column_types=[
+                emulate_integral_types(f.datatype) for f in df.schema.fields
+            ],
         )
     except Exception as e:
-        raise Exception(f"Error accessing JDBC datasource for read: {e}")
+        exception = Exception(f"Error accessing JDBC datasource for read: {e}")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
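
Editor's note: the map_read_jdbc changes mostly attach error codes to pre-existing validations. Below is a minimal client-side sketch of the option rules being enforced; the URL, driver, and table name are placeholders, not taken from this diff.

# Illustrative only: exactly one of dbtable / query may be set, and partitionColumn
# cannot be combined with query; violations raise ValueError (now with an attached error code).
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()  # assumed endpoint

df = (
    spark.read.format("jdbc")
    .option("url", "jdbc:postgresql://db.example.com:5432/shop")  # placeholder URL
    .option("driver", "org.postgresql.Driver")                    # placeholder driver
    .option("dbtable", "public.orders")                           # or use "query", but not both
    .load()
)
df.show()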