snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (192)
  1. snowflake/snowpark_connect/__init__.py +1 -0
  2. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  3. snowflake/snowpark_connect/client/__init__.py +15 -0
  4. snowflake/snowpark_connect/client/error_utils.py +30 -0
  5. snowflake/snowpark_connect/client/exceptions.py +36 -0
  6. snowflake/snowpark_connect/client/query_results.py +90 -0
  7. snowflake/snowpark_connect/client/server.py +680 -0
  8. snowflake/snowpark_connect/client/utils/__init__.py +10 -0
  9. snowflake/snowpark_connect/client/utils/session.py +85 -0
  10. snowflake/snowpark_connect/column_name_handler.py +404 -243
  11. snowflake/snowpark_connect/column_qualifier.py +43 -0
  12. snowflake/snowpark_connect/config.py +237 -23
  13. snowflake/snowpark_connect/constants.py +2 -0
  14. snowflake/snowpark_connect/dataframe_container.py +102 -8
  15. snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
  16. snowflake/snowpark_connect/error/error_codes.py +50 -0
  17. snowflake/snowpark_connect/error/error_utils.py +172 -23
  18. snowflake/snowpark_connect/error/exceptions.py +13 -4
  19. snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
  20. snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
  21. snowflake/snowpark_connect/execute_plan/utils.py +5 -1
  22. snowflake/snowpark_connect/expression/function_defaults.py +9 -2
  23. snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
  24. snowflake/snowpark_connect/expression/literal.py +37 -13
  25. snowflake/snowpark_connect/expression/map_cast.py +123 -5
  26. snowflake/snowpark_connect/expression/map_expression.py +80 -27
  27. snowflake/snowpark_connect/expression/map_extension.py +322 -12
  28. snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
  29. snowflake/snowpark_connect/expression/map_udf.py +85 -20
  30. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
  31. snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
  32. snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
  33. snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
  34. snowflake/snowpark_connect/expression/map_window_function.py +18 -3
  35. snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
  36. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
  37. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
  38. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  39. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  40. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
  41. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
  42. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
  43. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
  44. snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
  45. snowflake/snowpark_connect/relation/io_utils.py +110 -10
  46. snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
  47. snowflake/snowpark_connect/relation/map_catalog.py +5 -1
  48. snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
  49. snowflake/snowpark_connect/relation/map_extension.py +263 -29
  50. snowflake/snowpark_connect/relation/map_join.py +683 -442
  51. snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
  52. snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
  53. snowflake/snowpark_connect/relation/map_relation.py +48 -19
  54. snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
  55. snowflake/snowpark_connect/relation/map_show_string.py +13 -6
  56. snowflake/snowpark_connect/relation/map_sql.py +1233 -222
  57. snowflake/snowpark_connect/relation/map_stats.py +48 -9
  58. snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
  59. snowflake/snowpark_connect/relation/map_udtf.py +14 -4
  60. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
  61. snowflake/snowpark_connect/relation/read/map_read.py +134 -43
  62. snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
  63. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
  64. snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
  65. snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
  66. snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
  67. snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
  68. snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
  69. snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
  70. snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
  71. snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
  72. snowflake/snowpark_connect/relation/read/utils.py +50 -5
  73. snowflake/snowpark_connect/relation/stage_locator.py +91 -55
  74. snowflake/snowpark_connect/relation/utils.py +128 -5
  75. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
  76. snowflake/snowpark_connect/relation/write/map_write.py +929 -319
  77. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
  78. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  79. snowflake/snowpark_connect/resources_initializer.py +110 -48
  80. snowflake/snowpark_connect/server.py +546 -456
  81. snowflake/snowpark_connect/server_common/__init__.py +500 -0
  82. snowflake/snowpark_connect/snowflake_session.py +65 -0
  83. snowflake/snowpark_connect/start_server.py +53 -5
  84. snowflake/snowpark_connect/type_mapping.py +349 -27
  85. snowflake/snowpark_connect/typed_column.py +9 -7
  86. snowflake/snowpark_connect/utils/artifacts.py +9 -8
  87. snowflake/snowpark_connect/utils/cache.py +49 -27
  88. snowflake/snowpark_connect/utils/concurrent.py +36 -1
  89. snowflake/snowpark_connect/utils/context.py +187 -37
  90. snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
  91. snowflake/snowpark_connect/utils/env_utils.py +5 -1
  92. snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
  93. snowflake/snowpark_connect/utils/identifiers.py +137 -3
  94. snowflake/snowpark_connect/utils/io_utils.py +57 -1
  95. snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
  96. snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
  97. snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
  98. snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
  99. snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
  100. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
  101. snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
  102. snowflake/snowpark_connect/utils/profiling.py +25 -8
  103. snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
  104. snowflake/snowpark_connect/utils/sequence.py +21 -0
  105. snowflake/snowpark_connect/utils/session.py +64 -28
  106. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
  107. snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
  108. snowflake/snowpark_connect/utils/telemetry.py +163 -22
  109. snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
  110. snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
  111. snowflake/snowpark_connect/utils/udf_cache.py +117 -41
  112. snowflake/snowpark_connect/utils/udf_helper.py +39 -37
  113. snowflake/snowpark_connect/utils/udf_utils.py +133 -14
  114. snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
  115. snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
  116. snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
  117. snowflake/snowpark_connect/version.py +1 -1
  118. snowflake/snowpark_decoder/dp_session.py +6 -2
  119. snowflake/snowpark_decoder/spark_decoder.py +12 -0
  120. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
  121. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
  122. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
  123. snowflake/snowpark_connect/hidden_column.py +0 -39
  124. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  125. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  126. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  127. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  128. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  129. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  130. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  131. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  132. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  133. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  134. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  135. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  136. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  137. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  138. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  139. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  140. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  141. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  142. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  143. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  144. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  145. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  146. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  147. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  148. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  149. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  150. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  151. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  152. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  153. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  154. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  155. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  156. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  157. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  158. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  159. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  160. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  161. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  162. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  163. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  164. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  165. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  166. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  167. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  168. snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
  169. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  170. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  171. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  172. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  173. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  174. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  175. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  176. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  177. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  178. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  179. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  180. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  181. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  182. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  183. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  184. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  185. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  186. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
  187. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
  188. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
  189. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
  190. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
  191. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
  192. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/read/map_read_parquet.py

@@ -5,6 +5,7 @@
 import collections
 import re
 from collections.abc import Callable
+from typing import Any

 import pyspark.sql.connect.proto.relations_pb2 as relation_proto

@@ -20,12 +21,30 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     quote_name_without_upper_casing,
 )
 from snowflake.snowpark.column import METADATA_FILENAME
-from snowflake.snowpark.types import DataType, DoubleType, IntegerType, StringType
+from snowflake.snowpark.types import (
+    DataType,
+    DoubleType,
+    IntegerType,
+    StringType,
+    StructType,
+)
+from snowflake.snowpark_connect.config import external_table_location
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
+from snowflake.snowpark_connect.relation.read.map_read_partitioned_parquet import (
+    read_partitioned_parquet_from_external_table,
+    use_external_table,
+)
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+)
 from snowflake.snowpark_connect.relation.read.reader_config import ReaderWriterConfig
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     rename_columns_as_snowflake_standard,
 )
+from snowflake.snowpark_connect.utils.io_utils import cached_file_format
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
@@ -33,7 +52,7 @@ from snowflake.snowpark_connect.utils.telemetry import (

 def map_read_parquet(
     rel: relation_proto.Relation,
-    schema: str | None,
+    schema: StructType | None,
     session: snowpark.Session,
     paths: list[str],
     options: ReaderWriterConfig,
@@ -41,28 +60,62 @@ def map_read_parquet(
     """Read a Parquet file into a Snowpark DataFrame."""

     if rel.read.is_streaming is True:
-        raise SnowparkConnectNotImplementedError(
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for Parquet files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception

-    snowpark_options = options.convert_to_snowpark_args()
-    assert schema is None, "Read PARQUET does not support user schema"
+    converted_snowpark_options = options.convert_to_snowpark_args()
+    file_format_options = _parse_parquet_snowpark_options(converted_snowpark_options)
+    raw_options = rel.read.data_source.options
     assert len(paths) > 0, "Read PARQUET expects at least one path"

-    reader = session.read.options(snowpark_options)
+    snowpark_options = {
+        # Setting these two options prevents a significant number of additional CREATE TEMPORARY
+        # FILE FORMAT and DROP FILE FORMAT queries. If FORMAT_NAME is not set, the Snowpark DF reader
+        # will eagerly issue a CREATE TEMPORARY FILE FORMAT when inferring the schema of the result;
+        # if ENFORCE_EXISTING_FILE_FORMAT is not set, an additional CREATE ... command will be
+        # issued when the lazy DF is materialized by a cache_result call.
+        "FORMAT_NAME": converted_snowpark_options.get(
+            "FORMAT_NAME",
+            cached_file_format(session, "parquet", file_format_options),
+        ),
+        "ENFORCE_EXISTING_FILE_FORMAT": True,
+    }
+
+    if "PATTERN" in converted_snowpark_options:
+        snowpark_options["PATTERN"] = converted_snowpark_options.get("PATTERN")
+
+    apply_metadata_exclusion_pattern(snowpark_options)
+
+    reader = add_filename_metadata_to_reader(
+        session.read.options(snowpark_options), raw_options
+    )

     if len(paths) == 1:
-        df = _read_parquet_with_partitions(session, reader, paths[0])
+        df, read_using_external_table = _read_parquet_with_partitions(
+            session, reader, paths[0], schema, snowpark_options
+        )
+        can_be_cached = not read_using_external_table
     else:
         is_merge_schema = options.config.get("mergeschema")
-        df = _read_parquet_with_partitions(session, reader, paths[0])
+        df, read_using_external_table = _read_parquet_with_partitions(
+            session, reader, paths[0], schema, snowpark_options
+        )
+        can_be_cached = not read_using_external_table
         schema_cols = df.columns
         for p in paths[1:]:
             reader._user_schema = None
+            partition_df, read_using_external_table = _read_parquet_with_partitions(
+                session, reader, p, schema, snowpark_options
+            )
             df = df.union_all_by_name(
-                _read_parquet_with_partitions(session, reader, p),
+                partition_df,
                 allow_missing_columns=True,
             )
+            can_be_cached = can_be_cached and not read_using_external_table
+
         if not is_merge_schema:
             df = df.select(*schema_cols)

@@ -74,33 +127,89 @@ def map_read_parquet(
         spark_column_names=[analyzer_utils.unquote_if_quoted(c) for c in df.columns],
         snowpark_column_names=snowpark_column_names,
         snowpark_column_types=[f.datatype for f in df.schema.fields],
+        can_be_cached=can_be_cached,
     )


 def _read_parquet_with_partitions(
-    session: Session, reader: DataFrameReader, path: str
-) -> DataFrame:
-    """Reads parquet files and adds partition columns from subdirectories."""
+    session: Session,
+    reader: DataFrameReader,
+    path: str,
+    schema: StructType | None,
+    snowpark_options: dict[str, Any],
+) -> tuple[DataFrame, bool]:
+    """
+    Reads parquet files and adds partition columns from subdirectories.
+    Returns a tuple of read DataFrame and a boolean indicating if DataFrame was read from external table.
+    """

     partition_columns, inferred_types = _discover_partition_columns(session, path)
-    df = reader.with_metadata(METADATA_FILENAME).parquet(path)

-    if not partition_columns:
-        return df.drop(METADATA_FILENAME)
+    def _get_df() -> DataFrame:
+        if not partition_columns:
+            return reader.parquet(path)
+        else:
+            # In case of too big overhead we can always optimize by using option: MAX_FILE_COUNT and allow user to define how many files should be scanned
+            df = reader.with_metadata(METADATA_FILENAME).parquet(path)
+
+            for col_name in partition_columns:
+                quoted_col_name = quote_name_without_upper_casing(col_name)
+                escaped_col_name = re.escape(col_name)
+                regex_pattern = rf"{escaped_col_name}=([^/]+)"
+
+                raw_value = snowpark_fn.regexp_extract(
+                    METADATA_FILENAME, regex_pattern, 1
+                )
+                value_or_null = snowpark_fn.when(raw_value == "", None).otherwise(
+                    raw_value
+                )
+
+                df = df.with_column(
+                    quoted_col_name,
+                    snowpark_fn.cast(value_or_null, inferred_types[col_name]),
+                )
+            return df.drop(METADATA_FILENAME)
+
+    if use_external_table(session, path):
+        if schema is None:
+            schema = _get_df().schema
+        return (
+            read_partitioned_parquet_from_external_table(
+                session,
+                schema,
+                external_table_location(),
+                path[1:-1],
+                partition_columns,
+                inferred_types,
+                snowpark_options,
+            ),
+            True,
+        )
+    else:
+        # TODO: SNOW-2736756 support user schema
+        assert schema is None, "Read PARQUET does not support user schema"
+        return _get_df(), False

-    for col_name in partition_columns:
-        quoted_col_name = quote_name_without_upper_casing(col_name)
-        escaped_col_name = re.escape(col_name)
-        regex_pattern = rf"{escaped_col_name}=([^/]+)"

-        raw_value = snowpark_fn.regexp_extract(METADATA_FILENAME, regex_pattern, 1)
-        value_or_null = snowpark_fn.when(raw_value == "", None).otherwise(raw_value)
+_parquet_file_format_allowed_options = {
+    "COMPRESSION",
+    "SNAPPY_COMPRESSION",
+    "BINARY_AS_TEXT",
+    "TRIM_SPACE",
+    "USE_LOGICAL_TYPE",
+    "USE_VECTORIZED_SCANNER",
+    "REPLACE_INVALID_CHARACTERS",
+    "NULL_IF",
+}

-    df = df.with_column(
-        quoted_col_name, snowpark_fn.cast(value_or_null, inferred_types[col_name])
-    )

-    return df.drop(METADATA_FILENAME)
+def _parse_parquet_snowpark_options(snowpark_options: dict[str, Any]) -> dict[str, Any]:
+    file_format_options = dict()
+    for key, value in snowpark_options.items():
+        upper_key = key.upper()
+        if upper_key in _parquet_file_format_allowed_options:
+            file_format_options[upper_key] = value
+    return file_format_options


 def _extract_partitions_from_path(path: str) -> dict[str, str]:
@@ -149,10 +258,14 @@ def _discover_partition_columns(
         if i not in dir_level_to_column_name:
             dir_level_to_column_name[i] = key
         elif dir_level_to_column_name[i] != key:
-            raise ValueError(
+            exception = ValueError(
                 f"Conflicting partition column names detected: '{dir_level_to_column_name[i]}' and '{key}' "
                 f"at the same directory level"
             )
+            attach_custom_error_code(
+                exception, ErrorCodes.INVALID_OPERATION
+            )
+            raise exception

         partition_columns_values[key].add(value)

@@ -160,10 +273,12 @@ def _discover_partition_columns(
     for level in sorted(dir_level_to_column_name.keys()):
         col_name = dir_level_to_column_name[level]
         if col_name in seen_columns:
-            raise ValueError(
+            exception = ValueError(
                 f"Found partition column '{col_name}' at multiple directory levels. "
                 f"A partition column can only appear at a single level."
             )
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
         seen_columns.add(col_name)

         ordered_columns = [
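For orientation, the partition handling above keys off Hive-style `column=value` directory names pulled out of METADATA$FILENAME with a regex. A minimal standalone sketch of that extraction, written only to make the convention explicit (it is not the package's own implementation):

    import re

    def extract_partitions(path: str) -> dict[str, str]:
        # Collect Hive-style key=value path segments such as "year=2024/month=01".
        return dict(re.findall(r"([^/=]+)=([^/]+)", path))

    print(extract_partitions("@my_stage/events/year=2024/month=01/part-0000.parquet"))
    # {'year': '2024', 'month': '01'}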
snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py (new file)

@@ -0,0 +1,142 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+import re
+from copy import deepcopy
+from typing import Any
+
+from snowflake import snowpark
+from snowflake.snowpark import Session
+from snowflake.snowpark._internal.analyzer.analyzer_utils import (
+    quote_name_without_upper_casing,
+    unquote_if_quoted,
+)
+from snowflake.snowpark.functions import col, lit
+from snowflake.snowpark.types import ArrayType, DataType, MapType, StructType
+from snowflake.snowpark_connect.config import external_table_location
+from snowflake.snowpark_connect.utils.context import (
+    get_spark_session_id,
+    register_request_external_table,
+)
+from snowflake.snowpark_connect.utils.io_utils import cached_file_format
+from snowflake.snowpark_connect.utils.scala_udf_utils import map_type_to_snowflake_type
+
+STRUCTURED_TYPE_PATTERN = re.compile(r"\([^)]*\)")
+
+
+def use_external_table(session: Session, path: str) -> bool:
+    external_table_path = external_table_location()
+    stripped_path = path[1:-1]
+
+    is_external_table_path_defined = external_table_path is not None
+    is_stage = stripped_path.startswith("@")
+
+    return (
+        is_external_table_path_defined
+        and is_stage
+        and _is_external_stage(session, stripped_path)
+    )
+
+
+def _is_external_stage(session: Session, path: str) -> bool:
+    try:
+        stage_description = (
+            session.sql(f"DESCRIBE STAGE {path.split('/')[0][1:]}")
+            .filter(col('"property"') == lit("URL"))
+            .collect()
+        )
+        return stage_description[0]["property_value"] != ""
+    except Exception:
+        return False
+
+
+def _get_count_of_non_partition_path_parts(path: str) -> int:
+    count = 0
+    # First element of a path is a stage identifier we need to ignore it to count relative path parts
+    for element in path.split("/")[1:]:
+        if "=" in element:
+            break
+        count += 1
+    return count
+
+
+def read_partitioned_parquet_from_external_table(
+    session: Session,
+    schema: StructType,
+    external_table_path: str,
+    path: str,
+    partition_columns: list[str],
+    inferred_types: dict[str, DataType],
+    snowpark_options: dict[str, Any],
+) -> snowpark.DataFrame:
+    skip_path_parts = _get_count_of_non_partition_path_parts(path)
+    snowpark_partition_columns = ", ".join(
+        [quote_name_without_upper_casing(col) for col in partition_columns]
+    )
+    snowpark_typed_partition_columns = ", ".join(
+        [
+            f"{quote_name_without_upper_casing(col)} {map_type_to_snowflake_type(inferred_types[col])} as (split_part(split_part(METADATA$FILENAME, '/', {i + skip_path_parts}), '=', 2)::{map_type_to_snowflake_type(inferred_types[col])})"
+            for col, i in zip(partition_columns, range(len(partition_columns)))
+        ]
+    )
+    snowpark_schema_columns = ",".join(
+        [
+            f"{field.name} {_map_snowpark_type_to_simplified_snowflake_type(field.datatype)} as (value:{field.name}::{_map_snowpark_type_to_simplified_snowflake_type(field.datatype)})"
+            for field in schema.fields
+            if unquote_if_quoted(field.name) not in snowpark_partition_columns
+        ]
+    )
+
+    table_name = f"{external_table_path}.{quote_name_without_upper_casing(path + get_spark_session_id())}"
+    snowpark_options_copy = deepcopy(snowpark_options)
+    # These options are only used in the Snowpark Python reader, but not the actual emitted SQL.
+    snowpark_options_copy.pop("PATTERN")
+    snowpark_options_copy.pop("FORMAT_NAME")
+    snowpark_options_copy.pop("ENFORCE_EXISTING_FILE_FORMAT")
+    file_format_name = cached_file_format(session, "parquet", snowpark_options_copy)
+    session.sql(
+        f"""
+        CREATE OR REPLACE EXTERNAL TABLE {table_name} (
+            {snowpark_typed_partition_columns},
+            {snowpark_schema_columns}
+        )
+        PARTITION BY ({snowpark_partition_columns})
+        WITH LOCATION = {path}
+        FILE_FORMAT = {file_format_name}
+        PATTERN = '{snowpark_options.get('PATTERN', '.*')}'
+        AUTO_REFRESH = false
+        """
+    ).collect()
+    register_request_external_table(table_name)
+    map_fields = ", ".join(
+        [
+            f"{field.name}::{_map_snowpark_type_to_snowflake(field.datatype)} as {field.name}"
+            if isinstance(field.datatype, (StructType, MapType, ArrayType))
+            else field.name
+            for field in schema.fields
+        ]
+    )
+    return session.sql(f"SELECT {map_fields} FROM {table_name}")
+
+
+def _map_snowpark_type_to_simplified_snowflake_type(datatype: DataType) -> str:
+    if isinstance(datatype, StructType):
+        return "OBJECT"
+    elif isinstance(datatype, MapType):
+        return "VARIANT"
+    else:
+        return STRUCTURED_TYPE_PATTERN.sub("", map_type_to_snowflake_type(datatype))
+
+
+def _map_snowpark_type_to_snowflake(datatype: DataType) -> str:
+    if isinstance(datatype, StructType):
+        object_fields = ", ".join(
+            [
+                f"{field.name} { _map_snowpark_type_to_snowflake(field.datatype)}"
+                for field in datatype.fields
+            ]
+        )
+        return f"OBJECT({object_fields})"
+    else:
+        return map_type_to_snowflake_type(datatype)
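To make the SQL shape of the new helper concrete, the snippet below holds a hand-written approximation of the statement read_partitioned_parquet_from_external_table emits for a toy schema with a single "year" partition column. The database, stage, and file-format names are invented, and the exact type names depend on map_type_to_snowflake_type; treat it as a sketch, not the literal output.

    # Approximate DDL shape only; EXT_TABLES, MY_STAGE, and SNOWPARK_PARQUET_FORMAT are invented names.
    example_ddl = """
    CREATE OR REPLACE EXTERNAL TABLE EXT_TABLES."@MY_STAGE/events<session_id>" (
        "year" NUMBER as (split_part(split_part(METADATA$FILENAME, '/', 2), '=', 2)::NUMBER),
        ID NUMBER as (value:ID::NUMBER),
        NAME VARCHAR as (value:NAME::VARCHAR)
    )
    PARTITION BY ("year")
    WITH LOCATION = @MY_STAGE/events
    FILE_FORMAT = SNOWPARK_PARQUET_FORMAT
    PATTERN = '.*'
    AUTO_REFRESH = false
    """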
snowflake/snowpark_connect/relation/read/map_read_socket.py

@@ -9,6 +9,8 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto

 from snowflake import snowpark
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
@@ -30,7 +32,9 @@ def map_read_socket(
         host = options.get("host", None)
         port = options.get("port", None)
         if not host or not port:
-            raise ValueError("Host and port must be provided in options.")
+            exception = ValueError("Host and port must be provided in options.")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+            raise exception
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
             try:
                 s.connect((host, int(port)))
@@ -56,8 +60,12 @@ def map_read_socket(
                     snowpark_column_names=[snowpark_cname],
                 )
             except OSError as e:
-                raise Exception(f"Error connecting to {host}:{port} - {e}")
+                exception = Exception(f"Error connecting to {host}:{port} - {e}")
+                attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+                raise exception
     else:
-        raise SnowparkConnectNotImplementedError(
+        exception = SnowparkConnectNotImplementedError(
             "Socket reads are only supported in streaming mode."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
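The same attach-then-raise pattern recurs throughout the readers in this release. A condensed restatement using the imports introduced above; the helper name fail_invalid_input is ours, and how the attached code is later surfaced to the client is internal to error_utils.py:

    from snowflake.snowpark_connect.error.error_codes import ErrorCodes
    from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code

    def fail_invalid_input(message: str) -> None:
        # Build the exception first, tag it with a Snowpark Connect error code,
        # then raise it so the code accompanies the original message.
        exception = ValueError(message)
        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
        raise exception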
snowflake/snowpark_connect/relation/read/map_read_table.py

@@ -11,11 +11,20 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     unquote_if_quoted,
 )
 from snowflake.snowpark.exceptions import SnowparkSQLException
+from snowflake.snowpark.types import StructField, StructType
+from snowflake.snowpark_connect.column_name_handler import (
+    ColumnNameMap,
+    make_column_names_snowpark_compatible,
+)
+from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
 from snowflake.snowpark_connect.config import auto_uppercase_non_column_identifiers
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.utils import (
     rename_columns_as_snowflake_standard,
 )
+from snowflake.snowpark_connect.utils.context import get_processed_views
 from snowflake.snowpark_connect.utils.identifiers import (
     split_fully_qualified_spark_name,
 )
@@ -23,6 +32,7 @@ from snowflake.snowpark_connect.utils.session import _get_current_snowpark_sessi
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
+from snowflake.snowpark_connect.utils.temporary_view_helper import get_temp_view


 def post_process_df(
@@ -49,7 +59,7 @@ def post_process_df(
             spark_column_names=true_names,
             snowpark_column_names=snowpark_column_names,
             snowpark_column_types=[f.datatype for f in df.schema.fields],
-            column_qualifiers=[name_parts] * len(true_names)
+            column_qualifiers=[{ColumnQualifier(tuple(name_parts))} for _ in true_names]
             if source_table_name
             else None,
         )
@@ -57,22 +67,85 @@ def post_process_df(
         # Check if this is a table/view not found error
        # Snowflake error codes: 002003 (42S02) - Object does not exist or not authorized
         if hasattr(e, "sql_error_code") and e.sql_error_code == 2003:
-            raise AnalysisException(
+            exception = AnalysisException(
                 f"[TABLE_OR_VIEW_NOT_FOUND] The table or view cannot be found. {source_table_name}"
-            ) from None  # Suppress original exception to reduce message size
+            )
+            attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+            raise exception from None  # Suppress original exception to reduce message size
         # Re-raise if it's not a table not found error
         raise


+def _get_temporary_view(
+    temp_view: DataFrameContainer, table_name: str, plan_id: int
+) -> DataFrameContainer:
+    fields_names = [field.name for field in temp_view.dataframe.schema.fields]
+    fields_types = [field.datatype for field in temp_view.dataframe.schema.fields]
+
+    snowpark_column_names = make_column_names_snowpark_compatible(
+        temp_view.column_map.get_spark_columns(), plan_id
+    )
+    # Rename columns in dataframe to prevent conflicting names during joins
+    renamed_df = temp_view.dataframe.select(
+        *(
+            temp_view.dataframe.col(orig).alias(alias)
+            for orig, alias in zip(fields_names, snowpark_column_names)
+        )
+    )
+    # do not flatten initial rename when reading table
+    # TODO: remove once SNOW-2203826 is done
+    if renamed_df._select_statement is not None:
+        renamed_df._select_statement.flatten_disabled = True
+
+    new_column_map = ColumnNameMap(
+        spark_column_names=temp_view.column_map.get_spark_columns(),
+        snowpark_column_names=snowpark_column_names,
+        column_metadata=temp_view.column_map.column_metadata,
+        column_qualifiers=[
+            {ColumnQualifier(tuple(split_fully_qualified_spark_name(table_name)))}
+            for _ in range(len(temp_view.column_map.get_spark_columns()))
+        ],
+        parent_column_name_map=temp_view.column_map.get_parent_column_name_map(),
+    )
+
+    schema = StructType(
+        [
+            StructField(name, type, _is_column=False)
+            for name, type in zip(snowpark_column_names, fields_types)
+        ]
+    )
+    return DataFrameContainer(
+        dataframe=renamed_df,
+        column_map=new_column_map,
+        table_name=temp_view.table_name,
+        alias=temp_view.alias,
+        partition_hint=temp_view.partition_hint,
+        cached_schema_getter=lambda: schema,
+    )
+
+
 def get_table_from_name(
     table_name: str, session: snowpark.Session, plan_id: int
 ) -> DataFrameContainer:
     """Get table from name returning a container."""
+
+    # Verify if recursive view read is not attempted
+    if table_name in get_processed_views():
+        exception = AnalysisException(
+            f"[RECURSIVE_VIEW] Recursive view `{table_name}` detected (cycle: `{table_name}` -> `{table_name}`)"
+        )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+        raise exception
+
     snowpark_name = ".".join(
         quote_name_without_upper_casing(part)
         for part in split_fully_qualified_spark_name(table_name)
     )

+    temp_view = get_temp_view(snowpark_name)
+    if temp_view:
+        return _get_temporary_view(temp_view, table_name, plan_id)
+
     if auto_uppercase_non_column_identifiers():
         snowpark_name = snowpark_name.upper()

@@ -101,10 +174,14 @@ def map_read_table(
         and rel.read.data_source.format.lower() == "iceberg"
     ):
         if len(rel.read.data_source.paths) != 1:
-            raise SnowparkConnectNotImplementedError(
+            exception = SnowparkConnectNotImplementedError(
                 f"Unexpected paths: {rel.read.data_source.paths}"
             )
+            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+            raise exception
         table_identifier = rel.read.data_source.paths[0]
     else:
-        raise ValueError("The relation must have a table identifier.")
+        exception = ValueError("The relation must have a table identifier.")
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
     return get_table_from_name(table_identifier, session, rel.common.plan_id)
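From the PySpark client's side, the new temp-view branch in get_table_from_name changes how unqualified names resolve. A hypothetical client-side sketch against a Snowpark Connect session named spark (view name invented):

    # Register a temporary view, then read it back by name.
    spark.range(3).createOrReplaceTempView("people")

    # The name now resolves through get_temp_view() and the renamed DataFrame
    # built by _get_temporary_view, instead of a Snowflake table lookup.
    spark.table("people").show()

    # A view whose definition reads itself is rejected up front with:
    # AnalysisException: [RECURSIVE_VIEW] Recursive view `people` detected (cycle: `people` -> `people`)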
snowflake/snowpark_connect/relation/read/map_read_text.py

@@ -8,6 +8,8 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto

 from snowflake import snowpark
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.utils import (
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
@@ -24,11 +26,17 @@ def get_file_paths_from_stage(
 ) -> typing.List[str]:
     files_paths = []
     for listed_path_row in session.sql(f"LIST {path}").collect():
+        # Skip _SUCCESS marker files
+        if listed_path_row[0].endswith("_SUCCESS"):
+            continue
+
         listed_path = listed_path_row[0].split("/")
         if listed_path_row[0].startswith("s3://") or listed_path_row[0].startswith(
             "s3a://"
         ):
             listed_path = listed_path[3:]
+        elif listed_path_row[0].startswith("azure://"):
+            listed_path = listed_path[4:]
         else:
             listed_path = listed_path[1:]
         files_paths.append("/".join(listed_path))
@@ -43,7 +51,12 @@ def read_text(
 ) -> snowpark.DataFrame:
     # TODO: handle stage name with double quotes
     files_paths = get_file_paths_from_stage(path, session)
-    stage_name = path.split("/")[0]
+    # Remove matching quotes from both ends of the path to get the stage name, if present.
+    if path and len(path) > 1 and path[0] == path[-1] and path[0] in ('"', "'"):
+        unquoted_path = path[1:-1]
+    else:
+        unquoted_path = path
+    stage_name = unquoted_path.split("/")[0]
     line_sep = options.get("lineSep") or "\n"
     column_name = (
         schema[0].name if schema is not None and len(schema.fields) > 0 else '"value"'
@@ -59,7 +72,7 @@ def read_text(
     )
     for fp in files_paths:
         content = session.sql(
-            f"SELECT T.$1 AS {default_column_name} FROM {stage_name}/{fp} (FILE_FORMAT => {text_file_format}) AS T"
+            f"SELECT T.$1 AS {default_column_name} FROM '{stage_name}/{fp}' (FILE_FORMAT => {text_file_format}) AS T"
         ).collect()
         for row in content:
             result.append(row[0])
@@ -77,9 +90,11 @@ def map_read_text(
     """
     if rel.read.is_streaming is True:
         # TODO: Structured streaming implementation.
-        raise SnowparkConnectNotImplementedError(
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for CSV files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception

     df = read_text(paths[0], schema, session, rel.read.data_source.options)
     if len(paths) > 1:
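The quote-stripping added to read_text is easiest to see in isolation. A standalone restatement of the same check (the helper name is ours, not the package's):

    def strip_matching_quotes(path: str) -> str:
        # Drop a single pair of matching quotes wrapping the whole path, if present.
        if path and len(path) > 1 and path[0] == path[-1] and path[0] in ('"', "'"):
            return path[1:-1]
        return path

    assert strip_matching_quotes("'@my_stage/data/file.txt'") == "@my_stage/data/file.txt"
    assert strip_matching_quotes("@my_stage/data/file.txt") == "@my_stage/data/file.txt"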