snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. snowflake/snowpark_connect/__init__.py +1 -0
  2. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  3. snowflake/snowpark_connect/client/__init__.py +15 -0
  4. snowflake/snowpark_connect/client/error_utils.py +30 -0
  5. snowflake/snowpark_connect/client/exceptions.py +36 -0
  6. snowflake/snowpark_connect/client/query_results.py +90 -0
  7. snowflake/snowpark_connect/client/server.py +680 -0
  8. snowflake/snowpark_connect/client/utils/__init__.py +10 -0
  9. snowflake/snowpark_connect/client/utils/session.py +85 -0
  10. snowflake/snowpark_connect/column_name_handler.py +404 -243
  11. snowflake/snowpark_connect/column_qualifier.py +43 -0
  12. snowflake/snowpark_connect/config.py +237 -23
  13. snowflake/snowpark_connect/constants.py +2 -0
  14. snowflake/snowpark_connect/dataframe_container.py +102 -8
  15. snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
  16. snowflake/snowpark_connect/error/error_codes.py +50 -0
  17. snowflake/snowpark_connect/error/error_utils.py +172 -23
  18. snowflake/snowpark_connect/error/exceptions.py +13 -4
  19. snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
  20. snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
  21. snowflake/snowpark_connect/execute_plan/utils.py +5 -1
  22. snowflake/snowpark_connect/expression/function_defaults.py +9 -2
  23. snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
  24. snowflake/snowpark_connect/expression/literal.py +37 -13
  25. snowflake/snowpark_connect/expression/map_cast.py +123 -5
  26. snowflake/snowpark_connect/expression/map_expression.py +80 -27
  27. snowflake/snowpark_connect/expression/map_extension.py +322 -12
  28. snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
  29. snowflake/snowpark_connect/expression/map_udf.py +85 -20
  30. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
  31. snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
  32. snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
  33. snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
  34. snowflake/snowpark_connect/expression/map_window_function.py +18 -3
  35. snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
  36. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
  37. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
  38. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  39. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  40. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
  41. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
  42. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
  43. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
  44. snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
  45. snowflake/snowpark_connect/relation/io_utils.py +110 -10
  46. snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
  47. snowflake/snowpark_connect/relation/map_catalog.py +5 -1
  48. snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
  49. snowflake/snowpark_connect/relation/map_extension.py +263 -29
  50. snowflake/snowpark_connect/relation/map_join.py +683 -442
  51. snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
  52. snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
  53. snowflake/snowpark_connect/relation/map_relation.py +48 -19
  54. snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
  55. snowflake/snowpark_connect/relation/map_show_string.py +13 -6
  56. snowflake/snowpark_connect/relation/map_sql.py +1233 -222
  57. snowflake/snowpark_connect/relation/map_stats.py +48 -9
  58. snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
  59. snowflake/snowpark_connect/relation/map_udtf.py +14 -4
  60. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
  61. snowflake/snowpark_connect/relation/read/map_read.py +134 -43
  62. snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
  63. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
  64. snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
  65. snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
  66. snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
  67. snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
  68. snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
  69. snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
  70. snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
  71. snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
  72. snowflake/snowpark_connect/relation/read/utils.py +50 -5
  73. snowflake/snowpark_connect/relation/stage_locator.py +91 -55
  74. snowflake/snowpark_connect/relation/utils.py +128 -5
  75. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
  76. snowflake/snowpark_connect/relation/write/map_write.py +929 -319
  77. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
  78. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  79. snowflake/snowpark_connect/resources_initializer.py +110 -48
  80. snowflake/snowpark_connect/server.py +546 -456
  81. snowflake/snowpark_connect/server_common/__init__.py +500 -0
  82. snowflake/snowpark_connect/snowflake_session.py +65 -0
  83. snowflake/snowpark_connect/start_server.py +53 -5
  84. snowflake/snowpark_connect/type_mapping.py +349 -27
  85. snowflake/snowpark_connect/typed_column.py +9 -7
  86. snowflake/snowpark_connect/utils/artifacts.py +9 -8
  87. snowflake/snowpark_connect/utils/cache.py +49 -27
  88. snowflake/snowpark_connect/utils/concurrent.py +36 -1
  89. snowflake/snowpark_connect/utils/context.py +187 -37
  90. snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
  91. snowflake/snowpark_connect/utils/env_utils.py +5 -1
  92. snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
  93. snowflake/snowpark_connect/utils/identifiers.py +137 -3
  94. snowflake/snowpark_connect/utils/io_utils.py +57 -1
  95. snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
  96. snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
  97. snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
  98. snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
  99. snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
  100. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
  101. snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
  102. snowflake/snowpark_connect/utils/profiling.py +25 -8
  103. snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
  104. snowflake/snowpark_connect/utils/sequence.py +21 -0
  105. snowflake/snowpark_connect/utils/session.py +64 -28
  106. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
  107. snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
  108. snowflake/snowpark_connect/utils/telemetry.py +163 -22
  109. snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
  110. snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
  111. snowflake/snowpark_connect/utils/udf_cache.py +117 -41
  112. snowflake/snowpark_connect/utils/udf_helper.py +39 -37
  113. snowflake/snowpark_connect/utils/udf_utils.py +133 -14
  114. snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
  115. snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
  116. snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
  117. snowflake/snowpark_connect/version.py +1 -1
  118. snowflake/snowpark_decoder/dp_session.py +6 -2
  119. snowflake/snowpark_decoder/spark_decoder.py +12 -0
  120. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
  121. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
  122. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
  123. snowflake/snowpark_connect/hidden_column.py +0 -39
  124. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  125. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  126. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  127. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  128. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  129. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  130. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  131. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  132. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  133. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  134. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  135. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  136. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  137. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  138. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  139. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  140. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  141. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  142. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  143. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  144. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  145. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  146. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  147. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  148. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  149. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  150. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  151. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  152. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  153. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  154. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  155. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  156. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  157. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  158. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  159. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  160. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  161. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  162. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  163. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  164. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  165. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  166. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  167. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  168. snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
  169. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  170. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  171. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  172. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  173. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  174. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  175. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  176. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  177. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  178. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  179. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  180. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  181. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  182. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  183. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  184. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  185. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  186. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
  187. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
  188. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
  189. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
  190. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
  191. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
  192. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,170 @@
1
+ #
2
+ # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
3
+ #
4
+
5
+ """
6
+ Utilities for handling internal metadata columns in file-based DataFrames.
7
+ """
8
+
9
+ import os
10
+
11
+ import pandas
12
+ from pyspark.errors.exceptions.base import AnalysisException
13
+
14
+ from snowflake import snowpark
15
+ from snowflake.snowpark.column import METADATA_FILENAME
16
+ from snowflake.snowpark.functions import col
17
+ from snowflake.snowpark.types import StructField, StructType
18
+ from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
19
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
20
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
21
+
22
+ # Constant for the metadata filename column name
23
+ METADATA_FILENAME_COLUMN = "METADATA$FILENAME"
24
+
25
+
26
def add_filename_metadata_to_reader(
    reader: snowpark.DataFrameReader,
    options: dict | None = None,
) -> snowpark.DataFrameReader:
    """
    Enable the METADATA$FILENAME column on a DataFrameReader when requested.

    The switch is read from the ``snowpark.populateFileMetadata`` option. When
    that option is absent (or ``options`` is empty/None) the value of the
    ``SNOWPARK_POPULATE_FILE_METADATA_DEFAULT`` environment variable is used,
    which itself defaults to ``"false"``. Metadata is enabled only when the
    resolved value, compared case-insensitively, equals ``"true"``.

    Args:
        reader: Snowpark DataFrameReader instance
        options: Dictionary of options to check for metadata configuration

    Returns:
        ``reader.with_metadata(METADATA_FILENAME)`` if metadata population is
        enabled, otherwise the very same ``reader`` object
    """
    # NOTE: SNOWPARK_POPULATE_FILE_METADATA_DEFAULT is an internal environment variable
    # used only for CI testing to verify no metadata columns leak in regular file operations.
    # This environment variable should NOT be exposed to end users. Users should only use
    # snowpark.populateFileMetadata to enable metadata population.
    metadata_default = os.environ.get(
        "SNOWPARK_POPULATE_FILE_METADATA_DEFAULT", "false"
    )

    # An empty or missing options dict falls through to the default.
    raw_value = (options or {}).get("snowpark.populateFileMetadata", metadata_default)

    # Option values are not guaranteed to be strings (a caller may hand over a
    # bool); normalise through str() so that .lower() can never raise.
    populate_metadata = str(raw_value).lower() == "true"

    if populate_metadata:
        return reader.with_metadata(METADATA_FILENAME)
    return reader
58
+
59
+
60
def get_non_metadata_fields(schema_fields: list[StructField]) -> list[StructField]:
    """
    Return only the schema fields that are not the METADATA$FILENAME column.

    Args:
        schema_fields: StructField objects taken from a DataFrame schema

    Returns:
        A new list holding every field whose name differs from
        METADATA$FILENAME, in the order they were given
    """
    data_fields = []
    for candidate in schema_fields:
        if candidate.name == METADATA_FILENAME_COLUMN:
            # Internal metadata column: leave it out of the result.
            continue
        data_fields.append(candidate)
    return data_fields
71
+
72
+
73
def get_non_metadata_column_names(schema_fields: list[StructField]) -> list[str]:
    """
    Return the names of all schema fields except METADATA$FILENAME.

    Args:
        schema_fields: StructField objects taken from a DataFrame schema

    Returns:
        Column names (strings) of the fields that are not METADATA$FILENAME,
        in the order they were given
    """
    # Reuse the field-level filter, then project each surviving field to its name.
    data_fields = get_non_metadata_fields(schema_fields)
    return [data_field.name for data_field in data_fields]
86
+
87
+
88
def filter_metadata_column_name(column_names: list[str]) -> list[str]:
    """
    Drop METADATA$FILENAME from a list of column names.

    Args:
        column_names: Column names to filter

    Returns:
        A new list with every entry equal to METADATA$FILENAME removed; the
        order of the remaining names is unchanged
    """
    kept_names = []
    for name in column_names:
        if name != METADATA_FILENAME_COLUMN:
            kept_names.append(name)
    return kept_names
98
+
99
+
100
def without_internal_columns(
    result_container: DataFrameContainer | pandas.DataFrame | None,
) -> DataFrameContainer | pandas.DataFrame | None:
    """
    Filters internal columns like:
    * METADATA$FILENAME from DataFrame container for execution and write operations
    * hidden columns needed for outer joins implementation

    Args:
        result_container: DataFrameContainer or pandas DataFrame to filter

    Returns:
        ``None`` and pandas DataFrames are returned as-is. For a container, the
        result of ``without_hidden_columns()`` is returned when its dataframe is
        not a Snowpark DataFrame or carries no METADATA$FILENAME column;
        otherwise a new container without that column is returned (callers can
        access the dataframe via ``container.dataframe``).

    Raises:
        AnalysisException: tagged with ``ErrorCodes.INVALID_OPERATION`` when
            METADATA$FILENAME is the only column, since nothing would remain.
    """
    # Missing results and pandas results pass through untouched.
    if result_container is None or isinstance(result_container, pandas.DataFrame):
        return result_container

    result_container = result_container.without_hidden_columns()
    result_df = result_container.dataframe
    if not isinstance(result_df, snowpark.DataFrame):
        return result_container

    column_map = result_container.column_map
    snowpark_columns = column_map.get_snowpark_columns()
    if METADATA_FILENAME_COLUMN not in snowpark_columns:
        return result_container

    non_metadata_columns = filter_metadata_column_name(snowpark_columns)
    if not non_metadata_columns:
        # DataFrame contains only metadata columns (METADATA$FILENAME), no actual data columns remaining.
        # We don't have a way to return an empty dataframe.
        exception = AnalysisException(
            "[DATAFRAME_MISSING_DATA_COLUMNS] Cannot perform operation on DataFrame that contains no data columns."
        )
        attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
        raise exception

    filtered_df = result_df.select([col(name) for name in non_metadata_columns])

    # Spark and Snowpark name lists are index-aligned, so keep the Spark name at
    # every position whose Snowpark name survives the filter.
    spark_columns = column_map.get_spark_columns()
    filtered_spark_columns = [
        spark_columns[position]
        for position, name in enumerate(snowpark_columns)
        if name != METADATA_FILENAME_COLUMN
    ]

    return DataFrameContainer.create_with_column_mapping(
        dataframe=filtered_df,
        spark_column_names=filtered_spark_columns,
        snowpark_column_names=non_metadata_columns,
        column_metadata=column_map.column_metadata,
        table_name=result_container.table_name,
        alias=result_container.alias,
        partition_hint=result_container.partition_hint,
        # we don't want to evaluate `filtered_df` schema since it will always trigger a describe query
        cached_schema_getter=lambda: StructType(
            [f for f in result_df.schema if f.name != METADATA_FILENAME_COLUMN]
        ),
    )
@@ -5,7 +5,7 @@
5
5
  from dataclasses import dataclass
6
6
  from typing import Any
7
7
 
8
- from snowflake.snowpark_connect.config import str_to_bool
8
+ from snowflake.snowpark_connect.config import global_config, str_to_bool
9
9
  from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
10
10
 
11
11
 
@@ -126,6 +126,8 @@ CSV_READ_SUPPORTED_OPTIONS = lowercase_set(
126
126
  "compression",
127
127
  # "escapeQuotes",
128
128
  # "quoteAll",
129
+ "rowsToInferSchema", # Snowflake specific option, number of rows to infer schema
130
+ "relaxTypesToInferSchema", # Snowflake specific option, whether to relax types to infer schema
129
131
  }
130
132
  )
131
133
 
@@ -201,6 +203,21 @@ def csv_convert_to_snowpark_args(snowpark_config: dict[str, Any]) -> dict[str, A
201
203
  if snowpark_config["escape"] and snowpark_config["escape"] == "\\":
202
204
  snowpark_config["escape"] = "\\\\"
203
205
 
206
+ # Snowflake specific option, number of rows to infer schema for CSV files
207
+ if "rowstoinferschema" in snowpark_config:
208
+ rows_to_infer_schema = snowpark_config["rowstoinferschema"]
209
+ del snowpark_config["rowstoinferschema"]
210
+ relax_types_to_infer_schema = True
211
+ if "relaxtypestoinferschema" in snowpark_config:
212
+ relax_types_to_infer_schema = str_to_bool(
213
+ str(snowpark_config["relaxtypestoinferschema"])
214
+ )
215
+ del snowpark_config["relaxtypestoinferschema"]
216
+ snowpark_config["INFER_SCHEMA_OPTIONS"] = {
217
+ "MAX_RECORDS_PER_FILE": int(rows_to_infer_schema),
218
+ "USE_RELAXED_TYPES": relax_types_to_infer_schema,
219
+ }
220
+
204
221
  # Rename the keys to match the Snowpark configuration.
205
222
  for spark_arg, snowpark_arg in renamed_args.items():
206
223
  if spark_arg not in snowpark_config:
@@ -339,7 +356,7 @@ class JsonReaderConfig(ReaderWriterConfig):
339
356
  "dropFieldIfAllNull",
340
357
  "encoding",
341
358
  # "locale",
342
- # "pathGlobFilter",
359
+ "pathGlobFilter",
343
360
  # "recursiveFileLookup",
344
361
  # "modifiedBefore",
345
362
  # "modifiedAfter",
@@ -366,6 +383,7 @@ class JsonReaderConfig(ReaderWriterConfig):
366
383
  "dateFormat": "DATE_FORMAT",
367
384
  "timestampFormat": "TIMESTAMP_FORMAT",
368
385
  "multiLine": "STRIP_OUTER_ARRAY",
386
+ "pathGlobFilter": "PATTERN",
369
387
  }
370
388
  renamed_args = lowercase_dict_keys(renamed_args)
371
389
  snowpark_config = super().convert_to_snowpark_args()
@@ -385,7 +403,7 @@ class ParquetReaderConfig(ReaderWriterConfig):
385
403
  default_config={},
386
404
  supported_options={
387
405
  # "mergeSchema",
388
- # "pathGlobFilter",
406
+ "pathGlobFilter",
389
407
  # "recursiveFileLookup",
390
408
  # "modifiedBefore",
391
409
  # "modifiedAfter",
@@ -402,10 +420,25 @@ class ParquetReaderConfig(ReaderWriterConfig):
402
420
  )
403
421
 
404
422
  def convert_to_snowpark_args(self) -> dict[str, Any]:
423
+ renamed_args = {
424
+ "pathGlobFilter": "PATTERN",
425
+ }
426
+ renamed_args = lowercase_dict_keys(renamed_args)
405
427
  snowpark_args = super().convert_to_snowpark_args()
406
428
 
429
+ for spark_arg, snowpark_arg in renamed_args.items():
430
+ if spark_arg not in snowpark_args:
431
+ continue
432
+ snowpark_args[snowpark_arg] = snowpark_args[spark_arg]
433
+ del snowpark_args[spark_arg]
434
+
407
435
  # Should be determined by spark.sql.parquet.binaryAsString, but currently Snowpark Connect only supports
408
436
  # the default value (false). TODO: Add support for spark.sql.parquet.binaryAsString equal to "true".
409
437
  snowpark_args["BINARY_AS_TEXT"] = False
410
438
 
439
+ # Set USE_VECTORIZED_SCANNER from global config. This will become the default in a future BCR.
440
+ snowpark_args["USE_VECTORIZED_SCANNER"] = global_config._get_config_setting(
441
+ "snowpark.connect.parquet.useVectorizedScanner"
442
+ )
443
+
411
444
  return snowpark_args
@@ -40,6 +40,47 @@ DATA_SOURCE_SQL_COMMENT = (
40
40
  INDEXED_COLUMN_NAME_PATTERN = re.compile(r"(^\"c)(\d+)(\"$)")
41
41
 
42
42
 
43
def apply_metadata_exclusion_pattern(options: dict) -> None:
    """
    Install a default PATTERN that keeps metadata and hidden files out of reads,
    matching Spark's behavior.

    The default pattern is ".*/[^_.][^/]*$|^[^_.][^/]*$". It accepts a path only
    when its final segment (or the whole path, if it has no "/") does not begin
    with "_" or ".". It therefore works for flat and partitioned layouts and for
    files with or without an extension.

    Excluded by the default pattern:
        - _SUCCESS, _metadata, _common_metadata (Spark/Parquet metadata)
        - .crc (Hadoop checksum files), .DS_Store (macOS system files), .hidden
        - year=2024/_SUCCESS (metadata inside partition directories)

    Allowed by the default pattern:
        - part-00000.parquet, data.csv, output.json (data files)
        - success, myfile (no extension, not starting with "_" or ".")
        - year=2024/month=01/part-00000.parquet (partitioned data)

    User pattern handling:
        - No PATTERN key, or a PATTERN of "*" or ".*": the default is applied.
        - Any other PATTERN: the user-provided value is kept as-is.

    Because custom patterns are kept verbatim, a user can still match metadata
    files on purpose or as a side effect, for example with "_*", "*SUCCESS*"
    or "[_.].*". This is intentional.

    Args:
        options: Dictionary of Snowpark read options (modified in place)
    """
    catch_all_patterns = ("*", ".*")
    if "PATTERN" in options and options["PATTERN"] not in catch_all_patterns:
        # A specific user pattern wins; leave the options untouched.
        return

    options["PATTERN"] = ".*/[^_.][^/]*$|^[^_.][^/]*$"
82
+
83
+
43
84
  def subtract_one(match: re.Match[str]) -> str:
44
85
  """Spark column names are 0 indexed, Snowpark is 1 indexed."""
45
86
  return f"_c{str(int(match.group(2)) - 1)}"
@@ -73,13 +114,17 @@ def rename_columns_as_snowflake_standard(
73
114
  return df, []
74
115
 
75
116
  new_columns = make_column_names_snowpark_compatible(df.columns, plan_id)
76
- return (
77
- df.select(
78
- *(df.col(orig).alias(alias) for orig, alias in zip(df.columns, new_columns))
79
- ),
80
- new_columns,
117
+ result_df = df.select(
118
+ *(df.col(orig).alias(alias) for orig, alias in zip(df.columns, new_columns))
81
119
  )
82
120
 
121
+ # do not flatten initial rename when reading table
122
+ # TODO: remove once SNOW-2203826 is done
123
+ if result_df._select_statement is not None:
124
+ result_df._select_statement.flatten_disabled = True
125
+
126
+ return result_df, new_columns
127
+
83
128
 
84
129
  class Connection(Protocol):
85
130
  """External datasource connection created from user-input create_connection function."""
@@ -5,17 +5,20 @@
5
5
  import os
6
6
 
7
7
  from fsspec.core import url_to_fs
8
+ from pyspark.errors.exceptions.base import AnalysisException
8
9
  from s3fs.core import S3FileSystem
9
10
 
10
11
  from snowflake import snowpark
11
12
  from snowflake.snowpark.session import Session
12
13
  from snowflake.snowpark_connect.config import sessions_config
14
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
15
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
13
16
  from snowflake.snowpark_connect.relation.io_utils import (
14
17
  get_cloud_from_url,
15
18
  parse_azure_url,
16
19
  )
17
20
  from snowflake.snowpark_connect.relation.utils import random_string
18
- from snowflake.snowpark_connect.utils.context import get_session_id
21
+ from snowflake.snowpark_connect.utils.context import get_spark_session_id
19
22
  from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
20
23
 
21
24
 
@@ -33,37 +36,44 @@ def get_paths_from_stage(
33
36
 
34
37
  # TODO : What if GCP?
35
38
  # TODO: What if already stage path?
36
- if get_cloud_from_url(paths[0]) == "azure":
37
- rewrite_paths = []
38
- for p in paths:
39
- _, bucket_name, path = parse_azure_url(p)
40
- rewrite_paths.append(f"{stage_name}/{path}")
41
- paths = rewrite_paths
42
- else:
43
- filesystem, parsed_path = url_to_fs(paths[0])
44
- if isinstance(filesystem, S3FileSystem): # aws
45
- # Remove bucket name from the path since the stage name will replace
46
- # the bucket name in the path.
47
- paths = [
48
- f"{stage_name}/{'/'.join(url_to_fs(p)[1].split('/')[1:])}"
49
- for p in paths
50
- ]
51
- else: # local
52
- # For local files, we need to preserve directory structure for partitioned data
53
- # Instead of just using basename, we'll use the last few path components
54
- new_paths = []
39
+ match get_cloud_from_url(paths[0]):
40
+ case "azure":
41
+ rewrite_paths = []
55
42
  for p in paths:
56
- # Split the path and take the last 2-3 components to preserve structure
57
- # but avoid very long paths
58
- path_parts = p.split(os.sep)
59
- if len(path_parts) >= 2:
60
- # Take last 2 components (e.g., "base_case/x=abc")
61
- relative_path = "/".join(path_parts[-2:])
62
- else:
63
- # Single component, use basename
64
- relative_path = os.path.basename(p)
65
- new_paths.append(f"{stage_name}/{relative_path}")
66
- paths = new_paths
43
+ _, bucket_name, path = parse_azure_url(p)
44
+ rewrite_paths.append(f"{stage_name}/{path}")
45
+ paths = rewrite_paths
46
+ case "gcp":
47
+ exception = AnalysisException(
48
+ "You must configure an integration for Google Cloud Storage to perform I/O operations rather than accessing the URL directly. Reference: https://docs.snowflake.com/en/user-guide/data-load-gcs-config"
49
+ )
50
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
51
+ raise exception
52
+ case _:
53
+ filesystem, parsed_path = url_to_fs(paths[0])
54
+ if isinstance(filesystem, S3FileSystem): # aws
55
+ # Remove bucket name from the path since the stage name will replace
56
+ # the bucket name in the path.
57
+ paths = [
58
+ f"{stage_name}/{'/'.join(url_to_fs(p)[1].split('/')[1:])}"
59
+ for p in paths
60
+ ]
61
+ else: # local
62
+ # For local files, we need to preserve directory structure for partitioned data
63
+ # Instead of just using basename, we'll use the last few path components
64
+ new_paths = []
65
+ for p in paths:
66
+ # Split the path and take the last 2-3 components to preserve structure
67
+ # but avoid very long paths
68
+ path_parts = p.split(os.sep)
69
+ if len(path_parts) >= 2:
70
+ # Take last 2 components (e.g., "base_case/x=abc")
71
+ relative_path = "/".join(path_parts[-2:])
72
+ else:
73
+ # Single component, use basename
74
+ relative_path = os.path.basename(p)
75
+ new_paths.append(f"{stage_name}/{relative_path}")
76
+ paths = new_paths
67
77
 
68
78
  return paths
69
79
 
@@ -89,7 +99,7 @@ class StageLocator:
89
99
  self,
90
100
  url: str = "/",
91
101
  ) -> str:
92
- spark_session_id = get_session_id()
102
+ spark_session_id = get_spark_session_id()
93
103
 
94
104
  match get_cloud_from_url(url):
95
105
  case "azure":
@@ -102,15 +112,21 @@ class StageLocator:
102
112
  sql_query = f"CREATE OR REPLACE TEMP STAGE {stage_name[1:]} URL='azure://{account}.blob.core.windows.net/{bucket_name}'"
103
113
 
104
114
  credential_session_key = (
105
- f"fs.azure.sas.{bucket_name}.{account}.blob.core.windows.net"
115
+ f"fs.azure.sas.fixed.token.{account}.dfs.core.windows.net",
116
+ f"fs.azure.sas.{bucket_name}.{account}.blob.core.windows.net",
106
117
  )
107
118
  credential = sessions_config.get(spark_session_id, None)
108
- if (
109
- credential is not None
110
- and credential.get(credential_session_key) is not None
111
- and credential.get(credential_session_key).strip() != ""
112
- ):
113
- sql_query += f" CREDENTIALS = (AZURE_SAS_TOKEN = '{credential.get(credential_session_key)}')"
119
+ sas_token = None
120
+ for session_key in credential_session_key:
121
+ if (
122
+ credential is not None
123
+ and credential.get(session_key) is not None
124
+ and credential.get(session_key).strip() != ""
125
+ ):
126
+ sas_token = credential.get(session_key)
127
+ break
128
+ if sas_token is not None:
129
+ sql_query += f" CREDENTIALS = (AZURE_SAS_TOKEN = '{sas_token}')"
114
130
 
115
131
  logger.info(self.session.sql(sql_query).collect())
116
132
  self.stages_for_azure[bucket_name] = stage_name
@@ -128,24 +144,44 @@ class StageLocator:
128
144
  # but the rest of the time it's used, it does. We just drop it here.
129
145
  sql_query = f"CREATE OR REPLACE TEMP STAGE {stage_name[1:]} URL='s3://{parsed_path.split('/')[0]}'"
130
146
  credential = sessions_config.get(spark_session_id, None)
131
- if (
132
- credential is not None
133
- and credential.get("spark.hadoop.fs.s3a.access.key") is not None
134
- and credential.get("spark.hadoop.fs.s3a.secret.key") is not None
135
- and credential.get("spark.hadoop.fs.s3a.access.key").strip()
136
- != ""
137
- and credential.get("spark.hadoop.fs.s3a.secret.key").strip()
138
- != ""
139
- ):
140
- aws_keys = f" AWS_KEY_ID = '{credential.get('spark.hadoop.fs.s3a.access.key')}'"
141
- aws_keys += f" AWS_SECRET_KEY = '{credential.get('spark.hadoop.fs.s3a.secret.key')}'"
142
- if (
143
- credential.get("spark.hadoop.fs.s3a.session.token")
147
+ if credential is not None:
148
+ if ( # USE AWS KEYS to connect
149
+ credential.get("spark.hadoop.fs.s3a.access.key") is not None
150
+ and credential.get("spark.hadoop.fs.s3a.secret.key")
151
+ is not None
152
+ and credential.get("spark.hadoop.fs.s3a.access.key").strip()
153
+ != ""
154
+ and credential.get("spark.hadoop.fs.s3a.secret.key").strip()
155
+ != ""
156
+ ):
157
+ aws_keys = f" AWS_KEY_ID = '{credential.get('spark.hadoop.fs.s3a.access.key')}'"
158
+ aws_keys += f" AWS_SECRET_KEY = '{credential.get('spark.hadoop.fs.s3a.secret.key')}'"
159
+ if (
160
+ credential.get("spark.hadoop.fs.s3a.session.token")
161
+ is not None
162
+ ):
163
+ aws_keys += f" AWS_TOKEN = '{credential.get('spark.hadoop.fs.s3a.session.token')}'"
164
+ sql_query += f" CREDENTIALS = ({aws_keys})"
165
+ sql_query += " ENCRYPTION = ( TYPE = 'AWS_SSE_S3' )"
166
+ elif ( # USE AWS ROLE and KMS KEY to connect
167
+ credential.get(
168
+ "spark.hadoop.fs.s3a.server-side-encryption.key"
169
+ )
170
+ is not None
171
+ and credential.get(
172
+ "spark.hadoop.fs.s3a.server-side-encryption.key"
173
+ ).strip()
174
+ != ""
175
+ and credential.get("spark.hadoop.fs.s3a.assumed.role.arn")
144
176
  is not None
177
+ and credential.get(
178
+ "spark.hadoop.fs.s3a.assumed.role.arn"
179
+ ).strip()
180
+ != ""
145
181
  ):
146
- aws_keys += f" AWS_TOKEN = '{credential.get('spark.hadoop.fs.s3a.session.token')}'"
147
- sql_query += f" CREDENTIALS = ({aws_keys})"
148
- sql_query += " ENCRYPTION = ( TYPE = 'AWS_SSE_S3' )"
182
+ aws_role = f" AWS_ROLE = '{credential.get('spark.hadoop.fs.s3a.assumed.role.arn')}'"
183
+ sql_query += f" CREDENTIALS = ({aws_role})"
184
+ sql_query += f" ENCRYPTION = ( TYPE='AWS_SSE_KMS' KMS_KEY_ID = '{credential.get('spark.hadoop.fs.s3a.server-side-encryption.key')}' )"
149
185
 
150
186
  logger.info(self.session.sql(sql_query).collect())
151
187
  self.stages_for_aws[bucket_name] = stage_name