snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. snowflake/snowpark_connect/__init__.py +1 -0
  2. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  3. snowflake/snowpark_connect/client/__init__.py +15 -0
  4. snowflake/snowpark_connect/client/error_utils.py +30 -0
  5. snowflake/snowpark_connect/client/exceptions.py +36 -0
  6. snowflake/snowpark_connect/client/query_results.py +90 -0
  7. snowflake/snowpark_connect/client/server.py +717 -0
  8. snowflake/snowpark_connect/client/utils/__init__.py +10 -0
  9. snowflake/snowpark_connect/client/utils/session.py +85 -0
  10. snowflake/snowpark_connect/column_name_handler.py +404 -243
  11. snowflake/snowpark_connect/column_qualifier.py +43 -0
  12. snowflake/snowpark_connect/config.py +309 -26
  13. snowflake/snowpark_connect/constants.py +2 -0
  14. snowflake/snowpark_connect/dataframe_container.py +102 -8
  15. snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
  16. snowflake/snowpark_connect/error/error_codes.py +50 -0
  17. snowflake/snowpark_connect/error/error_utils.py +172 -23
  18. snowflake/snowpark_connect/error/exceptions.py +13 -4
  19. snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
  20. snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
  21. snowflake/snowpark_connect/execute_plan/utils.py +5 -1
  22. snowflake/snowpark_connect/expression/error_utils.py +28 -0
  23. snowflake/snowpark_connect/expression/function_defaults.py +9 -2
  24. snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
  25. snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
  26. snowflake/snowpark_connect/expression/literal.py +37 -13
  27. snowflake/snowpark_connect/expression/map_cast.py +224 -15
  28. snowflake/snowpark_connect/expression/map_expression.py +80 -27
  29. snowflake/snowpark_connect/expression/map_extension.py +322 -12
  30. snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
  31. snowflake/snowpark_connect/expression/map_udf.py +86 -20
  32. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
  33. snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
  34. snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
  35. snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
  36. snowflake/snowpark_connect/expression/map_window_function.py +18 -3
  37. snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
  39. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
  43. snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
  44. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
  45. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
  46. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  47. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  48. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
  49. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
  50. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
  51. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
  52. snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
  53. snowflake/snowpark_connect/relation/io_utils.py +110 -10
  54. snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
  55. snowflake/snowpark_connect/relation/map_catalog.py +5 -1
  56. snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
  57. snowflake/snowpark_connect/relation/map_extension.py +263 -29
  58. snowflake/snowpark_connect/relation/map_join.py +683 -442
  59. snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
  60. snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
  61. snowflake/snowpark_connect/relation/map_relation.py +48 -19
  62. snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
  63. snowflake/snowpark_connect/relation/map_show_string.py +13 -6
  64. snowflake/snowpark_connect/relation/map_sql.py +1233 -222
  65. snowflake/snowpark_connect/relation/map_stats.py +48 -9
  66. snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
  67. snowflake/snowpark_connect/relation/map_udtf.py +14 -4
  68. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
  69. snowflake/snowpark_connect/relation/read/map_read.py +134 -43
  70. snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
  71. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
  72. snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
  73. snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
  74. snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
  75. snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
  76. snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
  77. snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
  78. snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
  79. snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
  80. snowflake/snowpark_connect/relation/read/utils.py +50 -5
  81. snowflake/snowpark_connect/relation/stage_locator.py +91 -55
  82. snowflake/snowpark_connect/relation/utils.py +128 -5
  83. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
  84. snowflake/snowpark_connect/relation/write/map_write.py +929 -319
  85. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
  86. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  87. snowflake/snowpark_connect/resources_initializer.py +171 -48
  88. snowflake/snowpark_connect/server.py +528 -473
  89. snowflake/snowpark_connect/server_common/__init__.py +503 -0
  90. snowflake/snowpark_connect/snowflake_session.py +65 -0
  91. snowflake/snowpark_connect/start_server.py +53 -5
  92. snowflake/snowpark_connect/type_mapping.py +349 -27
  93. snowflake/snowpark_connect/type_support.py +130 -0
  94. snowflake/snowpark_connect/typed_column.py +9 -7
  95. snowflake/snowpark_connect/utils/artifacts.py +9 -8
  96. snowflake/snowpark_connect/utils/cache.py +49 -27
  97. snowflake/snowpark_connect/utils/concurrent.py +36 -1
  98. snowflake/snowpark_connect/utils/context.py +195 -37
  99. snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
  100. snowflake/snowpark_connect/utils/env_utils.py +5 -1
  101. snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
  102. snowflake/snowpark_connect/utils/identifiers.py +137 -3
  103. snowflake/snowpark_connect/utils/io_utils.py +57 -1
  104. snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
  105. snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
  106. snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
  107. snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
  108. snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
  109. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
  110. snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
  111. snowflake/snowpark_connect/utils/profiling.py +25 -8
  112. snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
  113. snowflake/snowpark_connect/utils/sequence.py +21 -0
  114. snowflake/snowpark_connect/utils/session.py +64 -28
  115. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
  116. snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
  117. snowflake/snowpark_connect/utils/telemetry.py +192 -40
  118. snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
  119. snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
  120. snowflake/snowpark_connect/utils/udf_cache.py +117 -41
  121. snowflake/snowpark_connect/utils/udf_helper.py +39 -37
  122. snowflake/snowpark_connect/utils/udf_utils.py +133 -14
  123. snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
  124. snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
  125. snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
  126. snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
  127. snowflake/snowpark_connect/version.py +1 -1
  128. snowflake/snowpark_decoder/dp_session.py +6 -2
  129. snowflake/snowpark_decoder/spark_decoder.py +12 -0
  130. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
  131. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
  132. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
  133. snowflake/snowpark_connect/hidden_column.py +0 -39
  134. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  135. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  136. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  137. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  138. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  139. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  140. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  141. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  142. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  143. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  144. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  145. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  146. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  147. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  148. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  149. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  150. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  151. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  152. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  153. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  154. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  155. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  156. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  157. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  158. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  159. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  160. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  161. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  162. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  163. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  164. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  165. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  166. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  167. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  168. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  169. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  170. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  171. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  172. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  173. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  174. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  175. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  176. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  177. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  178. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  179. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  180. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  181. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  182. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  183. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  184. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  185. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  186. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  187. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  188. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  189. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  190. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  191. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  192. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  193. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  194. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
  195. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
  196. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
  197. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
  198. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
  199. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
  200. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
@@ -13,18 +13,21 @@ from functools import cached_property
 from pyspark.errors.exceptions.base import AnalysisException
 
 from snowflake.snowpark import DataFrame
-from snowflake.snowpark._internal.analyzer.analyzer_utils import (
-    quote_name_without_upper_casing,
-    unquote_if_quoted,
-)
+from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
 from snowflake.snowpark._internal.utils import quote_name
 from snowflake.snowpark.types import StructType
+from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
 from snowflake.snowpark_connect.config import global_config
-from snowflake.snowpark_connect.hidden_column import HiddenColumn
-from snowflake.snowpark_connect.utils.context import get_current_operation_scope
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
+from snowflake.snowpark_connect.utils.context import (
+    get_current_operation_scope,
+    get_is_processing_order_by,
+)
 from snowflake.snowpark_connect.utils.identifiers import (
     split_fully_qualified_spark_name,
 )
+from snowflake.snowpark_connect.utils.sequence import next_unique_num
 
 ALREADY_QUOTED = re.compile('^(".+")$', re.DOTALL)
 
@@ -44,6 +47,7 @@ def set_schema_getter(df: DataFrame, get_schema: Callable[[], StructType]) -> No
     df.__class__ = PatchedDataFrame
 
 
+# TODO replace plan_id-offset with single unique value
 def make_column_names_snowpark_compatible(
     names: list[str], plan_id: int, offset: int = 0
 ) -> list[str]:
@@ -76,42 +80,42 @@ def make_column_names_snowpark_compatible(
     In this case the function call should be `make_column_names_snowpark_compatible(['a', 'b'], 5, 2)`,
     to avoid naming conflicts between the new columns and the old columns.
     """
+    from snowflake.snowpark_connect.relation.read.metadata_utils import (
+        METADATA_FILENAME_COLUMN,
+    )
+
     return [
+        # Skip METADATA$FILENAME - preserve original name without quoting
+        name if name == METADATA_FILENAME_COLUMN else
         # Use `-` in the name to force df.column to return double-quoted names
        quote_name(f"{unquote_if_quoted(name)}-{plan_id:08x}-{i + offset}")
         for i, name in enumerate(names)
     ]
 
 
+def make_unique_snowpark_name(spark_name: str) -> str:
+    """
+    Returns a snowpark column name that's guaranteed to be unique in this session,
+    by appending "#<unique number>" to the given spark name.
+    """
+    return quote_name(f"{spark_name}-{next_unique_num():x}")
+
+
 @dataclass(frozen=True)
 class ColumnNames:
     spark_name: str
     snowpark_name: str
-    qualifiers: list[str]
+    qualifiers: set[ColumnQualifier]
+    equivalent_snowpark_names: set[str] | None = ((None,),)
     catalog_info: str | None = None  # Catalog from fully qualified name
     database_info: str | None = None  # Database from fully qualified name
+    is_hidden: bool = False  # Hidden columns are only accessible via qualified names
 
-
-def get_list_of_spark_names_for_column(column_names: ColumnNames) -> list[str]:
-    """
-    Returns a list of Spark names for a given ColumnNames object.
-    This is useful when a single Spark name maps to multiple names due to table alias.
-
-    For example, if the column name is 'id' and the qualifiers are ['db', 'table'],
-    then the possible Spark names are:
-    ['id', 'db.table.id', 'table.id']
-    """
-    spark_name = column_names.spark_name
-    qualifiers = column_names.qualifiers
-
-    qualifier_suffixes_list = [
-        ".".join(quote_name_without_upper_casing(x) for x in qualifiers[i:])
-        for i in range(len(qualifiers))
-    ]
-    return [spark_name] + [
-        f"{qualifier_suffix}.{spark_name}"
-        for qualifier_suffix in qualifier_suffixes_list
-    ]
+    def all_spark_names_including_qualified_names(self):
+        all_names = [self.spark_name]
+        for qualifier in self.qualifiers:
+            all_names.extend(qualifier.all_qualified_names(self.spark_name))
+        return all_names
 
 
 class ColumnNameMap:
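
The renaming scheme above is easiest to see with concrete values. A minimal standalone sketch (illustration only, not part of the diff; it mirrors the f-string in the hunk but omits the final quote_name() call):

# Illustration of the "-<plan_id hex>-<index>" suffix produced by
# make_column_names_snowpark_compatible for the docstring's example call.
def demo_suffix(names: list[str], plan_id: int, offset: int = 0) -> list[str]:
    return [f"{name}-{plan_id:08x}-{i + offset}" for i, name in enumerate(names)]

print(demo_suffix(["a", "b"], 5, 2))  # ['a-00000005-2', 'b-00000005-3']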
@@ -123,32 +127,32 @@ class ColumnNameMap:
             [], bool
         ] = lambda: global_config.spark_sql_caseSensitive,
         column_metadata: dict | None = None,
-        column_qualifiers: list[list[str]] | None = None,
-        hidden_columns: set[HiddenColumn] | None = None,
+        column_qualifiers: list[set[ColumnQualifier]] = None,
         parent_column_name_map: ColumnNameMap | None = None,
+        equivalent_snowpark_names: list[set[str]] | None = None,
+        column_is_hidden: list[bool] | None = None,
     ) -> None:
         """
         spark_column_names: Original spark column names
         snowpark_column_names: Snowpark column names
-        column_metadata: This field is used to store metadata related to columns. Since Snowparks Struct type does not support metadata,
+        column_metadata: This field is used to store metadata related to columns. Since Snowpark's Struct type does not support metadata,
             we use this attribute to store any metadata related to the columns.
             The key is the original Spark column name, and the value is the metadata.
            example: Dict('age', {'foo': 'bar'})
         column_qualifiers: Optional qualifiers for the columns, used to handle table aliases or DataFrame aliases.
-        hidden_columns: Optional set of HiddenColumn objects.
         parent_column_name_map: parent ColumnNameMap
+        column_is_hidden: Optional list of booleans indicating whether each column is hidden
         """
         self.columns: list[ColumnNames] = []
-        self.spark_to_col = defaultdict(list)
+        self.spark_to_col: defaultdict[str, list[ColumnNames]] = defaultdict(list)
         self.uppercase_spark_to_col = defaultdict(list)
         self.snowpark_to_col = defaultdict(list)
         self.is_case_sensitive = is_case_sensitive
         self.column_metadata = column_metadata
-        self.hidden_columns = hidden_columns
 
         # Rename chain dictionary to track column renaming history
         self.rename_chains: dict[str, str] = {}  # old_name -> new_name mapping
-        self.current_columns: set[str] = set()  # Current column names
+        self.current_columns: set[str] = set()  # current column names
 
         # Parent ColumnNameMap classes
         self._parent_column_name_map = parent_column_name_map
@@ -179,21 +183,22 @@ class ColumnNameMap:
             c = ColumnNames(
                 spark_name=spark_name,
                 snowpark_name=snowpark_column_names[i],
-                qualifiers=column_qualifiers[i] if column_qualifiers else [],
+                qualifiers=column_qualifiers[i]
+                if column_qualifiers and column_qualifiers[i]
+                else set(),
+                equivalent_snowpark_names=equivalent_snowpark_names[i]
+                if equivalent_snowpark_names and equivalent_snowpark_names[i]
+                else set(),
                 catalog_info=catalog_info,
                 database_info=database_info,
+                is_hidden=column_is_hidden[i] if column_is_hidden else False,
             )
             self.columns.append(c)
 
-            # we want to store all the spark names including qualifiers (these are generated from table alias or dataframe alias)
-            spark_names_including_qualifier = get_list_of_spark_names_for_column(c)
-
-            for spark_name_including_qualifier in spark_names_including_qualifier:
+            for spark_name in c.all_spark_names_including_qualified_names():
                 # the same spark name can map to multiple snowpark names
-                self.spark_to_col[spark_name_including_qualifier].append(c)
-                self.uppercase_spark_to_col[
-                    spark_name_including_qualifier.upper()
-                ].append(c)
+                self.spark_to_col[spark_name].append(c)
+                self.uppercase_spark_to_col[spark_name.upper()].append(c)
 
             # the same snowpark name can map to multiple spark column
             # e.g. df.select(date_format('dt', 'yyy'), date_format('dt', 'yyyy')) ->
@@ -286,9 +291,10 @@ class ColumnNameMap:
         self,
         spark_column_names: list[str],
         return_first: bool = False,
+        original_snowpark_names: list[str] | None = None,
     ) -> list[str]:
         snowpark_column_names = self._get_snowpark_column_names_from_spark_column_names(
-            spark_column_names, return_first
+            spark_column_names, return_first, original_snowpark_names
         )
         if snowpark_column_names:
             return snowpark_column_names
@@ -302,7 +308,7 @@ class ColumnNameMap:
             and self._parent_column_name_map is not None
         ):
             snowpark_column_names = self._parent_column_name_map.get_snowpark_column_names_from_spark_column_names(
-                spark_column_names, return_first
+                spark_column_names, return_first, original_snowpark_names
             )
 
         return snowpark_column_names
@@ -311,9 +317,10 @@ class ColumnNameMap:
         self,
         spark_column_names: list[str],
         return_first: bool = False,
+        original_snowpark_names: list[str] | None = None,
     ) -> list[str]:
         snowpark_column_names = []
-        for name in spark_column_names:
+        for i, name in enumerate(spark_column_names):
             if not global_config.spark_sql_caseSensitive:
                 name = name.upper()
                 mapping = self.uppercase_spark_to_col
@@ -325,8 +332,26 @@ class ColumnNameMap:
 
             columns = mapping[name]
 
+            # make sure the column matches the original snowpark name, if given
+            if original_snowpark_names:
+                oname = original_snowpark_names[i]
+                columns = [
+                    c
+                    for c in columns
+                    if c.snowpark_name == oname or oname in c.equivalent_snowpark_names
+                ]
+
+            # Filter out hidden columns for unqualified lookups
+            # A qualified lookup contains a dot (e.g., "b.id"), unqualified doesn't (e.g., "id")
+            # Hidden columns should only be accessible via qualified names
+            is_qualified_lookup = "." in name or original_snowpark_names
+            if not is_qualified_lookup:
+                # Unqualified lookup: only include visible columns
+                columns = [c for c in columns if not c.is_hidden]
+
             if return_first:
-                snowpark_column_names.append(columns[0].snowpark_name)
+                if columns:  # Only append if we have columns after filtering
+                    snowpark_column_names.append(columns[0].snowpark_name)
             else:
                 snowpark_column_names.extend([c.snowpark_name for c in columns])
 
@@ -338,8 +363,7 @@ class ColumnNameMap:
         *,
         allow_non_exists: bool = False,
         return_first: bool = False,
-        is_qualified: bool = False,
-        source_qualifiers: list[str] | None = None,
+        original_snowpark_name: str | None = None,
     ) -> str | None:
         assert isinstance(spark_column_name, str)
         resolved_name = (
@@ -347,52 +371,85 @@ class ColumnNameMap:
             if self.rename_chains
             else spark_column_name
         )
-
-        # We need to check hidden columns first. We want to avoid the code path
-        # within get_snowpark_column_names_from_spark_column_names that checks the parent ColumnNameMap.
-        # This is because that will return the name of the using column that's been dropped from the result
-        # dataframe. We want to fetch and resolve the hidden column to its visible using column name instead.
-        # Even if this is an unqualified reference or one to the visible column, it will resolve correctly to
-        # the visible name anyway.
-        snowpark_names = []
-        # Only check hidden columns for qualified references with source qualifiers
-        if is_qualified and source_qualifiers is not None and self.hidden_columns:
-            column_name = spark_column_name
-
-            # Check each hidden column for column name AND qualifier match
-            for hidden_col in self.hidden_columns:
-                if (
-                    hidden_col.spark_name == column_name
-                    and hidden_col.qualifiers == source_qualifiers
-                ):
-                    if not global_config.spark_sql_caseSensitive:
-                        if hidden_col.spark_name.upper() == column_name.upper() and [
-                            q.upper() for q in hidden_col.qualifiers
-                        ] == [q.upper() for q in source_qualifiers]:
-                            snowpark_names.append(hidden_col.visible_snowpark_name)
-                    else:
-                        snowpark_names.append(hidden_col.visible_snowpark_name)
-
-        # If not found in hidden columns, proceed with normal lookup
-        if not snowpark_names:
-            snowpark_names = self.get_snowpark_column_names_from_spark_column_names(
-                [resolved_name], return_first
-            )
+        snowpark_names = self.get_snowpark_column_names_from_spark_column_names(
+            [resolved_name],
+            return_first,
+            [original_snowpark_name] if original_snowpark_name else None,
+        )
 
         snowpark_names_len = len(snowpark_names)
         if snowpark_names_len > 1:
-            raise AnalysisException(
-                f"Ambiguous spark column name {spark_column_name}, potential snowpark column names {snowpark_names}"
-            )
+            # Check if this is a case where we have identical expressions that can be safely resolved to the first one
+            # This commonly happens with GROUP BY expressions that also appear in SELECT clauses
+            if (
+                get_is_processing_order_by()
+                and self._can_resolve_ambiguous_identical_expressions(
+                    resolved_name, snowpark_names
+                )
+            ):
+                # All the ambiguous columns represent the same expression, so we can safely use the first one
+                return snowpark_names[0]
+            else:
+                exception = AnalysisException(
+                    f"Ambiguous spark column name {spark_column_name}, potential snowpark column names {snowpark_names}"
+                )
+                attach_custom_error_code(exception, ErrorCodes.AMBIGUOUS_COLUMN_NAME)
+                raise exception
         elif snowpark_names_len == 0:
             if allow_non_exists:
                 return None
             else:
-                raise AnalysisException(
+                exception = AnalysisException(
                     f"Spark column name {spark_column_name} does not exist"
                 )
+                attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+                raise exception
         return snowpark_names[0]
 
+    def _can_resolve_ambiguous_identical_expressions(
+        self, spark_column_name: str, snowpark_names: list[str]
+    ) -> bool:
+        """
+        Determine if ambiguous columns represent identical expressions that can be safely resolved to the first one.
+
+        This handles the common case where the same expression (like a UDF call) appears multiple times
+        in a SELECT clause within a GROUP BY query. Since they're the same expression operating on the
+        same grouped data, they will have identical values, so we can safely resolve to any of them.
+
+        Args:
+            spark_column_name: The Spark column name that has multiple mappings; make sure to resolve this beforehand
+            snowpark_names: List of Snowpark column names that map to this Spark column name
+
+        Returns:
+            True if we can safely resolve to the first snowpark column, False otherwise
+        """
+        if spark_column_name not in self.spark_to_col:
+            return False
+
+        columns: list[ColumnNames] = self.spark_to_col[spark_column_name]
+
+        # If we don't have multiple columns, there's no ambiguity to resolve
+        if len(columns) <= 1:
+            return False
+
+        # Check if all the snowpark names correspond to columns that have identical underlying expressions
+        # We'll compare the actual column objects to see if they represent the same computation
+        first_column = columns[0]
+
+        for column in columns[1:]:
+            if first_column.qualifiers != column.qualifiers:
+                return False
+
+        # Additional safety check: ensure all snowpark names are actually in our mapping
+        for snowpark_name in snowpark_names:
+            if snowpark_name not in self.snowpark_to_col:
+                return False
+
+        # If we reach here, the columns appear to be identical expressions from the same context
+        # This commonly happens in GROUP BY scenarios where the same expression appears in both
+        # the grouping clause and the select clause
+        return True
+
     def get_spark_column_names_from_snowpark_column_names(
         self,
         snowpark_column_names: list[str],
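
The _can_resolve_ambiguous_identical_expressions path added above targets queries where a single expression produces several generated column names. A hypothetical PySpark query of that shape (illustration only; the data and names are made up):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a", 1), ("a", 2), ("b", 3)], ["k", "v"])

# The same expression F.upper("k") appears in the grouping clause and again in
# ORDER BY, so the duplicate mappings refer to the same grouped value and the
# resolver can safely pick the first one instead of raising an ambiguity error.
df.groupBy(F.upper("k")).agg(F.sum("v").alias("total")).orderBy(F.upper("k")).show()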
@@ -418,98 +475,79 @@ class ColumnNameMap:
         )
         spark_names_len = len(spark_names)
         if spark_names_len > 1:
-            raise AnalysisException(
+            exception = AnalysisException(
                 f"Ambiguous snowpark column name {snowpark_column_name}, potential spark column names {spark_names}"
             )
+            attach_custom_error_code(exception, ErrorCodes.AMBIGUOUS_COLUMN_NAME)
+            raise exception
         elif spark_names_len == 0:
             if allow_non_exists:
                 return None
             else:
-                raise AnalysisException(
+                exception = AnalysisException(
                     f"Snowpark column name {snowpark_column_name} does not exist"
                 )
+                attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+                raise exception
         return spark_names[0]
 
     def get_spark_column_name(self, idx: int) -> str:
         return self.columns[idx].spark_name
 
     def get_spark_columns(self) -> list[str]:
-        return [c.spark_name for c in self.columns]
+        return [c.spark_name for c in self.columns if not c.is_hidden]
 
     def get_spark_and_snowpark_columns_with_qualifier_for_qualifier(
-        self, qualifiers_input: list[str]
-    ) -> tuple[list[str], list[str], list[list[str]]]:
+        self, target_qualifier: ColumnQualifier
+    ) -> tuple[list[str], list[str], list[set[ColumnQualifier]]]:
         """
-        Returns the Spark and Snowpark column names along with their qualifiers for the specified qualifiers.
-        If a column does not have a qualifier, it will be None.
+        Returns the Spark and Snowpark column names along with their qualifiers for the specified qualifier.
         """
-        spark_columns = []
-        snowpark_columns = []
-        qualifiers = []
+        spark_columns: list[str] = []
+        snowpark_columns: list[str] = []
+        qualifiers: list[set[ColumnQualifier]] = []
 
+        normalized_qualifier = target_qualifier
         if not self.is_case_sensitive():
-            qualifiers_input = [q.upper() for q in qualifiers_input]
+            normalized_qualifier = target_qualifier.to_upper()
 
-        for c in self.columns:
-            col_qualifiers = (
-                [q.upper() for q in c.qualifiers]
+        for column in self.columns:
+            # Normalize all qualifiers for comparison
+            column_qualifiers: set[ColumnQualifier] = (
+                {q.to_upper() for q in iter(column.qualifiers)}
                 if not self.is_case_sensitive()
-                else c.qualifiers
+                else column.qualifiers
             )
-            if len(col_qualifiers) < len(qualifiers_input):
-                # If the column has fewer qualifiers than the input, it cannot match
-                continue
-            if col_qualifiers[-len(qualifiers_input) :] == qualifiers_input:
-                spark_columns.append(c.spark_name)
-                snowpark_columns.append(c.snowpark_name)
-                qualifiers.append(c.qualifiers)
-
-            # Note: The following code is commented out because there is a bug with handling duplicate columns in
-            # qualified select *'s. This needs to be revisited once a solution for that is found.
-            # TODO: https://snowflakecomputing.atlassian.net/browse/SNOW-2265240
-
-            # # Handles fetching/resolving the hidden columns if they also match the qualifiers
-            # # This method is only ever called for qualified references, so we need to check hidden columns as well.
-            # if self.hidden_columns:
-            #     for hidden_col in self.hidden_columns:
-            #         col_qualifiers = (
-            #             [q.upper() for q in hidden_col.qualifiers]
-            #             if not self.is_case_sensitive()
-            #             else hidden_col.qualifiers
-            #         )
-            #         if len(col_qualifiers) < len(qualifiers_input):
-            #             continue
-            #         if col_qualifiers[-len(qualifiers_input) :] == qualifiers_input:
-            #             # This hidden column matches! Add it to the results
-            #             spark_columns.append(hidden_col.spark_name)
-            #             snowpark_columns.append(hidden_col.visible_snowpark_name)
-            #             qualifiers.append(hidden_col.qualifiers)
+            if any([q.matches(normalized_qualifier) for q in column_qualifiers]):
+                spark_columns.append(column.spark_name)
+                snowpark_columns.append(column.snowpark_name)
+                qualifiers.append(column.qualifiers)
 
         return spark_columns, snowpark_columns, qualifiers
 
     def get_snowpark_columns(self) -> list[str]:
-        return [c.snowpark_name for c in self.columns]
+        return [c.snowpark_name for c in self.columns if not c.is_hidden]
 
-    def get_snowpark_columns_after_drop(self, cols_to_drop: list[str]) -> list[str]:
+    def get_snowpark_columns_after_drop(
+        self, cols_to_drop: list[str]
+    ) -> list[ColumnNames]:
         return [
             c
-            for c in self.get_snowpark_columns()
-            if self._quote_if_unquoted(c) not in cols_to_drop
+            for c in self.columns
+            if self._quote_if_unquoted(c.snowpark_name) not in cols_to_drop
         ]
 
-    def get_qualifiers(self) -> list[list[str]]:
+    def get_qualifiers(self) -> list[set[ColumnQualifier]]:
         """
         Returns the qualifiers for the columns.
-        If a column does not have a qualifier, it will be None.
         """
-        return [c.qualifiers for c in self.columns]
+        return [c.qualifiers for c in self.columns if not c.is_hidden]
 
     def get_qualifiers_for_columns_after_drop(
         self, cols_to_drop: list[str]
-    ) -> list[list[str]]:
+    ) -> list[set[ColumnQualifier]]:
        """
         Returns the qualifiers for the columns after dropping the specified columns.
-        If a column is dropped, its qualifier will be None.
         """
         return [
             c.qualifiers
@@ -517,27 +555,40 @@ class ColumnNameMap:
             if self._quote_if_unquoted(c.snowpark_name) not in cols_to_drop
         ]
 
-    def get_qualifier_for_spark_column(
+    def get_qualifiers_for_snowpark_column(
         self,
-        spark_column_name: str,
-    ) -> list[str]:
+        snowpark_name: str,
+    ) -> set[ColumnQualifier]:
         """
-        Returns the qualifier for the specified Spark column name.
-        If the column does not exist, returns None.
+        Returns the qualifier for the specified snowpark column name.
+        If the column does not exist, returns empty ColumnQualifier.
         """
-        if not self.is_case_sensitive():
-            name = spark_column_name.upper()
-            mapping = self.uppercase_spark_to_col
-        else:
-            name = spark_column_name
-            mapping = self.spark_to_col
+        for c in self.columns:
+            if c.snowpark_name == snowpark_name:
+                return c.qualifiers
+
+        return set()
+
+    def get_equivalent_snowpark_names(self) -> list[set[str]]:
+        return [c.equivalent_snowpark_names for c in self.columns]
 
-        col = mapping.get(name)
+    def get_equivalent_snowpark_names_for_snowpark_name(
+        self, snowpark_name: str | None
+    ) -> set[str]:
+        """
+        Helper method to get the set of old, equivalent snowpark names for the given column. Used to pass
+        this information to child column maps.
+        """
+        if not snowpark_name:
+            return set()
 
-        if col is None or len(col) == 0:
-            return []
+        name = self._quote_if_unquoted(snowpark_name)
+        for c in self.columns:
+            if name == c.snowpark_name:
+                return c.equivalent_snowpark_names
 
-        return col[0].qualifiers
+        # no equivalent names found
+        return set()
 
     @staticmethod
     def _quote_if_unquoted(s: str) -> str:
@@ -555,19 +606,20 @@ class ColumnNameMap:
     def snowpark_to_spark_map(self) -> dict[str, str]:
         return {c.snowpark_name: c.spark_name for c in self.columns}
 
-    def spark_to_snowpark_for_pattern(self, pattern: str) -> list[tuple[str, str]]:
-        pattern_regex = re.compile(
-            pattern, 0 if self.is_case_sensitive() else re.IGNORECASE
-        )
-        return [
-            (c.spark_name, c.snowpark_name)
-            for c in self.columns
-            if pattern_regex.fullmatch(c.spark_name)
-        ]
+    def get_columns_matching_pattern(self, pattern: str) -> list[ColumnNames]:
+        try:
+            pattern_regex = re.compile(
+                pattern, 0 if self.is_case_sensitive() else re.IGNORECASE
+            )
+            return [c for c in self.columns if pattern_regex.fullmatch(c.spark_name)]
+        except re.error as e:
+            exception = AnalysisException(f"Invalid regex pattern '{pattern}': {e}")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_FUNCTION_ARGUMENT)
+            raise exception
 
     def with_columns(
         self, new_spark_columns: list[str], new_snowpark_columns: list[str]
-    ) -> tuple[list[str], list[str], list[list[str]]]:
+    ) -> tuple[list[str], list[str], list[set[ColumnQualifier]], list[set[str]]]:
         """
         Returns an ordered list of spark and snowpark column names after adding the new columns through a withColumns call.
         All replaced columns retain their ordering in the dataframe. The new columns are added to the end of the list.
@@ -588,6 +640,7 @@ class ColumnNameMap:
         snowpark_columns = []
         removed_index: set[int] = set()
         qualifiers = []
+        equivalent_snowpark_names = []
 
         for c in self.columns:
             column_name = self._normalized_spark_name(c.spark_name)
@@ -596,19 +649,22 @@ class ColumnNameMap:
                 removed_index.add(index)
                 spark_columns.append(new_spark_columns[index])
                 snowpark_columns.append(new_snowpark_columns[index])
-                qualifiers.append([])
+                qualifiers.append(set())
+                equivalent_snowpark_names.append(set())
             else:
                 spark_columns.append(c.spark_name)
                 snowpark_columns.append(c.snowpark_name)
                 qualifiers.append(c.qualifiers)
+                equivalent_snowpark_names.append(c.equivalent_snowpark_names)
 
         for i, _ in enumerate(new_spark_columns):
             if i not in removed_index:
                 spark_columns.append(new_spark_columns[i])
                 snowpark_columns.append(new_snowpark_columns[i])
-                qualifiers.append([])
+                qualifiers.append(set())
+                equivalent_snowpark_names.append(set())
 
-        return spark_columns, snowpark_columns, qualifiers
+        return spark_columns, snowpark_columns, qualifiers, equivalent_snowpark_names
 
     def _normalized_spark_name(self, spark_name: str) -> str:
         if self.is_case_sensitive():
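
The with_columns changes above preserve the ordering contract stated in its docstring: replaced columns keep their position, brand-new columns are appended. A hypothetical PySpark illustration of that contract (not part of the diff):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 2)], ["a", "b"])

# Replacing "b" keeps its position; the new "c" goes to the end.
out = df.withColumns({"b": F.col("b") * 10, "c": F.lit("new")})
print(out.columns)  # ['a', 'b', 'c']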
@@ -616,34 +672,77 @@ class ColumnNameMap:
         else:
             return spark_name.upper()
 
-    def is_hidden_column_reference(
-        self, spark_column_name: str, source_qualifiers: list[str] | None = None
-    ) -> bool:
+    def get_columns_after_join(
+        self, right: ColumnNameMap, join_columns: list[str], join_type: str
+    ) -> list[ColumnNames]:
         """
-        Check if a column reference would be resolved through hidden columns.
+        Returns a list of columns (names and qualifiers) after a using_columns join with the given column map
         """
-        if not self.hidden_columns or source_qualifiers is None:
-            return False
 
-        # For qualified references with source_qualifiers
-        column_name = (
-            spark_column_name  # When has_plan_id=True, this is just the column name
-        )
+        # first, let's gather right-side join columns for qualifier lookup
+        # and the remaining columns to append them to the result
+        join_column_names = [self._normalized_spark_name(c) for c in join_columns]
+        right_join_columns: dict[str, ColumnNames] = {}
+        right_remaining_columns: list[ColumnNames] = []
+        for oc in right.columns:
+            col_name = self._normalized_spark_name(oc.spark_name)
+            # only take the first matching column
+            if col_name in join_column_names and col_name not in right_join_columns:
+                right_join_columns[col_name] = oc
+            else:
+                right_remaining_columns.append(oc)
 
-        for hidden_col in self.hidden_columns:
-            if (
-                hidden_col.spark_name == column_name
-                and hidden_col.qualifiers == source_qualifiers
-            ):
-                if not global_config.spark_sql_caseSensitive:
-                    if hidden_col.spark_name.upper() == column_name.upper() and [
-                        q.upper() for q in hidden_col.qualifiers
-                    ] == [q.upper() for q in source_qualifiers]:
-                        return True
-                else:
-                    return True
+        # now gather left-side columns
+        left_join_columns: dict[str, ColumnNames] = {}
+        left_remaining_columns: list[ColumnNames] = []
+        for c in self.columns:
+            col_name = self._normalized_spark_name(c.spark_name)
+            if col_name in join_column_names and col_name not in left_join_columns:
+                equivalent_snowpark_names = set()
+                # only assign join-side qualifier for outer joins
+                match join_type:
+                    case "left":
+                        qualifiers = c.qualifiers
+                    case "right":
+                        qualifiers = right_join_columns[col_name].qualifiers
+                    case _:
+                        qualifiers = (
+                            c.qualifiers | right_join_columns[col_name].qualifiers
+                        )
+                equivalent_snowpark_names.update(
+                    c.equivalent_snowpark_names,
+                    right_join_columns[col_name].equivalent_snowpark_names,
+                    {right_join_columns[col_name].snowpark_name},
+                )
+
+                left_join_columns[col_name] = ColumnNames(
+                    c.spark_name, c.snowpark_name, qualifiers, equivalent_snowpark_names
+                )
+            else:
+                left_remaining_columns.append(c)
+
+        # join columns go first in the user-given order,
+        # then the remaining left-side columns, then remaining right-side columns
+        match join_type:
+            case "right":
+                ordered_join_columns = [
+                    right_join_columns[name] for name in join_column_names
+                ]
+            case _:
+                ordered_join_columns = [
+                    left_join_columns[name] for name in join_column_names
+                ]
+        return ordered_join_columns + left_remaining_columns + right_remaining_columns
 
-        return False
+    def get_conflicting_snowpark_columns(self, other: ColumnNameMap) -> set[str]:
+        conflicting_columns = set()
+        snowpark_names = {c.snowpark_name for c in self.columns}
+
+        for c in other.columns:
+            if c.snowpark_name in snowpark_names:
+                conflicting_columns.add(c.snowpark_name)
+
+        return conflicting_columns
 
 
 class JoinColumnNameMap(ColumnNameMap):
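
get_columns_after_join follows the column ordering its comments describe for USING-style joins: join keys first in the user-given order, then the remaining left-side columns, then the remaining right-side columns. A hypothetical PySpark illustration of that ordering (not part of the diff):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
left = spark.createDataFrame([(1, "x")], ["id", "l_val"])
right = spark.createDataFrame([(1, "y")], ["id", "r_val"])

# USING-style join: the join key appears once and comes first.
joined = left.join(right, on=["id"], how="left")
print(joined.columns)  # ['id', 'l_val', 'r_val']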
@@ -654,9 +753,6 @@ class JoinColumnNameMap(ColumnNameMap):
     ) -> None:
         self.left_column_mapping: ColumnNameMap = left_colmap
         self.right_column_mapping: ColumnNameMap = right_colmap
-        # Ensure attributes expected by base-class helpers exist to avoid AttributeError
-        # when generic code paths (e.g., hidden column checks) touch them.
-        self.hidden_columns: set[HiddenColumn] | None = None
 
     def get_snowpark_column_name_from_spark_column_name(
         self,
@@ -664,20 +760,20 @@ class JoinColumnNameMap(ColumnNameMap):
         *,
         allow_non_exists: bool = False,
         return_first: bool = False,
-        # JoinColumnNameMap will never be called with using columns, so these parameters are not used.
-        is_qualified: bool = False,
-        source_qualifiers: list[str] | None = None,
+        original_snowpark_name: str | None = None,
     ) -> str | None:
         snowpark_column_name_in_left = (
             self.left_column_mapping.get_snowpark_column_name_from_spark_column_name(
                 spark_column_name,
                 allow_non_exists=True,
+                original_snowpark_name=original_snowpark_name,
             )
         )
         snowpark_column_name_in_right = (
             self.right_column_mapping.get_snowpark_column_name_from_spark_column_name(
                 spark_column_name,
                 allow_non_exists=True,
+                original_snowpark_name=original_snowpark_name,
             )
         )
 
@@ -688,14 +784,37 @@ class JoinColumnNameMap(ColumnNameMap):
             if allow_non_exists:
                 return None
             else:
-                raise AnalysisException(
+                exception = AnalysisException(
                     f"Spark column name {spark_column_name} does not exist in either left or right DataFrame"
                 )
+                attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+                raise exception
 
+        # special case for join conditions, if the column has a match on both sides, and exactly one of those
+        # matches is the original snowpark name, that match should be used
         if (snowpark_column_name_in_right is not None) and (
             snowpark_column_name_in_left is not None
         ):
-            raise AnalysisException(f"Ambiguous column name {spark_column_name}")
+            if (
+                snowpark_column_name_in_left == original_snowpark_name
+                and snowpark_column_name_in_right != original_snowpark_name
+            ):
+                snowpark_column_name_in_right = None
+
+            if (
+                snowpark_column_name_in_right == original_snowpark_name
+                and snowpark_column_name_in_left != original_snowpark_name
+            ):
+                snowpark_column_name_in_left = None
+
+            if (snowpark_column_name_in_right is not None) and (
+                snowpark_column_name_in_left is not None
+            ):
+                exception = AnalysisException(
+                    f"Ambiguous column name `{spark_column_name}` in join condition"
+                )
+                attach_custom_error_code(exception, ErrorCodes.AMBIGUOUS_COLUMN_NAME)
+                raise exception
 
         snowpark_name = (
             snowpark_column_name_in_right
@@ -703,86 +822,128 @@ class JoinColumnNameMap(ColumnNameMap):
             else snowpark_column_name_in_left
         )
 
-        # this means that the reference is for the column in right dataframe but same snowpark name exist in left dataframe as well
-        # or vice versa, so we need to append _left or _right to the snowpark name
-        if (
-            snowpark_name in self.left_column_mapping.get_snowpark_columns()
-            and snowpark_column_name_in_right is not None
-        ):
-            snowpark_name = quote_name(f"{unquote_if_quoted(snowpark_name)}_right")
-        elif (
-            snowpark_name in self.right_column_mapping.get_snowpark_columns()
-            and snowpark_column_name_in_left is not None
-        ):
-            snowpark_name = quote_name(f"{unquote_if_quoted(snowpark_name)}_left")
-
         return snowpark_name
 
     def get_snowpark_column_names_from_spark_column_names(
-        self, spark_column_names: list[str], return_first: bool = False
+        self,
+        spark_column_names: list[str],
+        return_first: bool = False,
+        original_snowpark_names: list[str] | None = None,
     ) -> list[str]:
-        raise NotImplementedError("Method not implemented!")
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_spark_column_names_from_snowpark_column_names(
         self,
         snowpark_column_names: list[str],
     ) -> list[str]:
-        raise NotImplementedError("Method not implemented!")
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_spark_column_name_from_snowpark_column_name(
-        self, snowpark_column_name: str
+        self,
+        snowpark_column_name: str,
+        allow_non_exists: bool = False,
     ) -> str:
-        raise NotImplementedError("Method not implemented!")
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_spark_columns(self) -> list[str]:
-        raise NotImplementedError("Method not implemented!")
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_snowpark_columns(self) -> list[str]:
-        raise NotImplementedError("Method not implemented!")
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
-    def get_snowpark_columns_after_drop(self, cols_to_drop: list[str]) -> list[str]:
-        raise NotImplementedError("Method not implemented!")
+    def get_snowpark_columns_after_drop(
+        self, cols_to_drop: list[str]
+    ) -> list[ColumnNames]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_renamed_nested_column_name(self, name) -> str | None:
-        raise NotImplementedError("Method not implemented!")
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def has_spark_column(self, spark_column_name: str) -> bool:
-        raise NotImplementedError("Method not implemented!")
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def snowpark_to_spark_map(self) -> dict[str, str]:
-        raise NotImplementedError("Method not implemented!")
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
-    def spark_to_snowpark_for_pattern(self, pattern: str) -> list[tuple[str, str]]:
-        raise NotImplementedError("Method not implemented!")
+    def get_columns_matching_pattern(self, pattern: str) -> list[tuple[str, str]]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def with_columns(
         self, new_spark_columns: list[str], new_snowpark_columns: list[str]
-    ) -> tuple[list[str], list[str], list[list[str]]]:
-        raise NotImplementedError("Method not implemented!")
+    ) -> tuple[list[str], list[str], list[set[ColumnQualifier]]]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
-    def get_qualifiers(self) -> list[list[str]]:
-        raise NotImplementedError("Method not implemented!")
+    def get_qualifiers(self) -> list[set[ColumnQualifier]]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_qualifiers_for_columns_after_drop(
         self, cols_to_drop: list[str]
-    ) -> list[list[str]]:
-        raise NotImplementedError("Method not implemented!")
+    ) -> list[set[ColumnQualifier]]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
     def get_spark_and_snowpark_columns_with_qualifier_for_qualifier(
-        self, qualifiers_input: list[str]
-    ) -> tuple[list[str], list[str], list[list[str]]]:
-        raise NotImplementedError("Method not implemented!")
-
-    def get_qualifier_for_spark_column(self, spark_column_name: str) -> list[str]:
-
-        qualifier_left = self.left_column_mapping.get_qualifier_for_spark_column(
-            spark_column_name
+        self, target_qualifier: list[str]
+    ) -> tuple[list[str], list[str], list[set[ColumnQualifier]]]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
+
+    def get_qualifiers_for_snowpark_column(
+        self, snowpark_name: str
+    ) -> set[ColumnQualifier]:
+        qualifiers_left = self.left_column_mapping.get_qualifiers_for_snowpark_column(
+            snowpark_name
        )
-        qualifier_right = self.right_column_mapping.get_qualifier_for_spark_column(
-            spark_column_name
+        qualifiers_right = self.right_column_mapping.get_qualifiers_for_snowpark_column(
+            snowpark_name
        )
 
-        if (len(qualifier_left) > 0) and (len(qualifier_right) > 0):
-            raise AnalysisException(f"Ambiguous column name {spark_column_name}")
-
-        return qualifier_right if len(qualifier_left) == 0 else qualifier_left
+        if (len(qualifiers_left) > 0) and (len(qualifiers_right) > 0):
+            exception = AnalysisException(f"Ambiguous column name {snowpark_name}")
+            attach_custom_error_code(exception, ErrorCodes.AMBIGUOUS_COLUMN_NAME)
+            raise exception
+
+        return qualifiers_right if len(qualifiers_left) == 0 else qualifiers_left
+
+    def get_columns_after_join(
+        self, right: ColumnNameMap, join_columns: list[str], join_type: str
+    ) -> list[ColumnNames]:
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
+
+    def get_equivalent_snowpark_names_for_snowpark_name(self, snowpark_name: str):
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
+
+    def get_equivalent_snowpark_names(self):
+        exception = NotImplementedError("Method not implemented!")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
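
A recurring change throughout this diff is that bare raise AnalysisException(...) / raise NotImplementedError(...) calls are replaced by constructing the exception, attaching a custom error code, and then raising it. A minimal stand-in sketch of that calling convention (the real attach_custom_error_code and ErrorCodes live under snowflake/snowpark_connect/error/ and are not reproduced here; this standalone code only illustrates the shape):

from enum import Enum


class ErrorCodes(Enum):  # stand-in for the package's error codes
    COLUMN_NOT_FOUND = "COLUMN_NOT_FOUND"


def attach_custom_error_code(exc: Exception, code: ErrorCodes) -> None:
    # Assumption: the real helper tags the exception so clients/telemetry can read the code.
    exc.custom_error_code = code  # type: ignore[attr-defined]


try:
    exception = ValueError("Spark column name x does not exist")
    attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
    raise exception
except ValueError as e:
    print(e, getattr(e, "custom_error_code", None))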