snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (200)
  1. snowflake/snowpark_connect/__init__.py +1 -0
  2. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  3. snowflake/snowpark_connect/client/__init__.py +15 -0
  4. snowflake/snowpark_connect/client/error_utils.py +30 -0
  5. snowflake/snowpark_connect/client/exceptions.py +36 -0
  6. snowflake/snowpark_connect/client/query_results.py +90 -0
  7. snowflake/snowpark_connect/client/server.py +717 -0
  8. snowflake/snowpark_connect/client/utils/__init__.py +10 -0
  9. snowflake/snowpark_connect/client/utils/session.py +85 -0
  10. snowflake/snowpark_connect/column_name_handler.py +404 -243
  11. snowflake/snowpark_connect/column_qualifier.py +43 -0
  12. snowflake/snowpark_connect/config.py +309 -26
  13. snowflake/snowpark_connect/constants.py +2 -0
  14. snowflake/snowpark_connect/dataframe_container.py +102 -8
  15. snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
  16. snowflake/snowpark_connect/error/error_codes.py +50 -0
  17. snowflake/snowpark_connect/error/error_utils.py +172 -23
  18. snowflake/snowpark_connect/error/exceptions.py +13 -4
  19. snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
  20. snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
  21. snowflake/snowpark_connect/execute_plan/utils.py +5 -1
  22. snowflake/snowpark_connect/expression/error_utils.py +28 -0
  23. snowflake/snowpark_connect/expression/function_defaults.py +9 -2
  24. snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
  25. snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
  26. snowflake/snowpark_connect/expression/literal.py +37 -13
  27. snowflake/snowpark_connect/expression/map_cast.py +224 -15
  28. snowflake/snowpark_connect/expression/map_expression.py +80 -27
  29. snowflake/snowpark_connect/expression/map_extension.py +322 -12
  30. snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
  31. snowflake/snowpark_connect/expression/map_udf.py +86 -20
  32. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
  33. snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
  34. snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
  35. snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
  36. snowflake/snowpark_connect/expression/map_window_function.py +18 -3
  37. snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
  39. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
  43. snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
  44. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
  45. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
  46. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  47. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  48. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
  49. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
  50. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
  51. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
  52. snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
  53. snowflake/snowpark_connect/relation/io_utils.py +110 -10
  54. snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
  55. snowflake/snowpark_connect/relation/map_catalog.py +5 -1
  56. snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
  57. snowflake/snowpark_connect/relation/map_extension.py +263 -29
  58. snowflake/snowpark_connect/relation/map_join.py +683 -442
  59. snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
  60. snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
  61. snowflake/snowpark_connect/relation/map_relation.py +48 -19
  62. snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
  63. snowflake/snowpark_connect/relation/map_show_string.py +13 -6
  64. snowflake/snowpark_connect/relation/map_sql.py +1233 -222
  65. snowflake/snowpark_connect/relation/map_stats.py +48 -9
  66. snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
  67. snowflake/snowpark_connect/relation/map_udtf.py +14 -4
  68. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
  69. snowflake/snowpark_connect/relation/read/map_read.py +134 -43
  70. snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
  71. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
  72. snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
  73. snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
  74. snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
  75. snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
  76. snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
  77. snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
  78. snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
  79. snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
  80. snowflake/snowpark_connect/relation/read/utils.py +50 -5
  81. snowflake/snowpark_connect/relation/stage_locator.py +91 -55
  82. snowflake/snowpark_connect/relation/utils.py +128 -5
  83. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
  84. snowflake/snowpark_connect/relation/write/map_write.py +929 -319
  85. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
  86. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  87. snowflake/snowpark_connect/resources_initializer.py +171 -48
  88. snowflake/snowpark_connect/server.py +528 -473
  89. snowflake/snowpark_connect/server_common/__init__.py +503 -0
  90. snowflake/snowpark_connect/snowflake_session.py +65 -0
  91. snowflake/snowpark_connect/start_server.py +53 -5
  92. snowflake/snowpark_connect/type_mapping.py +349 -27
  93. snowflake/snowpark_connect/type_support.py +130 -0
  94. snowflake/snowpark_connect/typed_column.py +9 -7
  95. snowflake/snowpark_connect/utils/artifacts.py +9 -8
  96. snowflake/snowpark_connect/utils/cache.py +49 -27
  97. snowflake/snowpark_connect/utils/concurrent.py +36 -1
  98. snowflake/snowpark_connect/utils/context.py +195 -37
  99. snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
  100. snowflake/snowpark_connect/utils/env_utils.py +5 -1
  101. snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
  102. snowflake/snowpark_connect/utils/identifiers.py +137 -3
  103. snowflake/snowpark_connect/utils/io_utils.py +57 -1
  104. snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
  105. snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
  106. snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
  107. snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
  108. snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
  109. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
  110. snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
  111. snowflake/snowpark_connect/utils/profiling.py +25 -8
  112. snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
  113. snowflake/snowpark_connect/utils/sequence.py +21 -0
  114. snowflake/snowpark_connect/utils/session.py +64 -28
  115. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
  116. snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
  117. snowflake/snowpark_connect/utils/telemetry.py +192 -40
  118. snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
  119. snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
  120. snowflake/snowpark_connect/utils/udf_cache.py +117 -41
  121. snowflake/snowpark_connect/utils/udf_helper.py +39 -37
  122. snowflake/snowpark_connect/utils/udf_utils.py +133 -14
  123. snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
  124. snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
  125. snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
  126. snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
  127. snowflake/snowpark_connect/version.py +1 -1
  128. snowflake/snowpark_decoder/dp_session.py +6 -2
  129. snowflake/snowpark_decoder/spark_decoder.py +12 -0
  130. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
  131. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
  132. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
  133. snowflake/snowpark_connect/hidden_column.py +0 -39
  134. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  135. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  136. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  137. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  138. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  139. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  140. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  141. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  142. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  143. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  144. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  145. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  146. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  147. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  148. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  149. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  150. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  151. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  152. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  153. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  154. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  155. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  156. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  157. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  158. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  159. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  160. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  161. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  162. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  163. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  164. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  165. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  166. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  167. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  168. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  169. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  170. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  171. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  172. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  173. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  174. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  175. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  176. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  177. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  178. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  179. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  180. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  181. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  182. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  183. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  184. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  185. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  186. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  187. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  188. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  189. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  190. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  191. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  192. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  193. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  194. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
  195. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
  196. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
  197. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
  198. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
  199. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
  200. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/map_row_ops.py

@@ -1,15 +1,15 @@
 #
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
-
-
+import pandas
 import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 from pyspark.errors.exceptions.base import AnalysisException, IllegalArgumentException

 import snowflake.snowpark_connect.relation.utils as utils
 from snowflake import snowpark
-from snowflake.snowpark.functions import col, expr as snowpark_expr
+from snowflake.snowpark._internal.error_message import SnowparkClientExceptionMessages
+from snowflake.snowpark.functions import col, expr as snowpark_expr, lit
 from snowflake.snowpark.types import (
     BooleanType,
     ByteType,
@@ -20,21 +20,90 @@ from snowflake.snowpark.types import (
     LongType,
     NullType,
     ShortType,
+    StructField,
+    StructType,
+)
+from snowflake.snowpark_connect.column_name_handler import (
+    ColumnNameMap,
+    schema_getter,
+    set_schema_getter,
 )
-from snowflake.snowpark_connect.column_name_handler import ColumnNameMap, schema_getter
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.expression.literal import get_literal_field_and_name
 from snowflake.snowpark_connect.expression.map_expression import (
     map_single_column_expression,
 )
 from snowflake.snowpark_connect.expression.typer import ExpressionTyper
 from snowflake.snowpark_connect.relation.map_relation import map_relation
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    without_internal_columns,
+)
+from snowflake.snowpark_connect.utils.identifiers import (
+    split_fully_qualified_spark_name,
+)
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )


+def cast_columns(
+    df_container: DataFrameContainer,
+    df_dtypes: list[snowpark.types.DataType],
+    target_dtypes: list[snowpark.types.DataType],
+    column_map: ColumnNameMap,
+):
+    df: snowpark.DataFrame = df_container.dataframe
+    if df_dtypes == target_dtypes:
+        return df_container
+    # Use cached schema if available to avoid triggering extra queries
+    if (
+        hasattr(df_container, "cached_schema_getter")
+        and df_container.cached_schema_getter is not None
+    ):
+        df_schema = df_container.cached_schema_getter()
+    else:
+        df_schema = df.schema  # Get current schema
+    new_columns = []
+
+    for i, field in enumerate(df_schema.fields):
+        col_name = field.name
+        current_type = field.datatype
+        target_type = target_dtypes[i]
+
+        if current_type != target_type:
+            new_columns.append(df[col_name].cast(target_type).alias(col_name))
+        else:
+            new_columns.append(df[col_name])
+
+    new_df = df.select(new_columns)
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=new_df,
+        spark_column_names=column_map.get_spark_columns(),
+        snowpark_column_names=column_map.get_snowpark_columns(),
+        snowpark_column_types=target_dtypes,
+        column_metadata=column_map.column_metadata,
+        parent_column_name_map=column_map,
+    )
+
+
+def get_schema_from_result(
+    result: DataFrameContainer,
+) -> StructType:
+    """
+    Get schema from a DataFrameContainer, using cached schema if available to avoid extra queries.
+    """
+    if (
+        hasattr(result, "cached_schema_getter")
+        and result.cached_schema_getter is not None
+    ):
+        return result.cached_schema_getter()
+    else:
+        return result.dataframe.schema
+
+
 def map_deduplicate(
     rel: relation_proto.Relation,
 ) -> DataFrameContainer:
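The two helpers above are hoisted to module level so the set-operation mappers can share them. A minimal usage sketch (illustrative only, not part of the package; `some_container` is a hypothetical `DataFrameContainer`): instead of reading `some_container.dataframe.schema`, which the code comments note triggers an extra schema query, callers go through the helper:

    # Illustrative sketch, not part of the diff
    schema = get_schema_from_result(some_container)        # cached schema if available
    dtypes = [field.datatype for field in schema.fields]   # no extra query when the cache is set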
@@ -43,16 +112,18 @@ def map_deduplicate(
 
     The deduplicate is a list of columns that is applied to the DataFrame.
     """
-    input_container = map_relation(rel.deduplicate.input)
+    input_container = without_internal_columns(map_relation(rel.deduplicate.input))
     input_df = input_container.dataframe

     if (
         rel.deduplicate.HasField("within_watermark")
         and rel.deduplicate.within_watermark
     ):
-        raise AnalysisException(
+        exception = AnalysisException(
             "dropDuplicatesWithinWatermark is not supported with batch DataFrames/DataSets"
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception

     if (
         rel.deduplicate.HasField("all_columns_as_keys")
@@ -81,7 +152,7 @@ def map_dropna(
     """
     Drop NA values from the input DataFrame.
     """
-    input_container = map_relation(rel.drop_na.input)
+    input_container = without_internal_columns(map_relation(rel.drop_na.input))
     input_df = input_container.dataframe

     if rel.drop_na.HasField("min_non_nulls"):
@@ -122,15 +193,23 @@ def map_fillna(
 
     The `fill_value` is a scalar value that will be used to replace NaN values.
     """
-    input_container = map_relation(rel.fill_na.input)
+    input_container = without_internal_columns(map_relation(rel.fill_na.input))
     input_df = input_container.dataframe

     if len(rel.fill_na.cols) > 0:
+        if rel.fill_na.cols == ["*"]:
+            # Expand "*" to all columns
+            spark_col_names = input_container.column_map.get_spark_columns()
+        else:
+            spark_col_names = list(rel.fill_na.cols)
+
+        # We don't validate the fully qualified spark name here as fillNa is no-op for structured type colums.
+        # It only works for scalar type columns like float, int, string or bool.
         columns: list[str] = [
             input_container.column_map.get_snowpark_column_name_from_spark_column_name(
-                c
+                split_fully_qualified_spark_name(c)[0]
             )
-            for c in rel.fill_na.cols
+            for c in spark_col_names
         ]
         values = [get_literal_field_and_name(v)[0] for v in rel.fill_na.values]
         if len(values) == 1:
@@ -177,29 +256,16 @@ def map_union(
 
     The two DataFrames must have the same schema.
     """
-    left_result = map_relation(rel.set_op.left_input)
-    right_result = map_relation(rel.set_op.right_input)
+    left_result = without_internal_columns(map_relation(rel.set_op.left_input))
+    right_result = without_internal_columns(map_relation(rel.set_op.right_input))
     left_df = left_result.dataframe
     right_df = right_result.dataframe
     allow_missing_columns = bool(rel.set_op.allow_missing_columns)

     # workaround for unstructured type vs structured type
     # Use cached schema if available to avoid triggering extra queries
-    if (
-        hasattr(left_result, "cached_schema_getter")
-        and left_result.cached_schema_getter is not None
-    ):
-        left_schema = left_result.cached_schema_getter()
-    else:
-        left_schema = left_df.schema
-
-    if (
-        hasattr(right_result, "cached_schema_getter")
-        and right_result.cached_schema_getter is not None
-    ):
-        right_schema = right_result.cached_schema_getter()
-    else:
-        right_schema = right_df.schema
+    left_schema = get_schema_from_result(left_result)
+    right_schema = get_schema_from_result(right_result)

     left_dtypes = [field.datatype for field in left_schema.fields]
     right_dtypes = [field.datatype for field in right_schema.fields]
@@ -207,7 +273,9 @@
     spark_sql_ansi_enabled = global_config.spark_sql_ansi_enabled
     if left_dtypes != right_dtypes and not rel.set_op.by_name:
         if len(left_dtypes) != len(right_dtypes):
-            raise AnalysisException("UNION: the number of columns must match")
+            exception = AnalysisException("UNION: the number of columns must match")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
         target_left_dtypes, target_right_dtypes = [], []
         for left_type, right_type in zip(left_dtypes, right_dtypes):
             match (left_type, right_type):
@@ -235,6 +303,29 @@
                     # Union of any type with null type is of the other type
                     target_left_dtypes.append(other_t)
                     target_right_dtypes.append(other_t)
+                case (snowpark.types.DecimalType(), snowpark.types.DecimalType()):
+                    # Widen decimal types to accommodate both sides
+                    # Calculate the maximum scale and maximum integer digits
+                    left_integer_digits = left_type.precision - left_type.scale
+                    right_integer_digits = right_type.precision - right_type.scale
+
+                    # The common type needs to accommodate:
+                    # - The maximum number of digits after the decimal point (scale)
+                    # - The maximum number of digits before the decimal point (integer digits)
+                    common_scale = max(left_type.scale, right_type.scale)
+                    common_integer_digits = max(
+                        left_integer_digits, right_integer_digits
+                    )
+                    common_precision = min(38, common_scale + common_integer_digits)
+
+                    # Ensure scale doesn't exceed precision
+                    common_scale = min(common_scale, common_precision)
+
+                    common_type = snowpark.types.DecimalType(
+                        common_precision, common_scale
+                    )
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
                 case (snowpark.types.BooleanType(), _) | (
                     _,
                     snowpark.types.BooleanType(),
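To make the decimal widening above concrete, here is a small worked example (illustrative only; the input types are made up and the arithmetic simply mirrors the hunk above). Unioning DECIMAL(10, 2) with DECIMAL(8, 5) keeps the larger integer-digit count (8) and the larger scale (5), giving DECIMAL(13, 5), with precision capped at Snowflake's maximum of 38:

    # Illustrative sketch of the widening rule shown above, not part of the diff
    left_precision, left_scale = 10, 2    # 8 integer digits
    right_precision, right_scale = 8, 5   # 3 integer digits
    common_scale = max(left_scale, right_scale)                         # 5
    common_integer_digits = max(left_precision - left_scale,
                                right_precision - right_scale)          # 8
    common_precision = min(38, common_scale + common_integer_digits)    # 13
    common_scale = min(common_scale, common_precision)                  # still 5
    # -> both sides are cast to DecimalType(13, 5)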
@@ -243,54 +334,31 @@
                     not spark_sql_ansi_enabled
                     or snowpark.types.StringType() not in [left_type, right_type]
                 ): # In ansi mode , string type union boolean type is acceptable
-                    raise AnalysisException(
+                    exception = AnalysisException(
                         f"""[INCOMPATIBLE_COLUMN_TYPE] UNION can only be performed on tables with compatible column types. "{str(left_type)}" type which is not compatible with "{str(right_type)}". """
                     )
+                    attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+                    raise exception
                     target_left_dtypes.append(left_type)
                     target_right_dtypes.append(right_type)
+                case (
+                    snowpark.types.TimestampType()
+                    | snowpark.types.DateType()
+                    | snowpark.types._NumericType(),
+                    snowpark.types.StringType(),
+                ) | (
+                    snowpark.types.StringType(),
+                    snowpark.types.TimestampType()
+                    | snowpark.types.DateType()
+                    | snowpark.types._NumericType(),
+                ) if not spark_sql_ansi_enabled:
+                    common_type = snowpark.types.StringType()
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
                 case _:
                     target_left_dtypes.append(left_type)
                     target_right_dtypes.append(right_type)

-        def cast_columns(
-            df_container: DataFrameContainer,
-            df_dtypes: list[snowpark.types.DataType],
-            target_dtypes: list[snowpark.types.DataType],
-            column_map: ColumnNameMap,
-        ):
-            df: snowpark.DataFrame = df_container.dataframe
-            if df_dtypes == target_dtypes:
-                return df_container
-            # Use cached schema if available to avoid triggering extra queries
-            if (
-                hasattr(df_container, "cached_schema_getter")
-                and df_container.cached_schema_getter is not None
-            ):
-                df_schema = df_container.cached_schema_getter()
-            else:
-                df_schema = df.schema # Get current schema
-            new_columns = []
-
-            for i, field in enumerate(df_schema.fields):
-                col_name = field.name
-                current_type = field.datatype
-                target_type = target_dtypes[i]
-
-                if current_type != target_type:
-                    new_columns.append(df[col_name].cast(target_type).alias(col_name))
-                else:
-                    new_columns.append(df[col_name])
-
-            new_df = df.select(new_columns)
-            return DataFrameContainer.create_with_column_mapping(
-                dataframe=new_df,
-                spark_column_names=column_map.get_spark_columns(),
-                snowpark_column_names=column_map.get_snowpark_columns(),
-                snowpark_column_types=target_dtypes,
-                column_metadata=column_map.column_metadata,
-                parent_column_name_map=column_map,
-            )
-
         left_result = cast_columns(
             left_result,
             left_dtypes,
@@ -318,23 +386,37 @@ def map_union(
     right_column_map = right_result.column_map
     columns_to_restore: dict[str, tuple[str, str]] = {}

-    for column in right_df.columns:
+    original_right_schema = right_df.schema
+    right_renamed_fields = []
+    for field in original_right_schema.fields:
         spark_name = (
-            right_column_map.get_spark_column_name_from_snowpark_column_name(column)
+            right_column_map.get_spark_column_name_from_snowpark_column_name(
+                field.name
+            )
+        )
+        right_df = right_df.withColumnRenamed(field.name, spark_name)
+        columns_to_restore[spark_name.upper()] = (spark_name, field.name)
+        right_renamed_fields.append(
+            StructField(spark_name, field.datatype, field.nullable)
         )
-        right_df = right_df.withColumnRenamed(column, spark_name)
-        columns_to_restore[spark_name.upper()] = (spark_name, column)
+    set_schema_getter(right_df, lambda: StructType(right_renamed_fields))

-    for column in left_df.columns:
+    original_left_schema = left_df.schema
+    left_renamed_fields = []
+    for field in original_left_schema.fields:
         spark_name = (
-            left_column_map.get_spark_column_name_from_snowpark_column_name(column)
+            left_column_map.get_spark_column_name_from_snowpark_column_name(
+                field.name
+            )
+        )
+        left_df = left_df.withColumnRenamed(field.name, spark_name)
+        columns_to_restore[spark_name.upper()] = (spark_name, field.name)
+        left_renamed_fields.append(
+            StructField(spark_name, field.datatype, field.nullable)
         )
-        left_df = left_df.withColumnRenamed(column, spark_name)
-        columns_to_restore[spark_name.upper()] = (spark_name, column)
+    set_schema_getter(left_df, lambda: StructType(left_renamed_fields))

-    result = left_df.unionAllByName(
-        right_df, allow_missing_columns=allow_missing_columns
-    )
+    result = _union_by_name_optimized(left_df, right_df, allow_missing_columns)

     if allow_missing_columns:
         spark_columns = []
@@ -421,8 +503,8 @@ def map_intersect(
     | b| 3|
     +---+---+
     """
-    left_result = map_relation(rel.set_op.left_input)
-    right_result = map_relation(rel.set_op.right_input)
+    left_result = without_internal_columns(map_relation(rel.set_op.left_input))
+    right_result = without_internal_columns(map_relation(rel.set_op.right_input))
     left_df = left_result.dataframe
     right_df = right_result.dataframe

@@ -484,11 +566,53 @@ def map_except(
     | c| 4|
     +---+---+
     """
-    left_result = map_relation(rel.set_op.left_input)
-    right_result = map_relation(rel.set_op.right_input)
+    left_result = without_internal_columns(map_relation(rel.set_op.left_input))
+    right_result = without_internal_columns(map_relation(rel.set_op.right_input))
     left_df = left_result.dataframe
     right_df = right_result.dataframe

+    # workaround for unstructured type vs structured type
+    # Use cached schema if available to avoid triggering extra queries
+    left_schema = get_schema_from_result(left_result)
+    right_schema = get_schema_from_result(right_result)
+
+    left_dtypes = [field.datatype for field in left_schema.fields]
+    right_dtypes = [field.datatype for field in right_schema.fields]
+
+    if left_dtypes != right_dtypes and not rel.set_op.by_name:
+        if len(left_dtypes) != len(right_dtypes):
+            exception = AnalysisException("UNION: the number of columns must match")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
+        target_left_dtypes, target_right_dtypes = [], []
+        for left_type, right_type in zip(left_dtypes, right_dtypes):
+            match (left_type, right_type):
+                case (snowpark.types._NumericType(), snowpark.types.StringType()) | (
+                    snowpark.types.StringType(),
+                    snowpark.types._NumericType(),
+                ):
+                    common_type = snowpark.types.StringType()
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
+                case _:
+                    target_left_dtypes.append(left_type)
+                    target_right_dtypes.append(right_type)
+
+        left_result = cast_columns(
+            left_result,
+            left_dtypes,
+            target_left_dtypes,
+            left_result.column_map,
+        )
+        right_result = cast_columns(
+            right_result,
+            right_dtypes,
+            target_right_dtypes,
+            right_result.column_map,
+        )
+        left_df = left_result.dataframe
+        right_df = right_result.dataframe
+
     if rel.set_op.is_all:
         # Snowflake except removes all duplicated rows. In order to handle the case,
         # we add a partition row number column to the df to make duplicated rows unique to
@@ -573,13 +697,18 @@ def map_filter(
 
 def map_limit(
     rel: relation_proto.Relation,
-) -> DataFrameContainer:
+) -> DataFrameContainer | pandas.DataFrame:
     """
     Limit a DataFrame based on a Relation's limit.

     The limit is an integer that is applied to the DataFrame.
     """
-    input_container = map_relation(rel.limit.input)
+
+    input_container = without_internal_columns(map_relation(rel.limit.input))
+
+    if isinstance(input_container, pandas.DataFrame):
+        return input_container.head(rel.limit.limit)
+
     input_df = input_container.dataframe

     result: snowpark.DataFrame = input_df.limit(rel.limit.limit)
@@ -601,7 +730,7 @@ def map_offset(
 
     The offset is an integer that is applied to the DataFrame.
     """
-    input_container = map_relation(rel.offset.input)
+    input_container = without_internal_columns(map_relation(rel.offset.input))
     input_df = input_container.dataframe

     # TODO: This is a terrible way to have to do this, but Snowpark does not
@@ -629,7 +758,7 @@ def map_replace(
     values to replace. The values in the dictionary are the values to replace
     and the keys are the values to replace them with.
     """
-    result = map_relation(rel.replace.input)
+    result = without_internal_columns(map_relation(rel.replace.input))
     input_df = result.dataframe
     ordered_columns = input_df.columns
     column_map = result.column_map
@@ -752,12 +881,14 @@ def map_sample(
     """
     Sample a DataFrame based on a Relation's sample.
     """
-    input_container = map_relation(rel.sample.input)
+    input_container = without_internal_columns(map_relation(rel.sample.input))
     input_df = input_container.dataframe

     frac = rel.sample.upper_bound - rel.sample.lower_bound
     if frac < 0 or frac > 1:
-        raise IllegalArgumentException("Sample fraction must be between 0 and 1")
+        exception = IllegalArgumentException("Sample fraction must be between 0 and 1")
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
     # The seed argument is not supported here. There are a number of reasons that implementing
     # this will be complicated in Snowflake. Here is a list of complications:
     #
@@ -772,9 +903,11 @@ def map_sample(
     # these issues.
     if rel.sample.with_replacement:
         # TODO: Use a random number generator with ROW_NUMBER and SELECT.
-        raise SnowparkConnectNotImplementedError(
+        exception = SnowparkConnectNotImplementedError(
             "Sample with replacement is not supported"
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
     else:
         result: snowpark.DataFrame = input_df.sample(frac=frac)
         return DataFrameContainer(
@@ -794,7 +927,7 @@ def map_tail(
 
     The tail is an integer that is applied to the DataFrame.
     """
-    input_container = map_relation(rel.tail.input)
+    input_container = without_internal_columns(map_relation(rel.tail.input))
     input_df = input_container.dataframe

     num_rows = input_df.count()
@@ -809,3 +942,89 @@
         alias=input_container.alias,
         cached_schema_getter=lambda: input_df.schema,
     )
+
+
+def _union_by_name_optimized(
+    left_df: snowpark.DataFrame,
+    right_df: snowpark.DataFrame,
+    allow_missing_columns: bool = False,
+) -> snowpark.DataFrame:
+    """
+    This implementation is an optimized version of Snowpark's Dataframe::_union_by_name_internal.
+    The only change is, that it avoids redundant schema queries that occur in the standard Snowpark,
+    by reusing already-fetched/calculated schemas.
+    """
+
+    left_schema = left_df.schema
+    right_schema = right_df.schema
+
+    left_cols = {field.name for field in left_schema.fields}
+    right_cols = {field.name for field in right_schema.fields}
+    right_field_map = {field.name: field for field in right_schema.fields}
+
+    missing_left = right_cols - left_cols
+    missing_right = left_cols - right_cols
+
+    def add_nulls(
+        missing_cols: set[str], to_df: snowpark.DataFrame, from_df: snowpark.DataFrame
+    ) -> snowpark.DataFrame:
+        dt_map = {field.name: field.datatype for field in from_df.schema.fields}
+        result = to_df.select(
+            "*",
+            *[lit(None).cast(dt_map[col]).alias(col) for col in missing_cols],
+        )
+
+        result_fields = []
+        for field in to_df.schema.fields:
+            result_fields.append(
+                StructField(field.name, field.datatype, field.nullable)
+            )
+        for col_name in missing_cols:
+            from_field = next(
+                field for field in from_df.schema.fields if field.name == col_name
+            )
+            result_fields.append(
+                StructField(col_name, from_field.datatype, from_field.nullable)
+            )
+
+        set_schema_getter(result, lambda: StructType(result_fields))
+
+        return result
+
+    if missing_left or missing_right:
+        if allow_missing_columns:
+            left = left_df
+            right = right_df
+            if missing_left:
+                left = add_nulls(missing_left, left, right)
+            if missing_right:
+                right = add_nulls(missing_right, right, left)
+            result = left._union_by_name_internal(right, is_all=True)
+
+            result_fields = []
+            for field in left_schema.fields:
+                result_fields.append(
+                    StructField(field.name, field.datatype, field.nullable)
+                )
+            for col_name in missing_left:
+                right_field = right_field_map[col_name]
+                result_fields.append(
+                    StructField(col_name, right_field.datatype, right_field.nullable)
+                )
+
+            set_schema_getter(result, lambda: StructType(result_fields))
+            return result
+        else:
+            exception = (
+                SnowparkClientExceptionMessages.DF_CANNOT_RESOLVE_COLUMN_NAME_AMONG(
+                    missing_left, missing_right
+                )
+            )
+            attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+            raise exception
+
+    result = left_df.unionAllByName(
+        right_df, allow_missing_columns=allow_missing_columns
+    )
+    set_schema_getter(result, lambda: left_df.schema)
+    return result
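The new `_union_by_name_optimized` path reproduces the client-visible semantics of Spark's `unionByName`, padding missing columns with typed NULLs and attaching precomputed schemas via `set_schema_getter`. For reference, the PySpark-level behaviour it has to match looks like this (illustrative snippet, not part of the diff; assumes an existing SparkSession named `spark`):

    # Client-side Spark behaviour this server path implements (illustrative)
    df1 = spark.createDataFrame([(1, 2)], ["a", "b"])
    df2 = spark.createDataFrame([(3, 4)], ["b", "c"])
    df1.unionByName(df2, allowMissingColumns=True).show()
    # Columns missing on either side ("c" on the left, "a" on the right) are filled with NULLs;
    # with allowMissingColumns=False the same union raises an analysis error instead.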
snowflake/snowpark_connect/relation/map_show_string.py

@@ -15,6 +15,9 @@ from snowflake.snowpark_connect.column_name_handler import set_schema_getter
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.map_relation import map_relation
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    without_internal_columns,
+)


 def map_show_string(rel: relation_proto.Relation) -> pandas.DataFrame:
@@ -26,14 +29,17 @@ def map_show_string(rel: relation_proto.Relation) -> pandas.DataFrame:
     Buffer object as a single cell.
     """
     input_df_container: DataFrameContainer = map_relation(rel.show_string.input)
-    raw_input_df = input_df_container.dataframe
-    input_df = _handle_datetype_columns(raw_input_df)
+    filtered_container = without_internal_columns(input_df_container)
+    display_df = filtered_container.dataframe
+    display_spark_columns = filtered_container.column_map.get_spark_columns()
+
+    input_df = _handle_datetype_columns(display_df)

     show_string = input_df._show_string_spark(
         num_rows=rel.show_string.num_rows,
         truncate=rel.show_string.truncate,
         vertical=rel.show_string.vertical,
-        _spark_column_names=input_df_container.column_map.get_spark_columns(),
+        _spark_column_names=display_spark_columns,
         _spark_session_tz=global_config.spark_sql_session_timeZone,
     )
     return pandas.DataFrame({"show_string": [show_string]})
@@ -44,14 +50,15 @@ def map_repr_html(rel: relation_proto.Relation) -> pandas.DataFrame:
     Generate the html string representation of the input dataframe.
     """
     input_df_container: DataFrameContainer = map_relation(rel.html_string.input)
-    input_df = input_df_container.dataframe

+    filtered_container = without_internal_columns(input_df_container)
+    input_df = filtered_container.dataframe
     input_panda = input_df.toPandas()
     input_panda.rename(
         columns={
             analyzer_utils.unquote_if_quoted(
-                input_df_container.column_map.get_snowpark_columns()[i]
-            ): input_df_container.column_map.get_spark_columns()[i]
+                filtered_container.column_map.get_snowpark_columns()[i]
+            ): filtered_container.column_map.get_spark_columns()[i]
             for i in range(len(input_panda.columns))
         },
         inplace=True,