snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. snowflake/snowpark_connect/__init__.py +1 -0
  2. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  3. snowflake/snowpark_connect/client/__init__.py +15 -0
  4. snowflake/snowpark_connect/client/error_utils.py +30 -0
  5. snowflake/snowpark_connect/client/exceptions.py +36 -0
  6. snowflake/snowpark_connect/client/query_results.py +90 -0
  7. snowflake/snowpark_connect/client/server.py +680 -0
  8. snowflake/snowpark_connect/client/utils/__init__.py +10 -0
  9. snowflake/snowpark_connect/client/utils/session.py +85 -0
  10. snowflake/snowpark_connect/column_name_handler.py +404 -243
  11. snowflake/snowpark_connect/column_qualifier.py +43 -0
  12. snowflake/snowpark_connect/config.py +237 -23
  13. snowflake/snowpark_connect/constants.py +2 -0
  14. snowflake/snowpark_connect/dataframe_container.py +102 -8
  15. snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
  16. snowflake/snowpark_connect/error/error_codes.py +50 -0
  17. snowflake/snowpark_connect/error/error_utils.py +172 -23
  18. snowflake/snowpark_connect/error/exceptions.py +13 -4
  19. snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
  20. snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
  21. snowflake/snowpark_connect/execute_plan/utils.py +5 -1
  22. snowflake/snowpark_connect/expression/function_defaults.py +9 -2
  23. snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
  24. snowflake/snowpark_connect/expression/literal.py +37 -13
  25. snowflake/snowpark_connect/expression/map_cast.py +123 -5
  26. snowflake/snowpark_connect/expression/map_expression.py +80 -27
  27. snowflake/snowpark_connect/expression/map_extension.py +322 -12
  28. snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
  29. snowflake/snowpark_connect/expression/map_udf.py +85 -20
  30. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
  31. snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
  32. snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
  33. snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
  34. snowflake/snowpark_connect/expression/map_window_function.py +18 -3
  35. snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
  36. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
  37. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
  38. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  39. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  40. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
  41. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
  42. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
  43. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
  44. snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
  45. snowflake/snowpark_connect/relation/io_utils.py +110 -10
  46. snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
  47. snowflake/snowpark_connect/relation/map_catalog.py +5 -1
  48. snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
  49. snowflake/snowpark_connect/relation/map_extension.py +263 -29
  50. snowflake/snowpark_connect/relation/map_join.py +683 -442
  51. snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
  52. snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
  53. snowflake/snowpark_connect/relation/map_relation.py +48 -19
  54. snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
  55. snowflake/snowpark_connect/relation/map_show_string.py +13 -6
  56. snowflake/snowpark_connect/relation/map_sql.py +1233 -222
  57. snowflake/snowpark_connect/relation/map_stats.py +48 -9
  58. snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
  59. snowflake/snowpark_connect/relation/map_udtf.py +14 -4
  60. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
  61. snowflake/snowpark_connect/relation/read/map_read.py +134 -43
  62. snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
  63. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
  64. snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
  65. snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
  66. snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
  67. snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
  68. snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
  69. snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
  70. snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
  71. snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
  72. snowflake/snowpark_connect/relation/read/utils.py +50 -5
  73. snowflake/snowpark_connect/relation/stage_locator.py +91 -55
  74. snowflake/snowpark_connect/relation/utils.py +128 -5
  75. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
  76. snowflake/snowpark_connect/relation/write/map_write.py +929 -319
  77. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
  78. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  79. snowflake/snowpark_connect/resources_initializer.py +110 -48
  80. snowflake/snowpark_connect/server.py +546 -456
  81. snowflake/snowpark_connect/server_common/__init__.py +500 -0
  82. snowflake/snowpark_connect/snowflake_session.py +65 -0
  83. snowflake/snowpark_connect/start_server.py +53 -5
  84. snowflake/snowpark_connect/type_mapping.py +349 -27
  85. snowflake/snowpark_connect/typed_column.py +9 -7
  86. snowflake/snowpark_connect/utils/artifacts.py +9 -8
  87. snowflake/snowpark_connect/utils/cache.py +49 -27
  88. snowflake/snowpark_connect/utils/concurrent.py +36 -1
  89. snowflake/snowpark_connect/utils/context.py +187 -37
  90. snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
  91. snowflake/snowpark_connect/utils/env_utils.py +5 -1
  92. snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
  93. snowflake/snowpark_connect/utils/identifiers.py +137 -3
  94. snowflake/snowpark_connect/utils/io_utils.py +57 -1
  95. snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
  96. snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
  97. snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
  98. snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
  99. snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
  100. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
  101. snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
  102. snowflake/snowpark_connect/utils/profiling.py +25 -8
  103. snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
  104. snowflake/snowpark_connect/utils/sequence.py +21 -0
  105. snowflake/snowpark_connect/utils/session.py +64 -28
  106. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
  107. snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
  108. snowflake/snowpark_connect/utils/telemetry.py +163 -22
  109. snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
  110. snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
  111. snowflake/snowpark_connect/utils/udf_cache.py +117 -41
  112. snowflake/snowpark_connect/utils/udf_helper.py +39 -37
  113. snowflake/snowpark_connect/utils/udf_utils.py +133 -14
  114. snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
  115. snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
  116. snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
  117. snowflake/snowpark_connect/version.py +1 -1
  118. snowflake/snowpark_decoder/dp_session.py +6 -2
  119. snowflake/snowpark_decoder/spark_decoder.py +12 -0
  120. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
  121. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
  122. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
  123. snowflake/snowpark_connect/hidden_column.py +0 -39
  124. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  125. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  126. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  127. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  128. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  129. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  130. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  131. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  132. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  133. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  134. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  135. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  136. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  137. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  138. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  139. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  140. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  141. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  142. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  143. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  144. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  145. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  146. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  147. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  148. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  149. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  150. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  151. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  152. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  153. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  154. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  155. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  156. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  157. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  158. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  159. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  160. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  161. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  162. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  163. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  164. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  165. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  166. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  167. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  168. snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
  169. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  170. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  171. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  172. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  173. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  174. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  175. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  176. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  177. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  178. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  179. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  180. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  181. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  182. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  183. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  184. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  185. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  186. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
  187. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
  188. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
  189. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
  190. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
  191. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
  192. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/read/map_read_json.py CHANGED
@@ -2,9 +2,12 @@
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
 
+import concurrent.futures
 import copy
 import json
+import os
 import typing
+import uuid
 from contextlib import suppress
 from datetime import datetime
 
@@ -12,6 +15,7 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
+from snowflake.snowpark._internal.utils import is_in_stored_procedure
 from snowflake.snowpark.row import Row
 from snowflake.snowpark.types import (
     ArrayType,
@@ -25,14 +29,21 @@ from snowflake.snowpark.types import (
     TimestampType,
 )
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.map_read import JsonReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
 from snowflake.snowpark_connect.type_mapping import (
     cast_to_match_snowpark_type,
     map_simple_types,
+    merge_different_types,
 )
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import (
@@ -40,6 +51,10 @@ from snowflake.snowpark_connect.utils.telemetry import (
 )
 
 
+def _append_node_in_trace_stack(trace_stack: str, node: str) -> str:
+    return f"{trace_stack}:{node}"
+
+
 def map_read_json(
     rel: relation_proto.Relation,
     schema: StructType | None,
@@ -58,30 +73,42 @@ def map_read_json(
 
     if rel.read.is_streaming is True:
         # TODO: Structured streaming implementation.
-        raise SnowparkConnectNotImplementedError(
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for JSON files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
    else:
        snowpark_options = options.convert_to_snowpark_args()
+        raw_options = rel.read.data_source.options
        snowpark_options["infer_schema"] = True
 
        rows_to_infer_schema = snowpark_options.pop("rowstoinferschema", 1000)
        dropFieldIfAllNull = snowpark_options.pop("dropfieldifallnull", False)
        batch_size = snowpark_options.pop("batchsize", 1000)
 
-        reader = session.read.options(snowpark_options)
+        apply_metadata_exclusion_pattern(snowpark_options)
+
+        reader = add_filename_metadata_to_reader(
+            session.read.options(snowpark_options), raw_options
+        )
 
        df = reader.json(paths[0])
        if len(paths) > 1:
            # TODO: figure out if this is what Spark does.
            for p in paths[1:]:
-                df = df.union_all(session.read.options(snowpark_options).json(p))
+                df = df.union_all(
+                    add_filename_metadata_to_reader(
+                        session.read.options(snowpark_options), raw_options
+                    ).json(p)
+                )
 
        if schema is None:
            schema = copy.deepcopy(df.schema)
            infer_row_counts = 0
 
            columns_with_valid_contents = set()
+            string_nodes_finalized = set[str]()
            for row in df.to_local_iterator():
                infer_row_counts += 1
                if (
@@ -90,7 +117,11 @@
                ):
                    break
                schema = merge_row_schema(
-                    schema, row, columns_with_valid_contents, dropFieldIfAllNull
+                    schema,
+                    row,
+                    columns_with_valid_contents,
+                    string_nodes_finalized,
+                    dropFieldIfAllNull,
                )
 
            if dropFieldIfAllNull:
@@ -100,6 +131,10 @@
                    if unquote_if_quoted(sf.name) in columns_with_valid_contents
                ]
 
+        new_schema, fields_changed = validate_and_update_schema(schema)
+        if fields_changed:
+            schema = new_schema
+
        df = construct_dataframe_by_schema(
            schema, df.to_local_iterator(), session, snowpark_options, batch_size
        )
@@ -117,66 +152,199 @@ def map_read_json(
    )
 
 
+def should_drop_field(field: StructField) -> bool:
+    if isinstance(field.datatype, StructType):
+        # "a" : {} => drop the field
+        if len(field.datatype.fields) == 0:
+            return True
+    elif (
+        isinstance(field.datatype, ArrayType)
+        and field.datatype.element_type is not None
+        and isinstance(field.datatype.element_type, StructType)
+    ):
+        if len(field.datatype.element_type.fields) == 0:
+            # "a" : [{}] => drop the field
+            return True
+    return False
+
+
+# Validate the schema to ensure it is valid for Snowflake
+# Handles these cases:
+# 1. Drops StructField([])
+# 2. Drops ArrayType(StructType([]))
+# 3. ArrayType() -> ArrayType(StringType())
+def validate_and_update_schema(schema: StructType | None) -> (StructType | None, bool):
+    if not isinstance(schema, StructType):
+        return schema, False
+    new_fields = []
+    fields_changed = False
+    for sf in schema.fields:
+        if should_drop_field(sf):
+            fields_changed = True
+            continue
+        if isinstance(sf.datatype, StructType):
+            # If the schema is a struct, validate the child schema
+            if len(sf.datatype.fields) == 0:
+                # No fields in the struct, drop the field
+                fields_changed = True
+                continue
+            child_field = StructField(sf.name, sf.datatype, sf.nullable)
+            # Recursively validate the child schema
+            child_field.datatype, child_field_changes = validate_and_update_schema(
+                sf.datatype
+            )
+            if should_drop_field(child_field):
+                fields_changed = True
+                continue
+            new_fields.append(child_field)
+            fields_changed = fields_changed or child_field_changes
+        elif isinstance(sf.datatype, ArrayType):
+            # If the schema is an array, validate the element schema
+            if sf.datatype.element_type is not None and isinstance(
+                sf.datatype.element_type, StructType
+            ):
+                # If the element schema is a struct, validate the element schema
+                if len(sf.datatype.element_type.fields) == 0:
+                    # No fields in the struct, drop the field
+                    fields_changed = True
+                    continue
+                else:
+                    # Recursively validate the element schema
+                    element_schema, element_field_changes = validate_and_update_schema(
+                        sf.datatype.element_type
+                    )
+                    if element_field_changes:
+                        sf.datatype.element_type = element_schema
+                        fields_changed = True
+                    if should_drop_field(sf):
+                        fields_changed = True
+                        continue
+            elif sf.datatype.element_type is None:
+                fields_changed = True
+                sf.datatype.element_type = StringType()
+            new_fields.append(sf)
+        else:
+            new_fields.append(sf)
+    if fields_changed:
+        schema.fields = new_fields
+    return schema, fields_changed
+
+
 def merge_json_schema(
    content: typing.Any,
    schema: StructType | None,
+    trace_stack: str,
+    string_nodes_finalized: set[str],
    dropFieldIfAllNull: bool = False,
 ) -> DataType:
+    """
+    Merge the JSON content's schema into an existing schema structure.
+
+    This function recursively processes JSON content (dict, list, or primitive values) and merges
+    its inferred schema with an existing schema if provided. It handles nested structures like
+    objects (StructType) and arrays (ArrayType), and can optionally drop fields that are always null.
+
+    Args:
+        content: The JSON content to infer schema from. Can be a dict, list, primitive value, or None.
+        schema: The existing schema to merge with, or None if inferring from scratch.
+        trace_stack: A string representing the current position in the schema hierarchy,
+            used for tracking/debugging nested structures.
+        string_nodes_finalized: A set of strings representing the nodes that have been finalized as strings.
+        dropFieldIfAllNull: If True, fields that only contain null values will be excluded
+            from the resulting schema. Defaults to False.
+
+    Returns:
+        The merged schema as a DataType. Returns NullType if content is None and no existing
+        schema is provided. For dicts, returns StructType; for lists, returns ArrayType;
+        for primitives, returns the appropriate primitive type (StringType, IntegerType, etc.).
+    """
    if content is None:
        if schema is not None:
            return schema
        return NullType()
 
-    if isinstance(content, str):
-        with suppress(json.JSONDecodeError):
-            json_content = json.loads(content)
-            if not isinstance(json_content, str):
-                content = json_content
+    if trace_stack in string_nodes_finalized:
+        return StringType()
 
    if isinstance(content, dict):
-        current_schema = StructType()
+        additional_schemas = list[StructField]()
 
        existed_schema = {}
-        if schema is not None and schema.type_name() == "struct":
-            for sf in schema.fields:
-                existed_schema[sf.name] = sf.datatype
+        if schema is not None:
+            if schema.type_name() == "struct":
+                for sf in schema.fields:
+                    existed_schema[sf.name] = sf.datatype
+            else:
+                string_nodes_finalized.add(trace_stack)
+                return StringType()
 
        for k, v in content.items():
            col_name = f'"{unquote_if_quoted(k)}"'
            existed_data_type = existed_schema.get(col_name, None)
            next_level_schema = merge_json_schema(
-                v, existed_data_type, dropFieldIfAllNull
+                v,
+                existed_data_type,
+                _append_node_in_trace_stack(trace_stack, col_name),
+                string_nodes_finalized,
+                dropFieldIfAllNull,
            )
 
-            if (
-                existed_data_type is not None
-                or not dropFieldIfAllNull
-                or not isinstance(next_level_schema, NullType)
-            ):
+            if not dropFieldIfAllNull or not isinstance(next_level_schema, NullType):
                # Drop field if it's always null
-                current_schema.add(StructField(col_name, next_level_schema))
-                if existed_data_type is not None:
-                    del existed_schema[col_name]
+                if col_name in existed_schema:
+                    existed_schema[col_name] = next_level_schema
+                else:
+                    additional_schemas.append(StructField(col_name, next_level_schema))
 
-        for k, v in existed_schema.items():
-            col_name = f'"{unquote_if_quoted(k)}"'
-            current_schema.add(StructField(col_name, v))
+        current_schema = StructType()
+        if schema is not None and schema.type_name() == "struct":
+            # Keep the order of columns in the schema
+            for sf in schema.fields:
+                col_name = f'"{unquote_if_quoted(sf.name)}"'
+                if (
+                    not dropFieldIfAllNull
+                    or existed_schema.get(col_name, NullType()) != NullType()
+                ):
+                    current_schema.add(
+                        StructField(col_name, existed_schema.get(col_name, NullType()))
+                    )
+
+        for additional_schema in additional_schemas:
+            current_schema.add(additional_schema)
 
    elif isinstance(content, list):
        # ArrayType(*) need to have element schema inside, it would be NullType() as placeholder and keep updating while enumerating
        inner_schema = NullType()
-        if schema is not None and schema.type_name() == "list":
-            inner_schema = schema.element_type
-        if len(content) > 0:
-            for v in content:
-                inner_schema = merge_json_schema(v, inner_schema, dropFieldIfAllNull)
-                if isinstance(inner_schema, NullType) and dropFieldIfAllNull:
-                    return NullType()
+        next_level_trace_stack = _append_node_in_trace_stack(trace_stack, "$array")
+
+        if schema is not None:
+            if schema.type_name() in ("list", "array"):
+                inner_schema = schema.element_type
+            else:
+                string_nodes_finalized.add(trace_stack)
+                return StringType()
+
+        if next_level_trace_stack in string_nodes_finalized:
+            inner_schema = StringType()
+        else:
+            if len(content) > 0:
+                for v in content:
+                    inner_schema = merge_json_schema(
+                        v,
+                        inner_schema,
+                        next_level_trace_stack,
+                        string_nodes_finalized,
+                        dropFieldIfAllNull,
+                    )
+                    if isinstance(inner_schema, StringType):
+                        string_nodes_finalized.add(next_level_trace_stack)
+                        break
+                if isinstance(inner_schema, NullType) and dropFieldIfAllNull:
+                    return NullType()
        current_schema = ArrayType(inner_schema)
    else:
        current_schema = map_simple_types(type(content).__name__)
 
-    # If there's conflict , use StringType
    if (
        schema is not None
        and schema != NullType()
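
Note: the three cases listed above the new validate_and_update_schema are easiest to see on a toy schema. This is a minimal sketch of the intended behavior, not code from the package; the field names are invented, and the untyped array is built by attribute assignment so the sketch does not depend on ArrayType's constructor defaults:

    from snowflake.snowpark.types import (
        ArrayType, IntegerType, StringType, StructField, StructType,
    )

    # An array whose element type was never inferred (case 3).
    untyped_array = ArrayType(StringType())
    untyped_array.element_type = None

    schema = StructType([
        StructField("a", StructType([])),             # case 1: empty struct, dropped
        StructField("b", ArrayType(StructType([]))),  # case 2: array of empty structs, dropped
        StructField("c", untyped_array),              # case 3: becomes ArrayType(StringType())
        StructField("d", IntegerType()),              # left untouched
    ])

    schema, changed = validate_and_update_schema(schema)
    # changed is True; schema now holds only "c" (ArrayType(StringType())) and "d".
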
@@ -184,28 +352,55 @@
        and current_schema != NullType()
        and schema.type_name() != current_schema.type_name()
    ):
-        return StringType()
+        current_schema = merge_different_types(schema, current_schema)
+
+    if isinstance(current_schema, StructType) or isinstance(current_schema, ArrayType):
+        current_schema.structured = True
 
-    current_schema.structured = True
+    if isinstance(current_schema, StringType):
+        string_nodes_finalized.add(trace_stack)
    return current_schema
 
 
 def merge_row_schema(
    schema: StructType | None,
    row: Row,
-    columns_with_valid_contents: set,
+    columns_with_valid_contents: set[str],
+    string_nodes_finalized: set[str],
    dropFieldIfAllNull: bool = False,
 ) -> StructType | NullType:
+    """
+    Merge the schema inferred from a single row with the existing schema.
+
+    This function updates the schema by examining each row of data and merging
+    type information. It handles nested structures (StructType, MapType, ArrayType)
+    and attempts to parse JSON strings to infer deeper schema structures.
+
+    Args:
+        schema: The current schema to merge with
+        row: A single row of data to examine
+        columns_with_valid_contents: Set to track columns that have non-null values
+        string_nodes_finalized: Set to track nodes that have been finalized as strings
+        dropFieldIfAllNull: If True, fields that are always null will be dropped
+
+    Returns:
+        The merged schema as a StructType, or NullType if the row is None and no schema exists
+    """
+
    if row is None:
        if schema is not None:
            return schema
        return NullType()
+
    new_schema = StructType()
 
    for sf in schema.fields:
        col_name = unquote_if_quoted(sf.name)
-        if isinstance(sf.datatype, (StructType, MapType, StringType)):
+        if col_name in string_nodes_finalized:
+            columns_with_valid_contents.add(col_name)
+        elif isinstance(sf.datatype, (StructType, MapType, StringType)):
            next_level_content = row[col_name]
+            next_level_trace_stack = _append_node_in_trace_stack(col_name, col_name)
            if next_level_content is not None:
                with suppress(json.JSONDecodeError):
                    if isinstance(next_level_content, datetime):
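
Note: the trace_stack strings threaded through merge_json_schema and merge_row_schema are plain colon-joined paths produced by the _append_node_in_trace_stack helper added earlier, with "$array" (or "array" at the column level) marking descent into list elements. Once a node degrades to StringType it is recorded in string_nodes_finalized, and every later visit to the same path short-circuits to StringType. Schematic values (the key names are invented, and the exact quoting of path segments follows the call sites above):

    _append_node_in_trace_stack('"a"', '"b"')         # -> '"a":"b"'        (dict key "b" under "a")
    _append_node_in_trace_stack('"a":"b"', "$array")  # -> '"a":"b":$array' (elements of that list)

    # After a type conflict finalizes '"a":"b"' as a string, the path sits in
    # string_nodes_finalized, so subsequent rows return StringType() for it
    # immediately instead of re-merging the subtree.
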
@@ -217,10 +412,13 @@
                        None
                        if not isinstance(sf.datatype, StructType)
                        else sf.datatype,
+                        next_level_trace_stack,
+                        string_nodes_finalized,
                        dropFieldIfAllNull,
                    )
            else:
                sf.datatype = StringType()
+                string_nodes_finalized.add(col_name)
            columns_with_valid_contents.add(col_name)
 
        elif isinstance(sf.datatype, ArrayType):
@@ -230,43 +428,59 @@
                decoded_content = json.loads(content)
                if isinstance(decoded_content, list):
                    content = decoded_content
-            if not isinstance(content, list):
+            if not isinstance(content, list) or col_name in string_nodes_finalized:
                sf.datatype = StringType()
+                string_nodes_finalized.add(col_name)
            else:
-                for v in content:
-                    if v is not None:
-                        columns_with_valid_contents.add(col_name)
-                    sf.datatype.element_type = merge_json_schema(
-                        v,
-                        sf.datatype.element_type,
-                        dropFieldIfAllNull,
-                    )
+                next_level_trace_stack = _append_node_in_trace_stack(
+                    col_name, "array"
+                )
+                if next_level_trace_stack in string_nodes_finalized:
+                    sf.datatype.element_type = StringType()
+                else:
+                    inner_schema = sf.datatype.element_type
+                    for v in content:
+                        if v is not None:
+                            columns_with_valid_contents.add(col_name)
+                        inner_schema = merge_json_schema(
+                            v,
+                            inner_schema,
+                            next_level_trace_stack,
+                            string_nodes_finalized,
+                            dropFieldIfAllNull,
+                        )
+                        if isinstance(inner_schema, StringType):
+                            string_nodes_finalized.add(next_level_trace_stack)
+                            break
+                    sf.datatype.element_type = inner_schema
        elif isinstance(sf.datatype, TimestampType):
            sf.datatype = StringType()
            columns_with_valid_contents.add(col_name)
+            string_nodes_finalized.add(col_name)
        elif row[col_name] is not None:
            columns_with_valid_contents.add(col_name)
 
-        sf.datatype.structured = True
+        if isinstance(sf.datatype, StructType) or isinstance(sf.datatype, ArrayType):
+            sf.datatype.structured = True
        new_schema.add(sf)
 
-    return schema
+    return new_schema
 
 
-def union_data_into_df(
-    result_df: snowpark.DataFrame,
-    data: typing.List[Row],
-    schema: StructType,
+def insert_data_chunk(
    session: snowpark.Session,
-) -> snowpark.DataFrame:
-    current_df = session.create_dataframe(
+    data: list[Row],
+    schema: StructType,
+    table_name: str,
+) -> None:
+    df = session.create_dataframe(
        data=data,
        schema=schema,
    )
-    if result_df is None:
-        return current_df
 
-    return result_df.union(current_df)
+    df.write.mode("append").save_as_table(
+        table_name, table_type="temp", table_exists=True
+    )
 
 
 def construct_dataframe_by_schema(
@@ -276,39 +490,55 @@
    snowpark_options: dict,
    batch_size: int = 1000,
 ) -> snowpark.DataFrame:
-    result = None
+    table_name = "__sas_json_read_temp_" + uuid.uuid4().hex
+
+    # We can have more workers than CPU count, this is an IO-intensive task
+    max_workers = min(16, os.cpu_count() * 2)
 
    current_data = []
    progress = 0
-    for row in rows:
-        current_data.append(construct_row_by_schema(row, schema, snowpark_options))
-        if len(current_data) >= batch_size:
+
+    # Initialize the temp table
+    session.create_dataframe([], schema=schema).write.mode("append").save_as_table(
+        table_name, table_type="temp", table_exists=False
+    )
+
+    is_running_in_stored_proc = is_in_stored_procedure()
+
+    # We are having issues in which the read is not giving correct number of rows
+    # in storedprocs when the number of workers are more than 1
+    # as a temporary fix we will make max_workers to 1
+    if is_running_in_stored_proc:
+        max_workers = 1
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as exc:
+        for row in rows:
+            current_data.append(construct_row_by_schema(row, schema, snowpark_options))
+            if len(current_data) >= batch_size:
+                progress += len(current_data)
+                exc.submit(
+                    insert_data_chunk,
+                    session,
+                    copy.deepcopy(current_data),
+                    schema,
+                    table_name,
+                )
+
+                logger.info(f"JSON reader: finished processing {progress} rows")
+                current_data.clear()
+
+        if len(current_data) > 0:
            progress += len(current_data)
-            result = union_data_into_df(
-                result,
-                current_data,
-                schema,
+            exc.submit(
+                insert_data_chunk,
                session,
+                copy.deepcopy(current_data),
+                schema,
+                table_name,
            )
-
            logger.info(f"JSON reader: finished processing {progress} rows")
-            current_data = []
-
-
-    if len(current_data) > 0:
-        progress += len(current_data)
-        result = union_data_into_df(
-            result,
-            current_data,
-            schema,
-            session,
 
-        logger.info(f"JSON reader: finished processing {progress} rows")
-        current_data = []
-
-    if result is None:
-        raise ValueError("Dataframe cannot be empty")
-    return result
+    return session.table(table_name)
 
 
 def construct_row_by_schema(
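
Note: construct_dataframe_by_schema now appends batches into one session-scoped temp table from a thread pool instead of chaining DataFrame unions. The copy.deepcopy before each submit matters: the producer loop reuses current_data and clears it right after submitting, so each worker must receive its own snapshot. A self-contained sketch of the same pattern with plain lists (no Snowpark; insert_rows is an invented stand-in for insert_data_chunk):

    import concurrent.futures
    import copy

    sink: list[list[int]] = []   # stand-in for the temp table

    def insert_rows(chunk: list[int]) -> None:
        sink.append(chunk)       # stand-in for df.write.mode("append").save_as_table(...)

    batch: list[int] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        for row in range(10):
            batch.append(row)
            if len(batch) >= 3:
                pool.submit(insert_rows, copy.deepcopy(batch))  # snapshot, not the live list
                batch.clear()                                   # safe: worker holds its own copy
        if batch:
            pool.submit(insert_rows, copy.deepcopy(batch))      # flush the final partial batch
    # Leaving the with-block joins every pending insert before results are read back.

    assert sorted(x for chunk in sink for x in chunk) == list(range(10))
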
@@ -342,17 +572,22 @@
                    content.get(col_name, None), sf.datatype, snowpark_options
                )
        else:
-            raise SnowparkConnectNotImplementedError(
+            exception = SnowparkConnectNotImplementedError(
                f"JSON construct {str(content)} to StructType failed"
            )
+            attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+            raise exception
        return result
    elif isinstance(schema, ArrayType):
        result = []
        inner_schema = schema.element_type
        if isinstance(content, str):
            content = json.loads(content)
-        for ele in content:
-            result.append(construct_row_by_schema(ele, inner_schema, snowpark_options))
+        if inner_schema is not None:
+            for ele in content:
+                result.append(
+                    construct_row_by_schema(ele, inner_schema, snowpark_options)
+                )
        return result
    elif isinstance(schema, DateType):
        return cast_to_match_snowpark_type(