snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (200)
  1. snowflake/snowpark_connect/__init__.py +1 -0
  2. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  3. snowflake/snowpark_connect/client/__init__.py +15 -0
  4. snowflake/snowpark_connect/client/error_utils.py +30 -0
  5. snowflake/snowpark_connect/client/exceptions.py +36 -0
  6. snowflake/snowpark_connect/client/query_results.py +90 -0
  7. snowflake/snowpark_connect/client/server.py +717 -0
  8. snowflake/snowpark_connect/client/utils/__init__.py +10 -0
  9. snowflake/snowpark_connect/client/utils/session.py +85 -0
  10. snowflake/snowpark_connect/column_name_handler.py +404 -243
  11. snowflake/snowpark_connect/column_qualifier.py +43 -0
  12. snowflake/snowpark_connect/config.py +309 -26
  13. snowflake/snowpark_connect/constants.py +2 -0
  14. snowflake/snowpark_connect/dataframe_container.py +102 -8
  15. snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
  16. snowflake/snowpark_connect/error/error_codes.py +50 -0
  17. snowflake/snowpark_connect/error/error_utils.py +172 -23
  18. snowflake/snowpark_connect/error/exceptions.py +13 -4
  19. snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
  20. snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
  21. snowflake/snowpark_connect/execute_plan/utils.py +5 -1
  22. snowflake/snowpark_connect/expression/error_utils.py +28 -0
  23. snowflake/snowpark_connect/expression/function_defaults.py +9 -2
  24. snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
  25. snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
  26. snowflake/snowpark_connect/expression/literal.py +37 -13
  27. snowflake/snowpark_connect/expression/map_cast.py +224 -15
  28. snowflake/snowpark_connect/expression/map_expression.py +80 -27
  29. snowflake/snowpark_connect/expression/map_extension.py +322 -12
  30. snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
  31. snowflake/snowpark_connect/expression/map_udf.py +86 -20
  32. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
  33. snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
  34. snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
  35. snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
  36. snowflake/snowpark_connect/expression/map_window_function.py +18 -3
  37. snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
  38. snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
  39. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
  40. snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
  41. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
  42. snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
  43. snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
  44. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
  45. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
  46. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  47. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  48. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
  49. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
  50. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
  51. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
  52. snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
  53. snowflake/snowpark_connect/relation/io_utils.py +110 -10
  54. snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
  55. snowflake/snowpark_connect/relation/map_catalog.py +5 -1
  56. snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
  57. snowflake/snowpark_connect/relation/map_extension.py +263 -29
  58. snowflake/snowpark_connect/relation/map_join.py +683 -442
  59. snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
  60. snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
  61. snowflake/snowpark_connect/relation/map_relation.py +48 -19
  62. snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
  63. snowflake/snowpark_connect/relation/map_show_string.py +13 -6
  64. snowflake/snowpark_connect/relation/map_sql.py +1233 -222
  65. snowflake/snowpark_connect/relation/map_stats.py +48 -9
  66. snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
  67. snowflake/snowpark_connect/relation/map_udtf.py +14 -4
  68. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
  69. snowflake/snowpark_connect/relation/read/map_read.py +134 -43
  70. snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
  71. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
  72. snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
  73. snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
  74. snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
  75. snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
  76. snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
  77. snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
  78. snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
  79. snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
  80. snowflake/snowpark_connect/relation/read/utils.py +50 -5
  81. snowflake/snowpark_connect/relation/stage_locator.py +91 -55
  82. snowflake/snowpark_connect/relation/utils.py +128 -5
  83. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
  84. snowflake/snowpark_connect/relation/write/map_write.py +929 -319
  85. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
  86. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  87. snowflake/snowpark_connect/resources_initializer.py +171 -48
  88. snowflake/snowpark_connect/server.py +528 -473
  89. snowflake/snowpark_connect/server_common/__init__.py +503 -0
  90. snowflake/snowpark_connect/snowflake_session.py +65 -0
  91. snowflake/snowpark_connect/start_server.py +53 -5
  92. snowflake/snowpark_connect/type_mapping.py +349 -27
  93. snowflake/snowpark_connect/type_support.py +130 -0
  94. snowflake/snowpark_connect/typed_column.py +9 -7
  95. snowflake/snowpark_connect/utils/artifacts.py +9 -8
  96. snowflake/snowpark_connect/utils/cache.py +49 -27
  97. snowflake/snowpark_connect/utils/concurrent.py +36 -1
  98. snowflake/snowpark_connect/utils/context.py +195 -37
  99. snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
  100. snowflake/snowpark_connect/utils/env_utils.py +5 -1
  101. snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
  102. snowflake/snowpark_connect/utils/identifiers.py +137 -3
  103. snowflake/snowpark_connect/utils/io_utils.py +57 -1
  104. snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
  105. snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
  106. snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
  107. snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
  108. snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
  109. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
  110. snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
  111. snowflake/snowpark_connect/utils/profiling.py +25 -8
  112. snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
  113. snowflake/snowpark_connect/utils/sequence.py +21 -0
  114. snowflake/snowpark_connect/utils/session.py +64 -28
  115. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
  116. snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
  117. snowflake/snowpark_connect/utils/telemetry.py +192 -40
  118. snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
  119. snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
  120. snowflake/snowpark_connect/utils/udf_cache.py +117 -41
  121. snowflake/snowpark_connect/utils/udf_helper.py +39 -37
  122. snowflake/snowpark_connect/utils/udf_utils.py +133 -14
  123. snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
  124. snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
  125. snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
  126. snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
  127. snowflake/snowpark_connect/version.py +1 -1
  128. snowflake/snowpark_decoder/dp_session.py +6 -2
  129. snowflake/snowpark_decoder/spark_decoder.py +12 -0
  130. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
  131. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
  132. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
  133. snowflake/snowpark_connect/hidden_column.py +0 -39
  134. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  135. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  136. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  137. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  138. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  139. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  140. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  141. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  142. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  143. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  144. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  145. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  146. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  147. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  148. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  149. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  150. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  151. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  152. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  153. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  154. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  155. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  156. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  157. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  158. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  159. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  160. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  161. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  162. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  163. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  164. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  165. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  166. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  167. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  168. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  169. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  170. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  171. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  172. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  173. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  174. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  175. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  176. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  177. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  178. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  179. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  180. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  181. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  182. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  183. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  184. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  185. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  186. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  187. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  188. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  189. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  190. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  191. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  192. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  193. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  194. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
  195. {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
  196. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
  197. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
  198. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
  199. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
  200. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
@@ -2,9 +2,12 @@
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
 
+import concurrent.futures
 import copy
 import json
+import os
 import typing
+import uuid
 from contextlib import suppress
 from datetime import datetime
 
@@ -12,6 +15,7 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
+from snowflake.snowpark._internal.utils import is_in_stored_procedure
 from snowflake.snowpark.row import Row
 from snowflake.snowpark.types import (
     ArrayType,
@@ -25,21 +29,33 @@ from snowflake.snowpark.types import (
     TimestampType,
 )
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.map_read import JsonReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
 from snowflake.snowpark_connect.type_mapping import (
     cast_to_match_snowpark_type,
     map_simple_types,
+    merge_different_types,
 )
+from snowflake.snowpark_connect.type_support import emulate_integral_types
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
 
 
+def _append_node_in_trace_stack(trace_stack: str, node: str) -> str:
+    return f"{trace_stack}:{node}"
+
+
 def map_read_json(
     rel: relation_proto.Relation,
     schema: StructType | None,
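
The new `_append_node_in_trace_stack` helper builds colon-delimited keys that identify a node's position in the JSON tree; `merge_json_schema` uses them to remember which nodes have been pinned to `StringType` so later rows can short-circuit re-inference. A minimal sketch of how the keys accumulate (the column names are made up):

```python
def _append_node_in_trace_stack(trace_stack: str, node: str) -> str:
    return f"{trace_stack}:{node}"

# Hypothetical walk: column "payload" -> field "items" -> array elements.
root = '"payload"'
nested = _append_node_in_trace_stack(root, '"items"')  # '"payload":"items"'
elems = _append_node_in_trace_stack(nested, "$array")  # '"payload":"items":$array'

# Once a key lands in string_nodes_finalized, later rows skip re-inference:
string_nodes_finalized = {elems}
assert elems in string_nodes_finalized
```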
@@ -58,30 +74,42 @@ def map_read_json(
 
     if rel.read.is_streaming is True:
         # TODO: Structured streaming implementation.
-        raise SnowparkConnectNotImplementedError(
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for JSON files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
     else:
         snowpark_options = options.convert_to_snowpark_args()
+        raw_options = rel.read.data_source.options
         snowpark_options["infer_schema"] = True
 
         rows_to_infer_schema = snowpark_options.pop("rowstoinferschema", 1000)
         dropFieldIfAllNull = snowpark_options.pop("dropfieldifallnull", False)
         batch_size = snowpark_options.pop("batchsize", 1000)
 
-        reader = session.read.options(snowpark_options)
+        apply_metadata_exclusion_pattern(snowpark_options)
+
+        reader = add_filename_metadata_to_reader(
+            session.read.options(snowpark_options), raw_options
+        )
 
         df = reader.json(paths[0])
         if len(paths) > 1:
             # TODO: figure out if this is what Spark does.
             for p in paths[1:]:
-                df = df.union_all(session.read.options(snowpark_options).json(p))
+                df = df.union_all(
+                    add_filename_metadata_to_reader(
+                        session.read.options(snowpark_options), raw_options
+                    ).json(p)
+                )
 
         if schema is None:
             schema = copy.deepcopy(df.schema)
             infer_row_counts = 0
 
             columns_with_valid_contents = set()
+            string_nodes_finalized = set[str]()
             for row in df.to_local_iterator():
                 infer_row_counts += 1
                 if (
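
`add_filename_metadata_to_reader` and `apply_metadata_exclusion_pattern` are new internal helpers (see `relation/read/metadata_utils.py` in the file list above); they appear to back Spark's `_metadata.file_name` column for file reads. A hedged sketch of the general technique, assuming Snowpark's `DataFrameReader.with_metadata` and Snowflake's `METADATA$FILENAME` staged-file column; the function name and stage path below are illustrative:

```python
from snowflake.snowpark import Session

def read_json_with_filename(session: Session, stage_path: str):
    # METADATA$FILENAME is Snowflake's per-row source-file column for staged
    # reads; with_metadata() asks the reader to project it alongside the data.
    # This is one plausible way to emulate Spark's _metadata.file_name.
    reader = session.read.option("infer_schema", True).with_metadata(
        "METADATA$FILENAME"
    )
    return reader.json(stage_path)  # e.g. "@my_stage/events/"
```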
@@ -90,7 +118,11 @@ def map_read_json(
                 ):
                     break
                 schema = merge_row_schema(
-                    schema, row, columns_with_valid_contents, dropFieldIfAllNull
+                    schema,
+                    row,
+                    columns_with_valid_contents,
+                    string_nodes_finalized,
+                    dropFieldIfAllNull,
                 )
 
             if dropFieldIfAllNull:
@@ -100,6 +132,10 @@ def map_read_json(
                     if unquote_if_quoted(sf.name) in columns_with_valid_contents
                 ]
 
+        new_schema, fields_changed = validate_and_update_schema(schema)
+        if fields_changed:
+            schema = new_schema
+
         df = construct_dataframe_by_schema(
             schema, df.to_local_iterator(), session, snowpark_options, batch_size
         )
@@ -113,70 +149,205 @@ def map_read_json(
             dataframe=renamed_df,
             spark_column_names=spark_column_names,
             snowpark_column_names=snowpark_column_names,
-            snowpark_column_types=[f.datatype for f in df.schema.fields],
+            snowpark_column_types=[
+                emulate_integral_types(f.datatype) for f in df.schema.fields
+            ],
         )
 
 
+def should_drop_field(field: StructField) -> bool:
+    if isinstance(field.datatype, StructType):
+        # "a" : {} => drop the field
+        if len(field.datatype.fields) == 0:
+            return True
+    elif (
+        isinstance(field.datatype, ArrayType)
+        and field.datatype.element_type is not None
+        and isinstance(field.datatype.element_type, StructType)
+    ):
+        if len(field.datatype.element_type.fields) == 0:
+            # "a" : [{}] => drop the field
+            return True
+    return False
+
+
+# Validate the schema to ensure it is valid for Snowflake
+# Handles these cases:
+# 1. Drops StructField([])
+# 2. Drops ArrayType(StructType([]))
+# 3. ArrayType() -> ArrayType(StringType())
+def validate_and_update_schema(schema: StructType | None) -> (StructType | None, bool):
+    if not isinstance(schema, StructType):
+        return schema, False
+    new_fields = []
+    fields_changed = False
+    for sf in schema.fields:
+        if should_drop_field(sf):
+            fields_changed = True
+            continue
+        if isinstance(sf.datatype, StructType):
+            # If the schema is a struct, validate the child schema
+            if len(sf.datatype.fields) == 0:
+                # No fields in the struct, drop the field
+                fields_changed = True
+                continue
+            child_field = StructField(sf.name, sf.datatype, sf.nullable)
+            # Recursively validate the child schema
+            child_field.datatype, child_field_changes = validate_and_update_schema(
+                sf.datatype
+            )
+            if should_drop_field(child_field):
+                fields_changed = True
+                continue
+            new_fields.append(child_field)
+            fields_changed = fields_changed or child_field_changes
+        elif isinstance(sf.datatype, ArrayType):
+            # If the schema is an array, validate the element schema
+            if sf.datatype.element_type is not None and isinstance(
+                sf.datatype.element_type, StructType
+            ):
+                # If the element schema is a struct, validate the element schema
+                if len(sf.datatype.element_type.fields) == 0:
+                    # No fields in the struct, drop the field
+                    fields_changed = True
+                    continue
+                else:
+                    # Recursively validate the element schema
+                    element_schema, element_field_changes = validate_and_update_schema(
+                        sf.datatype.element_type
+                    )
+                    if element_field_changes:
+                        sf.datatype.element_type = element_schema
+                        fields_changed = True
+                    if should_drop_field(sf):
+                        fields_changed = True
+                        continue
+            elif sf.datatype.element_type is None:
+                fields_changed = True
+                sf.datatype.element_type = StringType()
+            new_fields.append(sf)
+        else:
+            new_fields.append(sf)
+    if fields_changed:
+        schema.fields = new_fields
+    return schema, fields_changed
+
+
 def merge_json_schema(
     content: typing.Any,
     schema: StructType | None,
+    trace_stack: str,
+    string_nodes_finalized: set[str],
     dropFieldIfAllNull: bool = False,
 ) -> DataType:
+    """
+    Merge the JSON content's schema into an existing schema structure.
+
+    This function recursively processes JSON content (dict, list, or primitive values) and merges
+    its inferred schema with an existing schema if provided. It handles nested structures like
+    objects (StructType) and arrays (ArrayType), and can optionally drop fields that are always null.
+
+    Args:
+        content: The JSON content to infer schema from. Can be a dict, list, primitive value, or None.
+        schema: The existing schema to merge with, or None if inferring from scratch.
+        trace_stack: A string representing the current position in the schema hierarchy,
+            used for tracking/debugging nested structures.
+        string_nodes_finalized: A set of strings representing the nodes that have been finalized as strings.
+        dropFieldIfAllNull: If True, fields that only contain null values will be excluded
+            from the resulting schema. Defaults to False.
+
+    Returns:
+        The merged schema as a DataType. Returns NullType if content is None and no existing
+        schema is provided. For dicts, returns StructType; for lists, returns ArrayType;
+        for primitives, returns the appropriate primitive type (StringType, IntegerType, etc.).
+    """
     if content is None:
         if schema is not None:
             return schema
         return NullType()
 
-    if isinstance(content, str):
-        with suppress(json.JSONDecodeError):
-            json_content = json.loads(content)
-            if not isinstance(json_content, str):
-                content = json_content
+    if trace_stack in string_nodes_finalized:
+        return StringType()
 
     if isinstance(content, dict):
-        current_schema = StructType()
+        additional_schemas = list[StructField]()
 
         existed_schema = {}
-        if schema is not None and schema.type_name() == "struct":
-            for sf in schema.fields:
-                existed_schema[sf.name] = sf.datatype
+        if schema is not None:
+            if schema.type_name() == "struct":
+                for sf in schema.fields:
+                    existed_schema[sf.name] = sf.datatype
+            else:
+                string_nodes_finalized.add(trace_stack)
+                return StringType()
 
         for k, v in content.items():
             col_name = f'"{unquote_if_quoted(k)}"'
             existed_data_type = existed_schema.get(col_name, None)
             next_level_schema = merge_json_schema(
-                v, existed_data_type, dropFieldIfAllNull
+                v,
+                existed_data_type,
+                _append_node_in_trace_stack(trace_stack, col_name),
+                string_nodes_finalized,
+                dropFieldIfAllNull,
             )
 
-            if (
-                existed_data_type is not None
-                or not dropFieldIfAllNull
-                or not isinstance(next_level_schema, NullType)
-            ):
+            if not dropFieldIfAllNull or not isinstance(next_level_schema, NullType):
                 # Drop field if it's always null
-                current_schema.add(StructField(col_name, next_level_schema))
-                if existed_data_type is not None:
-                    del existed_schema[col_name]
+                if col_name in existed_schema:
+                    existed_schema[col_name] = next_level_schema
+                else:
+                    additional_schemas.append(StructField(col_name, next_level_schema))
 
-        for k, v in existed_schema.items():
-            col_name = f'"{unquote_if_quoted(k)}"'
-            current_schema.add(StructField(col_name, v))
+        current_schema = StructType()
+        if schema is not None and schema.type_name() == "struct":
+            # Keep the order of columns in the schema
+            for sf in schema.fields:
+                col_name = f'"{unquote_if_quoted(sf.name)}"'
+                if (
+                    not dropFieldIfAllNull
+                    or existed_schema.get(col_name, NullType()) != NullType()
+                ):
+                    current_schema.add(
+                        StructField(col_name, existed_schema.get(col_name, NullType()))
+                    )
+
+        for additional_schema in additional_schemas:
+            current_schema.add(additional_schema)
 
     elif isinstance(content, list):
         # ArrayType(*) need to have element schema inside, it would be NullType() as placeholder and keep updating while enumerating
         inner_schema = NullType()
-        if schema is not None and schema.type_name() == "list":
-            inner_schema = schema.element_type
-        if len(content) > 0:
-            for v in content:
-                inner_schema = merge_json_schema(v, inner_schema, dropFieldIfAllNull)
-        if isinstance(inner_schema, NullType) and dropFieldIfAllNull:
-            return NullType()
+        next_level_trace_stack = _append_node_in_trace_stack(trace_stack, "$array")
+
+        if schema is not None:
+            if schema.type_name() in ("list", "array"):
+                inner_schema = schema.element_type
+            else:
+                string_nodes_finalized.add(trace_stack)
+                return StringType()
+
+        if next_level_trace_stack in string_nodes_finalized:
+            inner_schema = StringType()
+        else:
+            if len(content) > 0:
+                for v in content:
+                    inner_schema = merge_json_schema(
+                        v,
+                        inner_schema,
+                        next_level_trace_stack,
+                        string_nodes_finalized,
+                        dropFieldIfAllNull,
+                    )
+                    if isinstance(inner_schema, StringType):
+                        string_nodes_finalized.add(next_level_trace_stack)
+                        break
+            if isinstance(inner_schema, NullType) and dropFieldIfAllNull:
+                return NullType()
         current_schema = ArrayType(inner_schema)
     else:
         current_schema = map_simple_types(type(content).__name__)
 
-    # If there's conflict , use StringType
     if (
         schema is not None
         and schema != NullType()
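
The three rewrite rules in `validate_and_update_schema` are easiest to see on a concrete schema. A small illustration with made-up field names, using only types already imported by this module:

```python
from snowflake.snowpark.types import ArrayType, StringType, StructField, StructType

schema = StructType(
    [
        StructField("a", StructType([])),             # rule 1: empty struct, dropped
        StructField("b", ArrayType(StructType([]))),  # rule 2: array of empty structs, dropped
        StructField("c", ArrayType()),                # rule 3: untyped array -> ArrayType(StringType())
        StructField("d", StringType()),               # left untouched
    ]
)

new_schema, changed = validate_and_update_schema(schema)
# changed -> True; remaining fields -> "c" and "d";
# the "c" field's datatype is now ArrayType(StringType())
```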
@@ -184,28 +355,55 @@ def merge_json_schema(
         and current_schema != NullType()
         and schema.type_name() != current_schema.type_name()
     ):
-        return StringType()
+        current_schema = merge_different_types(schema, current_schema)
+
+    if isinstance(current_schema, StructType) or isinstance(current_schema, ArrayType):
+        current_schema.structured = True
 
-    current_schema.structured = True
+    if isinstance(current_schema, StringType):
+        string_nodes_finalized.add(trace_stack)
     return current_schema
 
 
 def merge_row_schema(
     schema: StructType | None,
     row: Row,
-    columns_with_valid_contents: set,
+    columns_with_valid_contents: set[str],
+    string_nodes_finalized: set[str],
     dropFieldIfAllNull: bool = False,
 ) -> StructType | NullType:
+    """
+    Merge the schema inferred from a single row with the existing schema.
+
+    This function updates the schema by examining each row of data and merging
+    type information. It handles nested structures (StructType, MapType, ArrayType)
+    and attempts to parse JSON strings to infer deeper schema structures.
+
+    Args:
+        schema: The current schema to merge with
+        row: A single row of data to examine
+        columns_with_valid_contents: Set to track columns that have non-null values
+        string_nodes_finalized: Set to track nodes that have been finalized as strings
+        dropFieldIfAllNull: If True, fields that are always null will be dropped
+
+    Returns:
+        The merged schema as a StructType, or NullType if the row is None and no schema exists
+    """
+
     if row is None:
         if schema is not None:
             return schema
         return NullType()
+
     new_schema = StructType()
 
     for sf in schema.fields:
         col_name = unquote_if_quoted(sf.name)
-        if isinstance(sf.datatype, (StructType, MapType, StringType)):
+        if col_name in string_nodes_finalized:
+            columns_with_valid_contents.add(col_name)
+        elif isinstance(sf.datatype, (StructType, MapType, StringType)):
             next_level_content = row[col_name]
+            next_level_trace_stack = _append_node_in_trace_stack(col_name, col_name)
             if next_level_content is not None:
                 with suppress(json.JSONDecodeError):
                     if isinstance(next_level_content, datetime):
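
Before 1.7.0, any per-node type conflict between rows collapsed the node straight to `StringType`; the merge now goes through `merge_different_types` from `type_mapping` (whose changes appear in the file list above but not in this diff). A hypothetical stand-in to illustrate the kind of policy such a merge can apply; the real rules live in `type_mapping.py`:

```python
from snowflake.snowpark.types import DataType, DoubleType, LongType, StringType

def merge_different_types_sketch(left: DataType, right: DataType) -> DataType:
    # Illustrative only: widen compatible numerics, otherwise fall back to
    # StringType (the pre-1.7.0 behavior for every conflict).
    if isinstance(left, (LongType, DoubleType)) and isinstance(
        right, (LongType, DoubleType)
    ):
        return DoubleType() if DoubleType in (type(left), type(right)) else LongType()
    return StringType()

# merge_different_types_sketch(LongType(), DoubleType())  -> DoubleType()
# merge_different_types_sketch(LongType(), StringType())  -> StringType()
```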
@@ -217,10 +415,13 @@ def merge_row_schema(
                         None
                         if not isinstance(sf.datatype, StructType)
                         else sf.datatype,
+                        next_level_trace_stack,
+                        string_nodes_finalized,
                         dropFieldIfAllNull,
                     )
             else:
                 sf.datatype = StringType()
+                string_nodes_finalized.add(col_name)
                 columns_with_valid_contents.add(col_name)
 
         elif isinstance(sf.datatype, ArrayType):
@@ -230,43 +431,59 @@ def merge_row_schema(
                     decoded_content = json.loads(content)
                     if isinstance(decoded_content, list):
                         content = decoded_content
-            if not isinstance(content, list):
+            if not isinstance(content, list) or col_name in string_nodes_finalized:
                 sf.datatype = StringType()
+                string_nodes_finalized.add(col_name)
             else:
-                for v in content:
-                    if v is not None:
-                        columns_with_valid_contents.add(col_name)
-                    sf.datatype.element_type = merge_json_schema(
-                        v,
-                        sf.datatype.element_type,
-                        dropFieldIfAllNull,
-                    )
+                next_level_trace_stack = _append_node_in_trace_stack(
+                    col_name, "array"
+                )
+                if next_level_trace_stack in string_nodes_finalized:
+                    sf.datatype.element_type = StringType()
+                else:
+                    inner_schema = sf.datatype.element_type
+                    for v in content:
+                        if v is not None:
+                            columns_with_valid_contents.add(col_name)
+                        inner_schema = merge_json_schema(
+                            v,
+                            inner_schema,
+                            next_level_trace_stack,
+                            string_nodes_finalized,
+                            dropFieldIfAllNull,
+                        )
+                        if isinstance(inner_schema, StringType):
+                            string_nodes_finalized.add(next_level_trace_stack)
+                            break
+                    sf.datatype.element_type = inner_schema
         elif isinstance(sf.datatype, TimestampType):
             sf.datatype = StringType()
             columns_with_valid_contents.add(col_name)
+            string_nodes_finalized.add(col_name)
         elif row[col_name] is not None:
             columns_with_valid_contents.add(col_name)
 
-        sf.datatype.structured = True
+        if isinstance(sf.datatype, StructType) or isinstance(sf.datatype, ArrayType):
+            sf.datatype.structured = True
         new_schema.add(sf)
 
-    return schema
+    return new_schema
 
 
-def union_data_into_df(
-    result_df: snowpark.DataFrame,
-    data: typing.List[Row],
-    schema: StructType,
+def insert_data_chunk(
     session: snowpark.Session,
-) -> snowpark.DataFrame:
-    current_df = session.create_dataframe(
+    data: list[Row],
+    schema: StructType,
+    table_name: str,
+) -> None:
+    df = session.create_dataframe(
         data=data,
         schema=schema,
     )
-    if result_df is None:
-        return current_df
 
-    return result_df.union(current_df)
+    df.write.mode("append").save_as_table(
+        table_name, table_type="temp", table_exists=True
+    )
 
 
 def construct_dataframe_by_schema(
@@ -276,39 +493,55 @@ def construct_dataframe_by_schema(
     snowpark_options: dict,
     batch_size: int = 1000,
 ) -> snowpark.DataFrame:
-    result = None
+    table_name = "__sas_json_read_temp_" + uuid.uuid4().hex
+
+    # We can have more workers than CPU count, this is an IO-intensive task
+    max_workers = min(16, os.cpu_count() * 2)
 
     current_data = []
     progress = 0
-    for row in rows:
-        current_data.append(construct_row_by_schema(row, schema, snowpark_options))
-        if len(current_data) >= batch_size:
+
+    # Initialize the temp table
+    session.create_dataframe([], schema=schema).write.mode("append").save_as_table(
+        table_name, table_type="temp", table_exists=False
+    )
+
+    is_running_in_stored_proc = is_in_stored_procedure()
+
+    # We are having issues in which the read is not giving correct number of rows
+    # in storedprocs when the number of workers are more than 1
+    # as a temporary fix we will make max_workers to 1
+    if is_running_in_stored_proc:
+        max_workers = 1
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as exc:
+        for row in rows:
+            current_data.append(construct_row_by_schema(row, schema, snowpark_options))
+            if len(current_data) >= batch_size:
+                progress += len(current_data)
+                exc.submit(
+                    insert_data_chunk,
+                    session,
+                    copy.deepcopy(current_data),
+                    schema,
+                    table_name,
+                )
+
+                logger.info(f"JSON reader: finished processing {progress} rows")
+                current_data.clear()
+
+        if len(current_data) > 0:
             progress += len(current_data)
-            result = union_data_into_df(
-                result,
-                current_data,
-                schema,
+            exc.submit(
+                insert_data_chunk,
                 session,
+                copy.deepcopy(current_data),
+                schema,
+                table_name,
             )
-
             logger.info(f"JSON reader: finished processing {progress} rows")
-            current_data = []
-
-    if len(current_data) > 0:
-        progress += len(current_data)
-        result = union_data_into_df(
-            result,
-            current_data,
-            schema,
-            session,
-        )
 
-        logger.info(f"JSON reader: finished processing {progress} rows")
-        current_data = []
-
-    if result is None:
-        raise ValueError("Dataframe cannot be empty")
-    return result
+    return session.table(table_name)
 
 
 def construct_row_by_schema(
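
The rewritten `construct_dataframe_by_schema` drops the old `union_data_into_df` chain in favor of appending each batch to a session-scoped temp table from a thread pool, so the final plan is a single table scan instead of an N-deep stack of unions. The pattern, reduced to its essentials (the table prefix and worker count are illustrative):

```python
import concurrent.futures
import copy
import uuid

def write_rows_in_chunks(session, rows, schema, batch_size=1000, max_workers=4):
    # Create an empty temp table up front, then let worker threads append
    # deep-copied batches to it; deepcopy gives each worker a stable snapshot
    # while the main thread keeps reusing (and clearing) the same list.
    table_name = "__demo_chunked_write_" + uuid.uuid4().hex
    session.create_dataframe([], schema=schema).write.mode("append").save_as_table(
        table_name, table_type="temp", table_exists=False
    )
    batch = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        for row in rows:
            batch.append(row)
            if len(batch) >= batch_size:
                pool.submit(insert_data_chunk, session, copy.deepcopy(batch), schema, table_name)
                batch.clear()
        if batch:
            pool.submit(insert_data_chunk, session, copy.deepcopy(batch), schema, table_name)
    return session.table(table_name)
```

Note the diff also pins `max_workers` to 1 inside stored procedures as a stated workaround for a row-count issue with concurrent appends.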
@@ -342,17 +575,22 @@ def construct_row_by_schema(
                 content.get(col_name, None), sf.datatype, snowpark_options
             )
         else:
-            raise SnowparkConnectNotImplementedError(
+            exception = SnowparkConnectNotImplementedError(
                 f"JSON construct {str(content)} to StructType failed"
             )
+            attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+            raise exception
         return result
     elif isinstance(schema, ArrayType):
         result = []
         inner_schema = schema.element_type
         if isinstance(content, str):
             content = json.loads(content)
-        for ele in content:
-            result.append(construct_row_by_schema(ele, inner_schema, snowpark_options))
+        if inner_schema is not None:
+            for ele in content:
+                result.append(
+                    construct_row_by_schema(ele, inner_schema, snowpark_options)
+                )
         return result
     elif isinstance(schema, DateType):
         return cast_to_match_snowpark_type(
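
For reference, the reader options this code pops (`rowstoinferschema`, `dropfieldifallnull`, `batchsize`) are matched on lower-cased keys, so a Spark Connect client would set them roughly as below; the camelCase spellings and the connect URL are assumptions in Spark's usual style, and only `dropFieldIfAllNull` is a standard Spark JSON option:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

df = (
    spark.read.option("rowsToInferSchema", 2000)  # rows sampled during inference
    .option("dropFieldIfAllNull", True)           # prune fields that are always null
    .option("batchSize", 500)                     # batch size for temp-table appends
    .json("@my_stage/events/")                    # illustrative stage path
)
df.printSchema()
```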