snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +680 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +237 -23
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +123 -5
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +85 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
- snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
- snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +110 -48
- snowflake/snowpark_connect/server.py +546 -456
- snowflake/snowpark_connect/server_common/__init__.py +500 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +187 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +163 -22
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/read/map_read_json.py

@@ -2,9 +2,12 @@
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
 
+import concurrent.futures
 import copy
 import json
+import os
 import typing
+import uuid
 from contextlib import suppress
 from datetime import datetime
 
@@ -12,6 +15,7 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
+from snowflake.snowpark._internal.utils import is_in_stored_procedure
 from snowflake.snowpark.row import Row
 from snowflake.snowpark.types import (
     ArrayType,
@@ -25,14 +29,21 @@ from snowflake.snowpark.types import (
     TimestampType,
 )
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.map_read import JsonReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
 from snowflake.snowpark_connect.type_mapping import (
     cast_to_match_snowpark_type,
     map_simple_types,
+    merge_different_types,
 )
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import (
@@ -40,6 +51,10 @@ from snowflake.snowpark_connect.utils.telemetry import (
 )
 
 
+def _append_node_in_trace_stack(trace_stack: str, node: str) -> str:
+    return f"{trace_stack}:{node}"
+
+
 def map_read_json(
     rel: relation_proto.Relation,
     schema: StructType | None,
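The helper above builds the path keys that the reworked schema inference threads through `merge_json_schema` and `merge_row_schema`: every nested node gets a `parent:child` key, and nodes whose type has degraded to a string are remembered in a `string_nodes_finalized` set so later rows skip re-inferring them. A minimal standalone sketch of that keying (plain Python; only `_append_node_in_trace_stack` itself comes from the diff, the surrounding names are illustrative):

```python
# Sketch: how the trace-stack keys identify nested JSON nodes during inference.
def _append_node_in_trace_stack(trace_stack: str, node: str) -> str:
    return f"{trace_stack}:{node}"

string_nodes_finalized: set[str] = set()

# For a column "payload" holding {"tags": [...]}, the array-element node is:
tags = _append_node_in_trace_stack("payload", '"tags"')
element = _append_node_in_trace_stack(tags, "$array")
print(element)  # payload:"tags":$array

# Once the element's types conflict and it is finalized as a string,
# later rows short-circuit to StringType instead of re-merging schemas:
string_nodes_finalized.add(element)
assert element in string_nodes_finalized
```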
@@ -58,30 +73,42 @@ def map_read_json(
 
     if rel.read.is_streaming is True:
         # TODO: Structured streaming implementation.
-        raise SnowparkConnectNotImplementedError(
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for JSON files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
     else:
         snowpark_options = options.convert_to_snowpark_args()
+        raw_options = rel.read.data_source.options
        snowpark_options["infer_schema"] = True
 
         rows_to_infer_schema = snowpark_options.pop("rowstoinferschema", 1000)
         dropFieldIfAllNull = snowpark_options.pop("dropfieldifallnull", False)
         batch_size = snowpark_options.pop("batchsize", 1000)
 
-        reader = session.read.options(snowpark_options)
+        apply_metadata_exclusion_pattern(snowpark_options)
+
+        reader = add_filename_metadata_to_reader(
+            session.read.options(snowpark_options), raw_options
+        )
 
         df = reader.json(paths[0])
         if len(paths) > 1:
             # TODO: figure out if this is what Spark does.
             for p in paths[1:]:
-                df = df.union_all(session.read.options(snowpark_options).json(p))
+                df = df.union_all(
+                    add_filename_metadata_to_reader(
+                        session.read.options(snowpark_options), raw_options
+                    ).json(p)
+                )
 
         if schema is None:
             schema = copy.deepcopy(df.schema)
             infer_row_counts = 0
 
             columns_with_valid_contents = set()
+            string_nodes_finalized = set[str]()
             for row in df.to_local_iterator():
                 infer_row_counts += 1
                 if (
@@ -90,7 +117,11 @@ def map_read_json(
                 ):
                     break
                 schema = merge_row_schema(
-                    schema, row, columns_with_valid_contents, dropFieldIfAllNull
+                    schema,
+                    row,
+                    columns_with_valid_contents,
+                    string_nodes_finalized,
+                    dropFieldIfAllNull,
                 )
 
         if dropFieldIfAllNull:
@@ -100,6 +131,10 @@ def map_read_json(
                     if unquote_if_quoted(sf.name) in columns_with_valid_contents
                 ]
 
+            new_schema, fields_changed = validate_and_update_schema(schema)
+            if fields_changed:
+                schema = new_schema
+
         df = construct_dataframe_by_schema(
             schema, df.to_local_iterator(), session, snowpark_options, batch_size
         )
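The next hunk introduces `should_drop_field` and `validate_and_update_schema`, a post-inference pass that prunes inferred shapes Snowflake cannot store: empty objects, arrays of empty objects, and untyped arrays (which default to arrays of strings). A hypothetical pure-Python restatement of those rules over plain dicts, just to make the recursion easier to follow (the dict encoding is illustrative, not the package's types):

```python
# Illustrative only: the pruning rules of validate_and_update_schema restated
# over plain dicts: {"struct": {...}}, {"array": elem_or_None}, "string".
def prune(schema):
    if isinstance(schema, dict) and "struct" in schema:
        fields = {k: prune(v) for k, v in schema["struct"].items()}
        fields = {k: v for k, v in fields.items() if v is not None}
        return {"struct": fields} if fields else None  # rule 1: drop empty structs
    if isinstance(schema, dict) and "array" in schema:
        if schema["array"] is None:
            return {"array": "string"}                 # rule 3: untyped -> string
        elem = prune(schema["array"])
        return {"array": elem} if elem is not None else None  # rule 2: [{}] -> drop
    return schema

inferred = {"struct": {"a": {"struct": {}}, "b": {"array": None}, "c": "string"}}
print(prune(inferred))  # {'struct': {'b': {'array': 'string'}, 'c': 'string'}}
```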
@@ -117,66 +152,199 @@ def map_read_json(
     )
 
 
+def should_drop_field(field: StructField) -> bool:
+    if isinstance(field.datatype, StructType):
+        # "a" : {} => drop the field
+        if len(field.datatype.fields) == 0:
+            return True
+    elif (
+        isinstance(field.datatype, ArrayType)
+        and field.datatype.element_type is not None
+        and isinstance(field.datatype.element_type, StructType)
+    ):
+        if len(field.datatype.element_type.fields) == 0:
+            # "a" : [{}] => drop the field
+            return True
+    return False
+
+
+# Validate the schema to ensure it is valid for Snowflake
+# Handles these cases:
+# 1. Drops StructField([])
+# 2. Drops ArrayType(StructType([]))
+# 3. ArrayType() -> ArrayType(StringType())
+def validate_and_update_schema(schema: StructType | None) -> (StructType | None, bool):
+    if not isinstance(schema, StructType):
+        return schema, False
+    new_fields = []
+    fields_changed = False
+    for sf in schema.fields:
+        if should_drop_field(sf):
+            fields_changed = True
+            continue
+        if isinstance(sf.datatype, StructType):
+            # If the schema is a struct, validate the child schema
+            if len(sf.datatype.fields) == 0:
+                # No fields in the struct, drop the field
+                fields_changed = True
+                continue
+            child_field = StructField(sf.name, sf.datatype, sf.nullable)
+            # Recursively validate the child schema
+            child_field.datatype, child_field_changes = validate_and_update_schema(
+                sf.datatype
+            )
+            if should_drop_field(child_field):
+                fields_changed = True
+                continue
+            new_fields.append(child_field)
+            fields_changed = fields_changed or child_field_changes
+        elif isinstance(sf.datatype, ArrayType):
+            # If the schema is an array, validate the element schema
+            if sf.datatype.element_type is not None and isinstance(
+                sf.datatype.element_type, StructType
+            ):
+                # If the element schema is a struct, validate the element schema
+                if len(sf.datatype.element_type.fields) == 0:
+                    # No fields in the struct, drop the field
+                    fields_changed = True
+                    continue
+                else:
+                    # Recursively validate the element schema
+                    element_schema, element_field_changes = validate_and_update_schema(
+                        sf.datatype.element_type
+                    )
+                    if element_field_changes:
+                        sf.datatype.element_type = element_schema
+                        fields_changed = True
+                    if should_drop_field(sf):
+                        fields_changed = True
+                        continue
+            elif sf.datatype.element_type is None:
+                fields_changed = True
+                sf.datatype.element_type = StringType()
+            new_fields.append(sf)
+        else:
+            new_fields.append(sf)
+    if fields_changed:
+        schema.fields = new_fields
+    return schema, fields_changed
+
+
 def merge_json_schema(
     content: typing.Any,
     schema: StructType | None,
+    trace_stack: str,
+    string_nodes_finalized: set[str],
     dropFieldIfAllNull: bool = False,
 ) -> DataType:
+    """
+    Merge the JSON content's schema into an existing schema structure.
+
+    This function recursively processes JSON content (dict, list, or primitive values) and merges
+    its inferred schema with an existing schema if provided. It handles nested structures like
+    objects (StructType) and arrays (ArrayType), and can optionally drop fields that are always null.
+
+    Args:
+        content: The JSON content to infer schema from. Can be a dict, list, primitive value, or None.
+        schema: The existing schema to merge with, or None if inferring from scratch.
+        trace_stack: A string representing the current position in the schema hierarchy,
+            used for tracking/debugging nested structures.
+        string_nodes_finalized: A set of strings representing the nodes that have been finalized as strings.
+        dropFieldIfAllNull: If True, fields that only contain null values will be excluded
+            from the resulting schema. Defaults to False.
+
+    Returns:
+        The merged schema as a DataType. Returns NullType if content is None and no existing
+        schema is provided. For dicts, returns StructType; for lists, returns ArrayType;
+        for primitives, returns the appropriate primitive type (StringType, IntegerType, etc.).
+    """
     if content is None:
         if schema is not None:
             return schema
         return NullType()
 
-    if isinstance(content, str):
-        with suppress(json.JSONDecodeError):
-            json_content = json.loads(content)
-            if not isinstance(json_content, str):
-                content = json_content
+    if trace_stack in string_nodes_finalized:
+        return StringType()
 
     if isinstance(content, dict):
-        additional_schemas = []
+        additional_schemas = list[StructField]()
 
         existed_schema = {}
-        if schema is not None and schema.type_name() == "struct":
-            for sf in schema.fields:
-                existed_schema[sf.name] = sf.datatype
+        if schema is not None:
+            if schema.type_name() == "struct":
+                for sf in schema.fields:
+                    existed_schema[sf.name] = sf.datatype
+            else:
+                string_nodes_finalized.add(trace_stack)
+                return StringType()
 
         for k, v in content.items():
             col_name = f'"{unquote_if_quoted(k)}"'
             existed_data_type = existed_schema.get(col_name, None)
             next_level_schema = merge_json_schema(
-                v, existed_data_type, dropFieldIfAllNull
+                v,
+                existed_data_type,
+                _append_node_in_trace_stack(trace_stack, col_name),
+                string_nodes_finalized,
+                dropFieldIfAllNull,
             )
 
-            if (
-                existed_data_type is not None
-                or not dropFieldIfAllNull
-                or not isinstance(next_level_schema, NullType)
-            ):
+            if not dropFieldIfAllNull or not isinstance(next_level_schema, NullType):
                 # Drop field if it's always null
-                additional_schemas.append(StructField(col_name, next_level_schema))
-                existed_schema.pop(col_name, None)
+                if col_name in existed_schema:
+                    existed_schema[col_name] = next_level_schema
+                else:
+                    additional_schemas.append(StructField(col_name, next_level_schema))
 
-        current_schema = StructType(
-            [StructField(k, v) for k, v in existed_schema.items()] + additional_schemas
-        )
+        current_schema = StructType()
+        if schema is not None and schema.type_name() == "struct":
+            # Keep the order of columns in the schema
+            for sf in schema.fields:
+                col_name = f'"{unquote_if_quoted(sf.name)}"'
+                if (
+                    not dropFieldIfAllNull
+                    or existed_schema.get(col_name, NullType()) != NullType()
+                ):
+                    current_schema.add(
+                        StructField(col_name, existed_schema.get(col_name, NullType()))
+                    )
+
+        for additional_schema in additional_schemas:
+            current_schema.add(additional_schema)
 
     elif isinstance(content, list):
         # ArrayType(*) need to have element schema inside, it would be NullType() as placeholder and keep updating while enumerating
         inner_schema = NullType()
-        if len(content) > 0:
-            for v in content:
-                inner_schema = merge_json_schema(
-                    v, inner_schema, dropFieldIfAllNull
-                )
+        next_level_trace_stack = _append_node_in_trace_stack(trace_stack, "$array")
+
+        if schema is not None:
+            if schema.type_name() in ("list", "array"):
+                inner_schema = schema.element_type
+            else:
+                string_nodes_finalized.add(trace_stack)
+                return StringType()
+
+        if next_level_trace_stack in string_nodes_finalized:
+            inner_schema = StringType()
+        else:
+            if len(content) > 0:
+                for v in content:
+                    inner_schema = merge_json_schema(
+                        v,
+                        inner_schema,
+                        next_level_trace_stack,
+                        string_nodes_finalized,
+                        dropFieldIfAllNull,
+                    )
+                    if isinstance(inner_schema, StringType):
+                        string_nodes_finalized.add(next_level_trace_stack)
+                        break
+        if isinstance(inner_schema, NullType) and dropFieldIfAllNull:
+            return NullType()
         current_schema = ArrayType(inner_schema)
     else:
         current_schema = map_simple_types(type(content).__name__)
 
-    # If there's conflict , use StringType
     if (
         schema is not None
         and schema != NullType()
@@ -184,28 +352,55 @@ def merge_json_schema(
         and current_schema != NullType()
         and schema.type_name() != current_schema.type_name()
     ):
-        current_schema = StringType()
+        current_schema = merge_different_types(schema, current_schema)
+
+    if isinstance(current_schema, StructType) or isinstance(current_schema, ArrayType):
+        current_schema.structured = True
 
-    current_schema.structured = True
+    if isinstance(current_schema, StringType):
+        string_nodes_finalized.add(trace_stack)
     return current_schema
 
 
 def merge_row_schema(
     schema: StructType | None,
     row: Row,
-    columns_with_valid_contents: set,
+    columns_with_valid_contents: set[str],
+    string_nodes_finalized: set[str],
     dropFieldIfAllNull: bool = False,
 ) -> StructType | NullType:
+    """
+    Merge the schema inferred from a single row with the existing schema.
+
+    This function updates the schema by examining each row of data and merging
+    type information. It handles nested structures (StructType, MapType, ArrayType)
+    and attempts to parse JSON strings to infer deeper schema structures.
+
+    Args:
+        schema: The current schema to merge with
+        row: A single row of data to examine
+        columns_with_valid_contents: Set to track columns that have non-null values
+        string_nodes_finalized: Set to track nodes that have been finalized as strings
+        dropFieldIfAllNull: If True, fields that are always null will be dropped
+
+    Returns:
+        The merged schema as a StructType, or NullType if the row is None and no schema exists
+    """
+
     if row is None:
         if schema is not None:
             return schema
         return NullType()
+
     new_schema = StructType()
 
     for sf in schema.fields:
         col_name = unquote_if_quoted(sf.name)
-        if isinstance(sf.datatype, (StructType, MapType, StringType)):
+        if col_name in string_nodes_finalized:
+            columns_with_valid_contents.add(col_name)
+        elif isinstance(sf.datatype, (StructType, MapType, StringType)):
             next_level_content = row[col_name]
+            next_level_trace_stack = _append_node_in_trace_stack(col_name, col_name)
             if next_level_content is not None:
                 with suppress(json.JSONDecodeError):
                     if isinstance(next_level_content, datetime):
@@ -217,10 +412,13 @@ def merge_row_schema(
                         None
                         if not isinstance(sf.datatype, StructType)
                         else sf.datatype,
+                        next_level_trace_stack,
+                        string_nodes_finalized,
                         dropFieldIfAllNull,
                     )
                 else:
                     sf.datatype = StringType()
+                    string_nodes_finalized.add(col_name)
                     columns_with_valid_contents.add(col_name)
 
         elif isinstance(sf.datatype, ArrayType):
@@ -230,43 +428,59 @@ def merge_row_schema(
                     decoded_content = json.loads(content)
                     if isinstance(decoded_content, list):
                         content = decoded_content
-            if not isinstance(content, list):
+            if not isinstance(content, list) or col_name in string_nodes_finalized:
                 sf.datatype = StringType()
+                string_nodes_finalized.add(col_name)
             else:
-                inner_schema = sf.datatype.element_type
-                for v in content:
-                    if v is not None:
-                        columns_with_valid_contents.add(col_name)
-                    inner_schema = merge_json_schema(
-                        v, inner_schema, dropFieldIfAllNull
-                    )
-                sf.datatype.element_type = inner_schema
+                next_level_trace_stack = _append_node_in_trace_stack(
+                    col_name, "array"
+                )
+                if next_level_trace_stack in string_nodes_finalized:
+                    sf.datatype.element_type = StringType()
+                else:
+                    inner_schema = sf.datatype.element_type
+                    for v in content:
+                        if v is not None:
+                            columns_with_valid_contents.add(col_name)
+                        inner_schema = merge_json_schema(
+                            v,
+                            inner_schema,
+                            next_level_trace_stack,
+                            string_nodes_finalized,
+                            dropFieldIfAllNull,
+                        )
+                        if isinstance(inner_schema, StringType):
+                            string_nodes_finalized.add(next_level_trace_stack)
+                            break
+                    sf.datatype.element_type = inner_schema
         elif isinstance(sf.datatype, TimestampType):
             sf.datatype = StringType()
             columns_with_valid_contents.add(col_name)
+            string_nodes_finalized.add(col_name)
         elif row[col_name] is not None:
            columns_with_valid_contents.add(col_name)
 
-        sf.datatype.structured = True
+        if isinstance(sf.datatype, StructType) or isinstance(sf.datatype, ArrayType):
+            sf.datatype.structured = True
         new_schema.add(sf)
 
     return new_schema
 
 
-def union_data_into_df(
-    result_df: snowpark.DataFrame,
-    data: typing.List[Row],
-    schema: StructType,
+def insert_data_chunk(
     session: snowpark.Session,
-) -> snowpark.DataFrame:
-    current_df = session.create_dataframe(
+    data: list[Row],
+    schema: StructType,
+    table_name: str,
+) -> None:
+    df = session.create_dataframe(
         data=data,
         schema=schema,
     )
-    if result_df is None:
-        return current_df
 
-    return result_df.union_all(current_df)
+    df.write.mode("append").save_as_table(
+        table_name, table_type="temp", table_exists=True
+    )
 
 
 def construct_dataframe_by_schema(
@@ -276,39 +490,55 @@ def construct_dataframe_by_schema(
     snowpark_options: dict,
     batch_size: int = 1000,
 ) -> snowpark.DataFrame:
-    result = None
+    table_name = "__sas_json_read_temp_" + uuid.uuid4().hex
+
+    # We can have more workers than CPU count, this is an IO-intensive task
+    max_workers = min(16, os.cpu_count() * 2)
 
     current_data = []
     progress = 0
-    for row in rows:
-        current_data.append(construct_row_by_schema(row, schema, snowpark_options))
-        if len(current_data) >= batch_size:
-            progress += len(current_data)
-            result = union_data_into_df(
-                result,
-                current_data,
-                schema,
-                session,
-            )
-
-            logger.info(f"JSON reader: finished processing {progress} rows")
-            current_data = []
-
-    if len(current_data) > 0:
-        progress += len(current_data)
-        result = union_data_into_df(
-            result,
-            current_data,
-            schema,
-            session,
-        )
-        logger.info(f"JSON reader: finished processing {progress} rows")
-        current_data = []
-
-    if result is None:
-        raise ValueError("Dataframe cannot be empty")
-    return result
+
+    # Initialize the temp table
+    session.create_dataframe([], schema=schema).write.mode("append").save_as_table(
+        table_name, table_type="temp", table_exists=False
+    )
+
+    is_running_in_stored_proc = is_in_stored_procedure()
+
+    # We are having issues in which the read is not giving correct number of rows
+    # in storedprocs when the number of workers are more than 1
+    # as a temporary fix we will make max_workers to 1
+    if is_running_in_stored_proc:
+        max_workers = 1
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as exc:
+        for row in rows:
+            current_data.append(construct_row_by_schema(row, schema, snowpark_options))
+            if len(current_data) >= batch_size:
+                progress += len(current_data)
+                exc.submit(
+                    insert_data_chunk,
+                    session,
+                    copy.deepcopy(current_data),
+                    schema,
+                    table_name,
+                )
+
+                logger.info(f"JSON reader: finished processing {progress} rows")
+                current_data.clear()
+
+        if len(current_data) > 0:
+            progress += len(current_data)
+            exc.submit(
+                insert_data_chunk,
+                session,
+                copy.deepcopy(current_data),
+                schema,
+                table_name,
+            )
+            logger.info(f"JSON reader: finished processing {progress} rows")
+
+    return session.table(table_name)
 
 
 def construct_row_by_schema(
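The rewritten `construct_dataframe_by_schema` above drops the old accumulate-and-`union_all` loop: rows are now batched into a temporary table that worker threads append to concurrently, and the function simply returns `session.table(table_name)`. A standalone sketch of that producer/worker pattern with a stubbed sink in place of `insert_data_chunk` (the names here are stand-ins, not the package's API):

```python
import concurrent.futures
import copy
import os

def write_chunk(chunk: list) -> None:
    # Stand-in for insert_data_chunk: the real code appends each chunk to a
    # shared temp table via df.write.mode("append").save_as_table(...).
    print(f"wrote {len(chunk)} rows")

def load_in_batches(rows, batch_size: int = 1000) -> None:
    # IO-bound work, so the diff allows more workers than CPU cores
    # (and pins max_workers to 1 inside stored procedures).
    max_workers = min(16, (os.cpu_count() or 1) * 2)
    current: list = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as exc:
        for row in rows:
            current.append(row)
            if len(current) >= batch_size:
                # Copy before clear(): the worker must not observe the list
                # being reused by this producer loop.
                exc.submit(write_chunk, copy.deepcopy(current))
                current.clear()
        if current:
            exc.submit(write_chunk, copy.deepcopy(current))
    # Leaving the "with" block joins all pending writes before returning.

load_in_batches(range(2500), batch_size=1000)
```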
@@ -342,17 +572,22 @@ def construct_row_by_schema(
                 content.get(col_name, None), sf.datatype, snowpark_options
             )
         else:
-            raise SnowparkConnectNotImplementedError(
+            exception = SnowparkConnectNotImplementedError(
                 f"JSON construct {str(content)} to StructType failed"
             )
+            attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+            raise exception
         return result
     elif isinstance(schema, ArrayType):
         result = []
         inner_schema = schema.element_type
         if isinstance(content, str):
             content = json.loads(content)
-        for ele in content:
-            result.append(construct_row_by_schema(ele, inner_schema, snowpark_options))
+        if inner_schema is not None:
+            for ele in content:
+                result.append(
+                    construct_row_by_schema(ele, inner_schema, snowpark_options)
+                )
         return result
     elif isinstance(schema, DateType):
         return cast_to_match_snowpark_type(