snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +717 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +309 -26
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/error_utils.py +28 -0
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +224 -15
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +86 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
- snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
- snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +171 -48
- snowflake/snowpark_connect/server.py +528 -473
- snowflake/snowpark_connect/server_common/__init__.py +503 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/type_support.py +130 -0
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +195 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +192 -40
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/read/map_read_json.py

@@ -2,9 +2,12 @@
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #

+import concurrent.futures
 import copy
 import json
+import os
 import typing
+import uuid
 from contextlib import suppress
 from datetime import datetime

@@ -12,6 +15,7 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto

 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
+from snowflake.snowpark._internal.utils import is_in_stored_procedure
 from snowflake.snowpark.row import Row
 from snowflake.snowpark.types import (
     ArrayType,
@@ -25,21 +29,33 @@ from snowflake.snowpark.types import (
     TimestampType,
 )
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.map_read import JsonReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
 from snowflake.snowpark_connect.type_mapping import (
     cast_to_match_snowpark_type,
     map_simple_types,
+    merge_different_types,
 )
+from snowflake.snowpark_connect.type_support import emulate_integral_types
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )


+def _append_node_in_trace_stack(trace_stack: str, node: str) -> str:
+    return f"{trace_stack}:{node}"
+
+
 def map_read_json(
     rel: relation_proto.Relation,
     schema: StructType | None,
@@ -58,30 +74,42 @@ def map_read_json(

     if rel.read.is_streaming is True:
         # TODO: Structured streaming implementation.
-
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for JSON files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
     else:
         snowpark_options = options.convert_to_snowpark_args()
+        raw_options = rel.read.data_source.options
         snowpark_options["infer_schema"] = True

         rows_to_infer_schema = snowpark_options.pop("rowstoinferschema", 1000)
         dropFieldIfAllNull = snowpark_options.pop("dropfieldifallnull", False)
         batch_size = snowpark_options.pop("batchsize", 1000)

-
+        apply_metadata_exclusion_pattern(snowpark_options)
+
+        reader = add_filename_metadata_to_reader(
+            session.read.options(snowpark_options), raw_options
+        )

         df = reader.json(paths[0])
         if len(paths) > 1:
             # TODO: figure out if this is what Spark does.
             for p in paths[1:]:
-                df = df.union_all(
+                df = df.union_all(
+                    add_filename_metadata_to_reader(
+                        session.read.options(snowpark_options), raw_options
+                    ).json(p)
+                )

         if schema is None:
             schema = copy.deepcopy(df.schema)
             infer_row_counts = 0

             columns_with_valid_contents = set()
+            string_nodes_finalized = set[str]()
             for row in df.to_local_iterator():
                 infer_row_counts += 1
                 if (
@@ -90,7 +118,11 @@
                 ):
                     break
                 schema = merge_row_schema(
-                    schema,
+                    schema,
+                    row,
+                    columns_with_valid_contents,
+                    string_nodes_finalized,
+                    dropFieldIfAllNull,
                 )

             if dropFieldIfAllNull:
@@ -100,6 +132,10 @@
                     if unquote_if_quoted(sf.name) in columns_with_valid_contents
                 ]

+        new_schema, fields_changed = validate_and_update_schema(schema)
+        if fields_changed:
+            schema = new_schema
+
         df = construct_dataframe_by_schema(
             schema, df.to_local_iterator(), session, snowpark_options, batch_size
         )
@@ -113,70 +149,205 @@ def map_read_json(
         dataframe=renamed_df,
         spark_column_names=spark_column_names,
         snowpark_column_names=snowpark_column_names,
-        snowpark_column_types=[
+        snowpark_column_types=[
+            emulate_integral_types(f.datatype) for f in df.schema.fields
+        ],
     )


+def should_drop_field(field: StructField) -> bool:
+    if isinstance(field.datatype, StructType):
+        # "a" : {} => drop the field
+        if len(field.datatype.fields) == 0:
+            return True
+    elif (
+        isinstance(field.datatype, ArrayType)
+        and field.datatype.element_type is not None
+        and isinstance(field.datatype.element_type, StructType)
+    ):
+        if len(field.datatype.element_type.fields) == 0:
+            # "a" : [{}] => drop the field
+            return True
+    return False
+
+
+# Validate the schema to ensure it is valid for Snowflake
+# Handles these cases:
+# 1. Drops StructField([])
+# 2. Drops ArrayType(StructType([]))
+# 3. ArrayType() -> ArrayType(StringType())
+def validate_and_update_schema(schema: StructType | None) -> (StructType | None, bool):
+    if not isinstance(schema, StructType):
+        return schema, False
+    new_fields = []
+    fields_changed = False
+    for sf in schema.fields:
+        if should_drop_field(sf):
+            fields_changed = True
+            continue
+        if isinstance(sf.datatype, StructType):
+            # If the schema is a struct, validate the child schema
+            if len(sf.datatype.fields) == 0:
+                # No fields in the struct, drop the field
+                fields_changed = True
+                continue
+            child_field = StructField(sf.name, sf.datatype, sf.nullable)
+            # Recursively validate the child schema
+            child_field.datatype, child_field_changes = validate_and_update_schema(
+                sf.datatype
+            )
+            if should_drop_field(child_field):
+                fields_changed = True
+                continue
+            new_fields.append(child_field)
+            fields_changed = fields_changed or child_field_changes
+        elif isinstance(sf.datatype, ArrayType):
+            # If the schema is an array, validate the element schema
+            if sf.datatype.element_type is not None and isinstance(
+                sf.datatype.element_type, StructType
+            ):
+                # If the element schema is a struct, validate the element schema
+                if len(sf.datatype.element_type.fields) == 0:
+                    # No fields in the struct, drop the field
+                    fields_changed = True
+                    continue
+                else:
+                    # Recursively validate the element schema
+                    element_schema, element_field_changes = validate_and_update_schema(
+                        sf.datatype.element_type
+                    )
+                    if element_field_changes:
+                        sf.datatype.element_type = element_schema
+                        fields_changed = True
+                    if should_drop_field(sf):
+                        fields_changed = True
+                        continue
+            elif sf.datatype.element_type is None:
+                fields_changed = True
+                sf.datatype.element_type = StringType()
+            new_fields.append(sf)
+        else:
+            new_fields.append(sf)
+    if fields_changed:
+        schema.fields = new_fields
+    return schema, fields_changed
+
+
 def merge_json_schema(
     content: typing.Any,
     schema: StructType | None,
+    trace_stack: str,
+    string_nodes_finalized: set[str],
     dropFieldIfAllNull: bool = False,
 ) -> DataType:
+    """
+    Merge the JSON content's schema into an existing schema structure.
+
+    This function recursively processes JSON content (dict, list, or primitive values) and merges
+    its inferred schema with an existing schema if provided. It handles nested structures like
+    objects (StructType) and arrays (ArrayType), and can optionally drop fields that are always null.
+
+    Args:
+        content: The JSON content to infer schema from. Can be a dict, list, primitive value, or None.
+        schema: The existing schema to merge with, or None if inferring from scratch.
+        trace_stack: A string representing the current position in the schema hierarchy,
+            used for tracking/debugging nested structures.
+        string_nodes_finalized: A set of strings representing the nodes that have been finalized as strings.
+        dropFieldIfAllNull: If True, fields that only contain null values will be excluded
+            from the resulting schema. Defaults to False.
+
+    Returns:
+        The merged schema as a DataType. Returns NullType if content is None and no existing
+        schema is provided. For dicts, returns StructType; for lists, returns ArrayType;
+        for primitives, returns the appropriate primitive type (StringType, IntegerType, etc.).
+    """
     if content is None:
         if schema is not None:
             return schema
         return NullType()

-    if
-
-        json_content = json.loads(content)
-        if not isinstance(json_content, str):
-            content = json_content
+    if trace_stack in string_nodes_finalized:
+        return StringType()

     if isinstance(content, dict):
-
+        additional_schemas = list[StructField]()

         existed_schema = {}
-        if schema is not None
-
-
+        if schema is not None:
+            if schema.type_name() == "struct":
+                for sf in schema.fields:
+                    existed_schema[sf.name] = sf.datatype
+            else:
+                string_nodes_finalized.add(trace_stack)
+                return StringType()

         for k, v in content.items():
             col_name = f'"{unquote_if_quoted(k)}"'
             existed_data_type = existed_schema.get(col_name, None)
             next_level_schema = merge_json_schema(
-                v,
+                v,
+                existed_data_type,
+                _append_node_in_trace_stack(trace_stack, col_name),
+                string_nodes_finalized,
+                dropFieldIfAllNull,
             )

-            if (
-                existed_data_type is not None
-                or not dropFieldIfAllNull
-                or not isinstance(next_level_schema, NullType)
-            ):
+            if not dropFieldIfAllNull or not isinstance(next_level_schema, NullType):
                 # Drop field if it's always null
-
-
-
+                if col_name in existed_schema:
+                    existed_schema[col_name] = next_level_schema
+                else:
+                    additional_schemas.append(StructField(col_name, next_level_schema))

-
-
-
+        current_schema = StructType()
+        if schema is not None and schema.type_name() == "struct":
+            # Keep the order of columns in the schema
+            for sf in schema.fields:
+                col_name = f'"{unquote_if_quoted(sf.name)}"'
+                if (
+                    not dropFieldIfAllNull
+                    or existed_schema.get(col_name, NullType()) != NullType()
+                ):
+                    current_schema.add(
+                        StructField(col_name, existed_schema.get(col_name, NullType()))
+                    )
+
+        for additional_schema in additional_schemas:
+            current_schema.add(additional_schema)

     elif isinstance(content, list):
         # ArrayType(*) need to have element schema inside, it would be NullType() as placeholder and keep updating while enumerating
         inner_schema = NullType()
-
-
-        if
-
-            inner_schema =
-
-
+        next_level_trace_stack = _append_node_in_trace_stack(trace_stack, "$array")
+
+        if schema is not None:
+            if schema.type_name() in ("list", "array"):
+                inner_schema = schema.element_type
+            else:
+                string_nodes_finalized.add(trace_stack)
+                return StringType()
+
+        if next_level_trace_stack in string_nodes_finalized:
+            inner_schema = StringType()
+        else:
+            if len(content) > 0:
+                for v in content:
+                    inner_schema = merge_json_schema(
+                        v,
+                        inner_schema,
+                        next_level_trace_stack,
+                        string_nodes_finalized,
+                        dropFieldIfAllNull,
+                    )
+                    if isinstance(inner_schema, StringType):
+                        string_nodes_finalized.add(next_level_trace_stack)
+                        break
+        if isinstance(inner_schema, NullType) and dropFieldIfAllNull:
+            return NullType()
         current_schema = ArrayType(inner_schema)
     else:
         current_schema = map_simple_types(type(content).__name__)

-    # If there's conflict , use StringType
     if (
         schema is not None
         and schema != NullType()
@@ -184,28 +355,55 @@ def merge_json_schema(
         and current_schema != NullType()
         and schema.type_name() != current_schema.type_name()
     ):
-
+        current_schema = merge_different_types(schema, current_schema)
+
+    if isinstance(current_schema, StructType) or isinstance(current_schema, ArrayType):
+        current_schema.structured = True

-    current_schema
+    if isinstance(current_schema, StringType):
+        string_nodes_finalized.add(trace_stack)
     return current_schema


 def merge_row_schema(
     schema: StructType | None,
     row: Row,
-    columns_with_valid_contents: set,
+    columns_with_valid_contents: set[str],
+    string_nodes_finalized: set[str],
     dropFieldIfAllNull: bool = False,
 ) -> StructType | NullType:
+    """
+    Merge the schema inferred from a single row with the existing schema.
+
+    This function updates the schema by examining each row of data and merging
+    type information. It handles nested structures (StructType, MapType, ArrayType)
+    and attempts to parse JSON strings to infer deeper schema structures.
+
+    Args:
+        schema: The current schema to merge with
+        row: A single row of data to examine
+        columns_with_valid_contents: Set to track columns that have non-null values
+        string_nodes_finalized: Set to track nodes that have been finalized as strings
+        dropFieldIfAllNull: If True, fields that are always null will be dropped
+
+    Returns:
+        The merged schema as a StructType, or NullType if the row is None and no schema exists
+    """
+
     if row is None:
         if schema is not None:
             return schema
         return NullType()
+
     new_schema = StructType()

     for sf in schema.fields:
         col_name = unquote_if_quoted(sf.name)
-        if
+        if col_name in string_nodes_finalized:
+            columns_with_valid_contents.add(col_name)
+        elif isinstance(sf.datatype, (StructType, MapType, StringType)):
             next_level_content = row[col_name]
+            next_level_trace_stack = _append_node_in_trace_stack(col_name, col_name)
             if next_level_content is not None:
                 with suppress(json.JSONDecodeError):
                     if isinstance(next_level_content, datetime):
@@ -217,10 +415,13 @@ def merge_row_schema(
                         None
                         if not isinstance(sf.datatype, StructType)
                         else sf.datatype,
+                        next_level_trace_stack,
+                        string_nodes_finalized,
                         dropFieldIfAllNull,
                     )
             else:
                 sf.datatype = StringType()
+                string_nodes_finalized.add(col_name)
             columns_with_valid_contents.add(col_name)

         elif isinstance(sf.datatype, ArrayType):
@@ -230,43 +431,59 @@
                     decoded_content = json.loads(content)
                     if isinstance(decoded_content, list):
                         content = decoded_content
-            if not isinstance(content, list):
+            if not isinstance(content, list) or col_name in string_nodes_finalized:
                 sf.datatype = StringType()
+                string_nodes_finalized.add(col_name)
             else:
-
-
-
-
-
-
-
-
+                next_level_trace_stack = _append_node_in_trace_stack(
+                    col_name, "array"
+                )
+                if next_level_trace_stack in string_nodes_finalized:
+                    sf.datatype.element_type = StringType()
+                else:
+                    inner_schema = sf.datatype.element_type
+                    for v in content:
+                        if v is not None:
+                            columns_with_valid_contents.add(col_name)
+                        inner_schema = merge_json_schema(
+                            v,
+                            inner_schema,
+                            next_level_trace_stack,
+                            string_nodes_finalized,
+                            dropFieldIfAllNull,
+                        )
+                        if isinstance(inner_schema, StringType):
+                            string_nodes_finalized.add(next_level_trace_stack)
+                            break
+                    sf.datatype.element_type = inner_schema
         elif isinstance(sf.datatype, TimestampType):
             sf.datatype = StringType()
             columns_with_valid_contents.add(col_name)
+            string_nodes_finalized.add(col_name)
         elif row[col_name] is not None:
             columns_with_valid_contents.add(col_name)

-        sf.datatype.
+        if isinstance(sf.datatype, StructType) or isinstance(sf.datatype, ArrayType):
+            sf.datatype.structured = True
         new_schema.add(sf)

-    return
+    return new_schema


-def
-    result_df: snowpark.DataFrame,
-    data: typing.List[Row],
-    schema: StructType,
+def insert_data_chunk(
     session: snowpark.Session,
-
-
+    data: list[Row],
+    schema: StructType,
+    table_name: str,
+) -> None:
+    df = session.create_dataframe(
         data=data,
         schema=schema,
     )
-    if result_df is None:
-        return current_df

-
+    df.write.mode("append").save_as_table(
+        table_name, table_type="temp", table_exists=True
+    )


 def construct_dataframe_by_schema(
@@ -276,39 +493,55 @@ def construct_dataframe_by_schema(
     snowpark_options: dict,
     batch_size: int = 1000,
 ) -> snowpark.DataFrame:
-
+    table_name = "__sas_json_read_temp_" + uuid.uuid4().hex
+
+    # We can have more workers than CPU count, this is an IO-intensive task
+    max_workers = min(16, os.cpu_count() * 2)

     current_data = []
     progress = 0
-
-
-
+
+    # Initialize the temp table
+    session.create_dataframe([], schema=schema).write.mode("append").save_as_table(
+        table_name, table_type="temp", table_exists=False
+    )
+
+    is_running_in_stored_proc = is_in_stored_procedure()
+
+    # We are having issues in which the read is not giving correct number of rows
+    # in storedprocs when the number of workers are more than 1
+    # as a temporary fix we will make max_workers to 1
+    if is_running_in_stored_proc:
+        max_workers = 1
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as exc:
+        for row in rows:
+            current_data.append(construct_row_by_schema(row, schema, snowpark_options))
+            if len(current_data) >= batch_size:
+                progress += len(current_data)
+                exc.submit(
+                    insert_data_chunk,
+                    session,
+                    copy.deepcopy(current_data),
+                    schema,
+                    table_name,
+                )
+
+                logger.info(f"JSON reader: finished processing {progress} rows")
+                current_data.clear()
+
+        if len(current_data) > 0:
             progress += len(current_data)
-
-
-            current_data,
-            schema,
+            exc.submit(
+                insert_data_chunk,
                 session,
+                copy.deepcopy(current_data),
+                schema,
+                table_name,
             )
-
             logger.info(f"JSON reader: finished processing {progress} rows")
-            current_data = []
-
-    if len(current_data) > 0:
-        progress += len(current_data)
-        result = union_data_into_df(
-            result,
-            current_data,
-            schema,
-            session,
-        )

-
-        current_data = []
-
-    if result is None:
-        raise ValueError("Dataframe cannot be empty")
-    return result
+    return session.table(table_name)


 def construct_row_by_schema(
@@ -342,17 +575,22 @@ def construct_row_by_schema(
                 content.get(col_name, None), sf.datatype, snowpark_options
             )
         else:
-
+            exception = SnowparkConnectNotImplementedError(
                 f"JSON construct {str(content)} to StructType failed"
             )
+            attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+            raise exception
         return result
     elif isinstance(schema, ArrayType):
         result = []
         inner_schema = schema.element_type
         if isinstance(content, str):
             content = json.loads(content)
-
-
+        if inner_schema is not None:
+            for ele in content:
+                result.append(
+                    construct_row_by_schema(ele, inner_schema, snowpark_options)
+                )
         return result
     elif isinstance(schema, DateType):
         return cast_to_match_snowpark_type(