snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +680 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +237 -23
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +123 -5
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +85 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
- snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
- snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +110 -48
- snowflake/snowpark_connect/server.py +546 -456
- snowflake/snowpark_connect/server_common/__init__.py +500 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +187 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +163 -22
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/read/map_read_csv.py (+255 -45)

@@ -3,19 +3,30 @@
 #
 
 import copy
+from typing import Any
 
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
+from pyspark.errors.exceptions.base import AnalysisException
 
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
 from snowflake.snowpark.dataframe_reader import DataFrameReader
 from snowflake.snowpark.types import StringType, StructField, StructType
+from snowflake.snowpark_connect.config import global_config, str_to_bool
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.map_read import CsvReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+    get_non_metadata_fields,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
+from snowflake.snowpark_connect.utils.io_utils import cached_file_format
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
@@ -37,33 +48,68 @@ def map_read_csv(
 
     if rel.read.is_streaming is True:
         # TODO: Structured streaming implementation.
-
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for CSV files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
     else:
-
+        converted_snowpark_options = options.convert_to_snowpark_args()
+        parse_header = converted_snowpark_options.get("PARSE_HEADER", False)
+        file_format_options = _parse_csv_snowpark_options(converted_snowpark_options)
+        file_format = cached_file_format(session, "csv", file_format_options)
+
+        snowpark_reader_options = dict()
+        snowpark_reader_options["FORMAT_NAME"] = file_format
+        snowpark_reader_options["ENFORCE_EXISTING_FILE_FORMAT"] = True
+        snowpark_reader_options["INFER_SCHEMA"] = converted_snowpark_options.get(
+            "INFER_SCHEMA", False
+        )
+        snowpark_reader_options[
+            "INFER_SCHEMA_OPTIONS"
+        ] = converted_snowpark_options.get("INFER_SCHEMA_OPTIONS", {})
+
+        # Use Try_cast to avoid schema inference errors
+        if snowpark_reader_options.get("INFER_SCHEMA", False):
+            snowpark_reader_options["TRY_CAST"] = True
+
+        apply_metadata_exclusion_pattern(converted_snowpark_options)
+        snowpark_reader_options["PATTERN"] = converted_snowpark_options.get(
+            "PATTERN", None
+        )
+
         raw_options = rel.read.data_source.options
+
         if schema is None or (
-
-            and raw_options.get("enforceSchema", "True").lower() == "false"
+            parse_header
+            and str(raw_options.get("enforceSchema", "True")).lower() == "false"
         ):  # Schema has to equals to header's format
-            reader =
+            reader = add_filename_metadata_to_reader(
+                session.read.options(snowpark_reader_options), raw_options
+            )
         else:
-            reader =
+            reader = add_filename_metadata_to_reader(
+                session.read.options(snowpark_reader_options).schema(schema),
+                raw_options,
+            )
         df = read_data(
             reader,
             schema,
             session,
             paths[0],
-
+            file_format_options,
+            snowpark_reader_options,
             raw_options,
+            parse_header,
         )
         if len(paths) > 1:
             # TODO: figure out if this is what Spark does.
             for p in paths[1:]:
                 df = df.union_all(reader.csv(p))
 
-        if schema is None
+        if schema is None and not str_to_bool(
+            str(raw_options.get("inferSchema", "false"))
+        ):
             df = df.select(
                 [snowpark_fn.col(c).cast("STRING").alias(c) for c in df.schema.names]
             )
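The hunk above replaces the old ad-hoc reader setup with an explicit option-translation step: Spark-style options are converted via `options.convert_to_snowpark_args()`, filtered into a cached CSV file format, and reassembled as Snowpark reader options (`FORMAT_NAME`, `INFER_SCHEMA`, `TRY_CAST`, `PATTERN`). As a rough sketch of the client-side read that would exercise this branch (the connect URL and path are placeholders, and the exact Spark-to-Snowflake option mapping is inferred from the code above, not from package docs):

```python
# Hypothetical Spark Connect client call; "sc://localhost:15002" and the path are assumptions.
# Options such as "header", "inferSchema", and "enforceSchema" arrive as raw_options on the
# server side, where the branch above turns them into Snowflake CSV reader options.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

df = (
    spark.read.option("header", "true")
    .option("inferSchema", "false")
    .option("enforceSchema", "false")
    .csv("/tmp/events/")
)
df.show()
```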
@@ -81,62 +127,226 @@ def map_read_csv(
     )
 
 
+_csv_file_format_allowed_options = {
+    "COMPRESSION",
+    "RECORD_DELIMITER",
+    "FIELD_DELIMITER",
+    "MULTI_LINE",
+    "FILE_EXTENSION",
+    "PARSE_HEADER",
+    "SKIP_HEADER",
+    "SKIP_BLANK_LINES",
+    "DATE_FORMAT",
+    "TIME_FORMAT",
+    "TIMESTAMP_FORMAT",
+    "BINARY_FORMAT",
+    "ESCAPE",
+    "ESCAPE_UNENCLOSED_FIELD",
+    "TRIM_SPACE",
+    "FIELD_OPTIONALLY_ENCLOSED_BY",
+    "NULL_IF",
+    "ERROR_ON_COLUMN_COUNT_MISMATCH",
+    "REPLACE_INVALID_CHARACTERS",
+    "EMPTY_FIELD_AS_NULL",
+    "SKIP_BYTE_ORDER_MARK",
+    "ENCODING",
+}
+
+
+def _parse_csv_snowpark_options(snowpark_options: dict[str, Any]) -> dict[str, Any]:
+    file_format_options = dict()
+    for key, value in snowpark_options.items():
+        upper_key = key.upper()
+        if upper_key in _csv_file_format_allowed_options:
+            file_format_options[upper_key] = value
+
+    # This option has to be removed, because we cannot use at the same time predefined file format and parse_header option
+    # Such combination causes snowpark to raise SQL compilation error: Invalid file format "PARSE_HEADER" is only allowed for CSV INFER_SCHEMA and MATCH_BY_COLUMN_NAME
+    parse_header = file_format_options.get("PARSE_HEADER", False)
+    if parse_header:
+        file_format_options["SKIP_HEADER"] = 1
+        del file_format_options["PARSE_HEADER"]
+
+    return file_format_options
+
+
+def _deduplicate_column_names_pyspark_style(
+    column_names: list[str], case_sensitive: bool
+) -> list[str]:
+    """
+    Deduplicate column names following PySpark's behavior in CSVUtils.scala::makeSafeHeader by appending
+    global position index to all occurrences of duplicated names.
+
+    Examples with case_sensitive=False:
+        ['ab', 'AB'] -> ['ab0', 'AB1']
+        ['ab', 'ab'] -> ['ab0', 'ab1']
+        ['a', 'b', 'A', 'c', 'B'] -> ['a0', 'b1', 'A2', 'c', 'B4'] (positions: a=0,2; b=1,4; c=3)
+
+    Examples with case_sensitive=True:
+        ['ab', 'AB'] -> ['ab', 'AB'] (no duplicates, different case)
+        ['ab', 'ab'] -> ['ab0', 'ab1'] (exact duplicates at positions 0, 1)
+        ['a', 'b', 'A', 'c', 'B'] -> ['a', 'b', 'A', 'c', 'B'] (no duplicates)
+
+    Edge cases:
+        ['a0', 'a0'] -> ['a00', 'a01'] (appends position even if name already has digits)
+        ['a', '', 'b'] -> ['a', '_c1', 'b'] (empty names become _c<position>)
+    """
+    seen = set()
+    duplicates = set()
+
+    for name in column_names:
+        # filter out nulls and apply case transformation
+        if not name:
+            continue
+        key = name if case_sensitive else name.lower()
+        if key in seen:
+            duplicates.add(key)
+        else:
+            seen.add(key)
+
+    result = []
+    for index, value in enumerate(column_names):
+        # Empty/null, append _c<index>
+        if value is None or value == "":
+            result.append(f"_c{index}")
+        # Case-insensitive duplicate, append index
+        elif not case_sensitive and value.lower() in duplicates:
+            result.append(f"{value}{index}")
+        # Case-sensitive duplicate, append index
+        elif case_sensitive and value in duplicates:
+            result.append(f"{value}{index}")
+        else:
+            result.append(value)
+
+    return result
+
+
 def get_header_names(
     session: snowpark.Session,
     path: list[str],
-
+    file_format_options: dict,
+    snowpark_read_options: dict,
+    raw_options: dict,
+    parse_header: bool,
 ) -> list[str]:
-
-
-
-
-
-
-
-
-
+    no_header_file_format_options = copy.copy(file_format_options)
+    no_header_file_format_options["PARSE_HEADER"] = False
+    no_header_file_format_options.pop("SKIP_HEADER", None)
+
+    file_format = cached_file_format(session, "csv", no_header_file_format_options)
+    no_header_snowpark_read_options = copy.copy(snowpark_read_options)
+    no_header_snowpark_read_options["FORMAT_NAME"] = file_format
+    no_header_snowpark_read_options.pop("INFER_SCHEMA", None)
+
+    # If we don't set this, snowpark will try to infer the schema for all rows in the csv file.
+    # Since there's no easy way to just read the header from the csv, we use this approach where we force the df reader to infer the schema for 10 rows and
+    # and we are only interested in the first row to get the header names and discard the inferred schema.
+    no_header_snowpark_read_options["INFER_SCHEMA_OPTIONS"] = {
+        "MAX_RECORDS_PER_FILE": 1,
+    }
+
+    header_df = session.read.options(no_header_snowpark_read_options).csv(path).limit(1)
+    collected_data = header_df.collect()
+
+    if len(collected_data) == 0:
+        error_msg = f"Path does not exist or contains no data: {path}"
+        user_pattern = raw_options.get("pathGlobFilter", None)
+        if user_pattern:
+            error_msg += f" (with pathGlobFilter: {user_pattern})"
+
+        exception = AnalysisException(error_msg)
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
+
+    header_data = collected_data[0]
+    num_columns = len(header_df.schema.fields)
+
+    if not parse_header:
+        # parse_header=False, use default _c0, _c1, _c2... naming for columns
+        return [f'"_c{i}"' for i in range(num_columns)]
+
+    # parse_header=True: Read first row as column names and deduplicate
+    raw_column_names = [
+        header_data[i] if header_data[i] is not None else "" for i in range(num_columns)
     ]
 
+    case_sensitive = global_config.spark_sql_caseSensitive
+    deduplicated_names = _deduplicate_column_names_pyspark_style(
+        raw_column_names, case_sensitive
+    )
+
+    return [f'"{name}"' for name in deduplicated_names]
+
 
 def read_data(
     reader: DataFrameReader,
     schema: snowpark.types.StructType | None,
     session: snowpark.Session,
     path: list[str],
-
+    file_format_options: dict,
+    snowpark_read_options: dict,
     raw_options: dict,
+    parse_header: bool,
 ) -> snowpark.DataFrame:
-    df = reader.csv(path)
     filename = path.strip("/").split("/")[-1]
+
     if schema is not None:
-
-
-        if
+        df = reader.csv(path)
+        non_metadata_fields = get_non_metadata_fields(df.schema.fields)
+        if len(schema.fields) != len(non_metadata_fields):
+            exception = Exception(f"csv load from {filename} failed.")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_CAST)
+            raise exception
+        if str(raw_options.get("enforceSchema", "True")).lower() == "false":
             for i in range(len(schema.fields)):
                 if (
-                    schema.fields[i].name !=
-                    and f'"{schema.fields[i].name}"' !=
+                    schema.fields[i].name != non_metadata_fields[i].name
+                    and f'"{schema.fields[i].name}"' != non_metadata_fields[i].name
                 ):
-
+                    exception = Exception("CSV header does not conform to the schema")
+                    attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+                    raise exception
         return df
 
-    headers = get_header_names(
-
-
-
-
-
-
-
-
-
-
-
-    .
-
-
-
-
-
+    headers = get_header_names(
+        session,
+        path,
+        file_format_options,
+        snowpark_read_options,
+        raw_options,
+        parse_header,
+    )
+
+    # Create schema with the column names and read CSV
+    if len(headers) > 0:
+        if (
+            not str_to_bool(str(raw_options.get("inferSchema", "false")))
+            and schema is None
+        ):
+            inferred_schema = StructType(
+                [StructField(h, StringType(), True) for h in headers]
+            )
+            df = reader.schema(inferred_schema).csv(path)
+        else:
+            df = reader.csv(path)
+        non_metadata_fields = get_non_metadata_fields(df.schema.fields)
+        if len(non_metadata_fields) != len(headers):
+            exception = Exception(
+                f"CSV header: {headers} does not conform to the schema"
+            )
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
+        if any(
+            non_metadata_fields[i].name != headers[i]
+            for i in range(len(non_metadata_fields))
+        ):
+            df = df.select(
+                [
+                    snowpark_fn.col(non_metadata_fields[i].name).alias(headers[i])
+                    for i in range(len(non_metadata_fields))
+                ]
+            )
+        return df
 
-
+    # Fallback: no headers, shouldn't reach here
+    return reader.csv(path)
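The two new helpers above are pure functions, so their behavior is easiest to see with literal inputs. A short illustrative sketch follows; the expected values for `_deduplicate_column_names_pyspark_style` come straight from its docstring, and the `_parse_csv_snowpark_options` result follows from the whitelist and the PARSE_HEADER-to-SKIP_HEADER rewrite shown above:

```python
# Illustrative only: exercising the new module-private helpers directly.
from snowflake.snowpark_connect.relation.read.map_read_csv import (
    _deduplicate_column_names_pyspark_style,
    _parse_csv_snowpark_options,
)

# Duplicated names get their global position appended; empty names become _c<position>.
assert _deduplicate_column_names_pyspark_style(["ab", "AB"], case_sensitive=False) == ["ab0", "AB1"]
assert _deduplicate_column_names_pyspark_style(["a", "", "b"], case_sensitive=True) == ["a", "_c1", "b"]

# Keys are upper-cased, only whitelisted CSV file-format options are kept, and
# PARSE_HEADER is rewritten to SKIP_HEADER=1 so it can coexist with a predefined file format.
assert _parse_csv_snowpark_options({"parse_header": True, "field_delimiter": "|", "foo": 1}) == {
    "FIELD_DELIMITER": "|",
    "SKIP_HEADER": 1,
}
```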
snowflake/snowpark_connect/relation/read/map_read_jdbc.py (+17 -5)

@@ -9,6 +9,8 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.jdbc_read_dbapi import JdbcDataFrameReader
 from snowflake.snowpark_connect.relation.read.utils import (
     Connection,
@@ -28,7 +30,9 @@ def create_connection(jdbc_options: dict[str, str]) -> Connection:
         return jaydebeapi.connect(driver, url, jdbc_options)
     except Exception as e:
         jpype.detachThreadFromJVM()
-
+        exception = Exception(f"Error connecting JDBC datasource: {e}")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
 
 
 def close_connection(conn: Connection) -> None:
@@ -70,17 +74,23 @@ def map_read_jdbc(
         dbtable = None
 
     if not dbtable and not query:
-
+        exception = ValueError("Include dbtable or query is required option")
+        attach_custom_error_code(exception, ErrorCodes.INSUFFICIENT_INPUT)
+        raise exception
 
     if query is not None and dbtable is not None:
-
+        exception = ValueError(
             "Not allowed to specify dbtable and query options at the same time"
         )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
 
     if query is not None and partition_column is not None:
-
+        exception = ValueError(
             "Not allowed to specify partitionColumn and query options at the same time"
        )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
 
     try:
         df = JdbcDataFrameReader(session, jdbc_options).jdbc_read_dbapi(
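These checks match Spark's JDBC source contract (`dbtable` or `query`, never both, and `partitionColumn` not combined with `query`); the change is that violations now carry an error code. A hedged sketch of reads that would pass or trip them, with the connect URL, JDBC URL, and driver as assumptions:

```python
# Hypothetical client-side reads; the endpoint and connection details are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

def jdbc_reader():
    # Fresh reader per read so options from one example do not leak into the next.
    return (
        spark.read.format("jdbc")
        .option("url", "jdbc:postgresql://host:5432/db")
        .option("driver", "org.postgresql.Driver")
    )

ok_table = jdbc_reader().option("dbtable", "public.orders").load()               # allowed
ok_query = jdbc_reader().option("query", "SELECT id FROM public.orders").load()  # allowed

# "dbtable" together with "query", or "query" together with "partitionColumn", now raises
# ValueError with INVALID_INPUT attached; omitting both raises with INSUFFICIENT_INPUT.
```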
@@ -105,4 +115,6 @@ def map_read_jdbc(
             snowpark_column_types=[f.datatype for f in df.schema.fields],
         )
     except Exception as e:
-
+        exception = Exception(f"Error accessing JDBC datasource for read: {e}")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
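Taken together, the recurring change across the readers shown here is a three-step error pattern: construct the exception, attach a machine-readable code, then raise. A minimal sketch of that pattern, with the actual connection work replaced by a placeholder callable:

```python
# Minimal sketch of the error-code pattern this release introduces; `do_connect` is a
# stand-in for the real JDBC work, not an API from the package.
from snowflake.snowpark_connect.error.error_codes import ErrorCodes
from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code


def connect_with_error_code(do_connect):
    try:
        return do_connect()
    except Exception as e:
        # Same three steps as the hunks above: build, attach a code, raise.
        exception = Exception(f"Error connecting JDBC datasource: {e}")
        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
        raise exception
```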