snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +717 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +309 -26
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/error_utils.py +28 -0
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +224 -15
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +86 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
- snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
- snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +171 -48
- snowflake/snowpark_connect/server.py +528 -473
- snowflake/snowpark_connect/server_common/__init__.py +503 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/type_support.py +130 -0
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +195 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +192 -40
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,7 @@ import ast
import numpy as np
import pandas
import pyspark.sql.connect.proto.relations_pb2 as relation_proto
- from pyspark.errors.exceptions.base import AnalysisException
+ from pyspark.errors.exceptions.base import AnalysisException, IllegalArgumentException

import snowflake.snowpark.functions as fn
import snowflake.snowpark.types as snowpark_types
@@ -15,6 +15,9 @@ from snowflake import snowpark
from snowflake.snowpark.exceptions import SnowparkSQLException
from snowflake.snowpark_connect.config import get_boolean_session_config_param
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
+ from snowflake.snowpark_connect.includes.python.pyspark.sql.types import StructField
from snowflake.snowpark_connect.relation.map_relation import map_relation
from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
@@ -52,7 +55,7 @@ def map_cov(
"""
Find the covariance of two columns in the input DataFrame.

- Returns a pandas DataFrame because the
+ Returns a pandas DataFrame because the covariance of two columns produces
a scalar value.
"""
input_container = map_relation(rel.cov.input)
@@ -64,6 +67,16 @@ def map_cov(
col2 = input_container.column_map.get_snowpark_column_name_from_spark_column_name(
rel.cov.col2
)
+
+ col1_type = next(
+ field.datatype for field in input_df.schema.fields if field.name == col1
+ )
+ col2_type = next(
+ field.datatype for field in input_df.schema.fields if field.name == col2
+ )
+ _check_numeric_column(col_name=rel.cov.col1, col_type=col1_type)
+ _check_numeric_column(col_name=rel.cov.col2, col_type=col2_type)
+
result: float = input_df.cov(col1, col2)
return pandas.DataFrame({"cov": [result]})
@@ -81,7 +94,7 @@ def map_approx_quantile(
input_df = input_container.dataframe

snowflake_compatible = get_boolean_session_config_param(
- "enable_snowflake_extension_behavior"
+ "snowpark.connect.enable_snowflake_extension_behavior"
)

if not snowflake_compatible:
@@ -99,9 +112,11 @@ def map_approx_quantile(
else ""
)

-
+ exception = AnalysisException(
f"[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `{col_name}` cannot be resolved.{suggestion_text}"
)
+ attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+ raise exception

cols = input_container.column_map.get_snowpark_column_names_from_spark_column_names(
list(rel.approx_quantile.cols)
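The hunk above shows the error-handling refactor that recurs throughout this release: instead of raising an exception inline, mapper code now constructs it, tags it with attach_custom_error_code and an ErrorCodes member, and then raises it. The real helper and enum live in snowflake/snowpark_connect/error/error_utils.py and error/error_codes.py, whose bodies are not part of this excerpt; the sketch below uses illustrative stand-ins for both, only to show the call pattern.

from enum import Enum, auto


class ErrorCodes(Enum):
    # Illustrative subset of the codes referenced in the hunks; the real
    # enum in error/error_codes.py defines its own names and values.
    COLUMN_NOT_FOUND = auto()
    TYPE_MISMATCH = auto()
    INVALID_INPUT = auto()
    UNSUPPORTED_TYPE = auto()
    UNSUPPORTED_OPERATION = auto()
    INSUFFICIENT_INPUT = auto()
    INTERNAL_ERROR = auto()


def attach_custom_error_code(exception: Exception, code: ErrorCodes) -> None:
    # Stand-in: tag the exception so a server/telemetry layer can report the code.
    # The attribute name here is hypothetical.
    setattr(exception, "_custom_error_code", code)


def resolve_column(name: str, known_columns: list[str]) -> str:
    # Build, tag, then raise, mirroring the pattern in the diff above.
    if name not in known_columns:
        exception = ValueError(f"A column with name `{name}` cannot be resolved.")
        attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
        raise exception
    return name

Constructing the exception in a local variable first simply gives the code a handle to attach the error code to before raising.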
@@ -309,9 +324,28 @@ def map_freq_items(rel: relation_proto.Relation) -> DataFrameContainer:
cols = input_container.column_map.get_snowpark_column_names_from_spark_column_names(
list(rel.freq_items.cols)
)
+
+ # handle empty DataFrame case
+ row_count = input_df.count()
+
+ for sp_col_name in cols:
+ spark_col_names.append(
+ f"{input_container.column_map.get_spark_column_name_from_snowpark_column_name(sp_col_name)}_freqItems"
+ )
+
+ if row_count == 0:
+ # If DataFrame is empty, return empty arrays for each column
+ empty_values = [[] for _ in cols]
+ approx_top_k_df = session.createDataFrame([empty_values], spark_col_names)
+ return DataFrameContainer.create_with_column_mapping(
+ dataframe=approx_top_k_df,
+ spark_column_names=spark_col_names,
+ snowpark_column_names=spark_col_names,
+ )
+
approx_top_k_df = input_df.select(
*[
- fn.function("approx_top_k")(fn.col(col), round(
+ fn.function("approx_top_k")(fn.col(col), round(row_count / support))
for col in cols
]
)
@@ -330,10 +364,6 @@ def map_freq_items(rel: relation_proto.Relation) -> DataFrameContainer:
for value in approx_top_k_values
]

- for sp_col_name in cols:
- spark_col_names.append(
- f"{input_container.column_map.get_spark_column_name_from_snowpark_column_name(sp_col_name)}_freqItems"
- )
approx_top_k_df = session.createDataFrame([filtered_values], spark_col_names)

return DataFrameContainer.create_with_column_mapping(
@@ -371,3 +401,12 @@ def _build_column_map_helper_container(
spark_column_names=spark_col_names,
snowpark_column_names=desc_df.columns,
)
+
+
+ def _check_numeric_column(col_name: str, col_type: StructField) -> None:
+ """Checks if a column type is a Snowpark NumericType and raises an exception if not."""
+ if not isinstance(col_type, snowpark_types._NumericType):
+ raise IllegalArgumentException(
+ f"Column '{col_name}' must be of numeric type for covariance calculation, "
+ f"but got {col_type}"
+ )
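With the _check_numeric_column helper above, covariance requests routed through map_cov (DataFrame.cov / df.stat.cov) should now fail fast with an IllegalArgumentException when either column is non-numeric, rather than surfacing a backend SQL error. A minimal standalone check of the same isinstance logic against Snowpark types, assuming pyspark and snowflake-snowpark-python are installed (the DataFrame plumbing is omitted):

from pyspark.errors.exceptions.base import IllegalArgumentException
from snowflake.snowpark.types import DoubleType, StringType, _NumericType


def check_numeric_column(col_name, col_type):
    # Same shape as _check_numeric_column above: reject non-numeric Snowpark types.
    if not isinstance(col_type, _NumericType):
        raise IllegalArgumentException(
            f"Column '{col_name}' must be of numeric type for covariance calculation, "
            f"but got {col_type}"
        )


check_numeric_column("salary", DoubleType())  # numeric: passes silently

try:
    check_numeric_column("name", StringType())  # non-numeric: raises
except IllegalArgumentException as exc:
    print(exc)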
@@ -4,8 +4,12 @@

import pyspark.sql.connect.proto.relations_pb2 as relation_proto

+ from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
from snowflake.snowpark_connect.relation.map_relation import map_relation
+ from snowflake.snowpark_connect.relation.read.metadata_utils import (
+ without_internal_columns,
+ )


def map_alias(
@@ -17,8 +21,12 @@ def map_alias(
alias: str = rel.subquery_alias.alias
# we set reuse_parsed_plan=False because we need new expr_id for the attributes (output columns) in aliased snowpark dataframe
# reuse_parsed_plan will lead to ambiguous column name for operations like joining two dataframes that are aliased from the same dataframe
- input_container =
-
+ input_container = without_internal_columns(
+ map_relation(rel.subquery_alias.input, reuse_parsed_plan=False)
+ )
+ qualifiers = [
+ {ColumnQualifier((alias,))} for _ in input_container.column_map.columns
+ ]

return DataFrameContainer.create_with_column_mapping(
dataframe=input_container.dataframe,
@@ -28,4 +36,5 @@ def map_alias(
column_qualifiers=qualifiers,
parent_column_name_map=input_container.column_map.get_parent_column_name_map(),
alias=alias,
+ equivalent_snowpark_names=input_container.column_map.get_equivalent_snowpark_names(),
)
@@ -22,6 +22,8 @@ from snowflake.snowpark_connect.config import (
global_config,
)
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
from snowflake.snowpark_connect.expression.map_expression import (
map_single_column_expression,
)
@@ -163,17 +165,21 @@ def process_return_type(
else:
parsed_return = return_type
except ValueError as e:
-
+ exception = PythonException(
f"[UDTF_ARROW_TYPE_CAST_ERROR] Error parsing UDTF return type DDL: {e}"
)
+ attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+ raise exception
original_output_schema = proto_to_snowpark_type(parsed_return)
output_schema = proto_to_snowpark_type(parsed_return)
# Snowflake UDTF does not support MapType, so we convert it to VariantType.
output_schema = convert_maptype_to_variant(output_schema)
if not isinstance(output_schema, StructType):
-
+ exception = PySparkTypeError(
f"Invalid Python user-defined table function return type. Expect a struct type, but got {parsed_return}"
)
+ attach_custom_error_code(exception, ErrorCodes.TYPE_MISMATCH)
+ raise exception

expected_types = None
if is_arrow_enabled_in_udtf() or is_spark_compatible_udtf_mode_enabled():
@@ -276,12 +282,16 @@ def map_common_inline_user_defined_table_function(
if require_creating_udtf_in_sproc(udtf_proto):
snowpark_udtf_or_error = create_udtf_in_sproc(**kwargs)
if isinstance(snowpark_udtf_or_error, str):
-
+ exception = PythonException(snowpark_udtf_or_error)
+ attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+ raise exception
snowpark_udtf = snowpark_udtf_or_error
else:
udtf_or_error = create_udtf(**kwargs)
if isinstance(udtf_or_error, str):
-
+ exception = PythonException(udtf_or_error)
+ attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+ raise exception
udtf = udtf_or_error
snowpark_udtf = SnowparkUDTF(
name=udtf.name,
@@ -38,6 +38,8 @@ from snowflake.snowpark.types import (
TimeType,
_NumericType,
)
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
from snowflake.snowpark_connect.relation.read.utils import (
DATA_SOURCE_SQL_COMMENT,
Connection,
@@ -147,9 +149,11 @@ class JdbcDataFrameReader(DataFrameReader):
or upper_bound is not None
or num_partitions is not None
):
-
+ exception = ValueError(
"when column is not specified, lower_bound, upper_bound, num_partitions are expected to be None"
)
+ attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+ raise exception
if table is not None:
partitioned_queries = []
table_query = f"SELECT * FROM {table}"
@@ -160,24 +164,32 @@ class JdbcDataFrameReader(DataFrameReader):
elif query is not None:
partitioned_queries = [query]
else:
-
+ exception = ValueError("table or query is not specified")
+ attach_custom_error_code(exception, ErrorCodes.INSUFFICIENT_INPUT)
+ raise exception
else:
if lower_bound is None or upper_bound is None or num_partitions is None:
-
+ exception = ValueError(
"when column is specified, lower_bound, upper_bound, num_partitions must be specified"
)
+ attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+ raise exception

column_type = None
for field in struct_schema.fields:
if field.name.lower() == column.lower():
column_type = field.datatype
if column_type is None:
-
+ exception = ValueError("Column does not exist")
+ attach_custom_error_code(exception, ErrorCodes.COLUMN_NOT_FOUND)
+ raise exception

if not isinstance(column_type, _NumericType) and not isinstance(
column_type, DateType
):
-
+ exception = ValueError(f"unsupported type {column_type}")
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_TYPE)
+ raise exception
spark_column_name = f'"{column}"'
partitioned_queries = self._generate_partition(
table,
|
|
|
240
252
|
)
|
|
241
253
|
query_thread_executor.shutdown(wait=False)
|
|
242
254
|
upload_thread_executor.shutdown(wait=False)
|
|
243
|
-
|
|
255
|
+
exception = future.result()
|
|
256
|
+
attach_custom_error_code(
|
|
257
|
+
exception, ErrorCodes.INTERNAL_ERROR
|
|
258
|
+
)
|
|
259
|
+
raise exception
|
|
244
260
|
else:
|
|
245
261
|
path = future.result()
|
|
246
262
|
if not path:
|
|
@@ -266,7 +282,11 @@ class JdbcDataFrameReader(DataFrameReader):
|
|
|
266
282
|
)
|
|
267
283
|
query_thread_executor.shutdown(wait=False)
|
|
268
284
|
upload_thread_executor.shutdown(wait=False)
|
|
269
|
-
|
|
285
|
+
exception = f.result()
|
|
286
|
+
attach_custom_error_code(
|
|
287
|
+
exception, ErrorCodes.INTERNAL_ERROR
|
|
288
|
+
)
|
|
289
|
+
raise exception
|
|
270
290
|
finally:
|
|
271
291
|
close_connection(conn)
|
|
272
292
|
|
|
@@ -281,9 +301,14 @@ class JdbcDataFrameReader(DataFrameReader):
if table is not None:
sql = f"SELECT * FROM {table} WHERE 1=0"
elif query is not None:
-
+ # We need "jdbc_query" subquery alias as other datasources such as SQL Server and PostgreSQL
+ # do not work without an alias.
+ # Snowflake works with or without subquery alias.
+ sql = f"SELECT jdbc_query.* FROM ({query}) as jdbc_query WHERE 1=0"
else:
-
+ exception = ValueError("table or query is not specified")
+ attach_custom_error_code(exception, ErrorCodes.INSUFFICIENT_INPUT)
+ raise exception

cursor = conn.cursor()
cursor.execute(sql)
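The schema-probe change above wraps a user-supplied query in a jdbc_query subquery alias because, per the new comment, SQL Server and PostgreSQL reject derived tables that have no alias, while Snowflake accepts either form. A trivial sketch of the probe statement being built (the query text is hypothetical):

# Hypothetical user-provided query passed via the JDBC "query" option.
query = "SELECT id, amount FROM orders WHERE amount > 100"

# WHERE 1=0 returns no rows, so only the result-set metadata (the schema) comes back.
sql = f"SELECT jdbc_query.* FROM ({query}) as jdbc_query WHERE 1=0"
print(sql)
# SELECT jdbc_query.* FROM (SELECT id, amount FROM orders WHERE amount > 100) as jdbc_query WHERE 1=0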
@@ -301,7 +326,11 @@ class JdbcDataFrameReader(DataFrameReader):
dt = parser.parse(value)
return int(dt.replace(tzinfo=pytz.UTC).timestamp())
else:
-
+ exception = TypeError(
+ f"unsupported column type for partition: {column_type}"
+ )
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_TYPE)
+ raise exception

# this function is only used in data source API for SQL server
def _to_external_value(self, value: Union[int, str, float], column_type: DataType):
@@ -311,7 +340,11 @@ class JdbcDataFrameReader(DataFrameReader):
# TODO: SNOW-1909315: support timezone
return datetime.datetime.fromtimestamp(value, tz=pytz.UTC)
else:
-
+ exception = TypeError(
+ f"unsupported column type for partition: {column_type}"
+ )
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_TYPE)
+ raise exception

def _to_snowpark_type(self, schema: Tuple[tuple]) -> StructType:
fields = []
@@ -339,7 +372,9 @@ class JdbcDataFrameReader(DataFrameReader):
case jaydebeapi.BINARY:
field = StructField(name, BinaryType(), is_nullable)
case _:
-
+ exception = ValueError(f"unsupported type: {dbapi_type}")
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_TYPE)
+ raise exception

fields.append(field)
return StructType(fields)
@@ -359,7 +394,9 @@ class JdbcDataFrameReader(DataFrameReader):
processed_lower_bound = self._to_internal_value(lower_bound, column_type)
processed_upper_bound = self._to_internal_value(upper_bound, column_type)
if processed_lower_bound > processed_upper_bound:
-
+ exception = ValueError("lower_bound cannot be greater than upper_bound")
+ attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+ raise exception

if processed_lower_bound == processed_upper_bound or num_partitions <= 1:
return [select_query]
@@ -665,4 +702,6 @@ def get_jdbc_dialect(url: str) -> JdbcDialect:
for jdbc_dialect in jdbc_dialects:
if jdbc_dialect.can_handle(url):
return jdbc_dialect
-
+ exception = ValueError(f"Unsupported JDBC datasource: {url}")
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+ raise exception
@@ -10,13 +10,18 @@ import re
from pathlib import Path

import pyspark.sql.connect.proto.relations_pb2 as relation_proto
+ from pyspark.errors.exceptions.base import AnalysisException

from snowflake import snowpark
+ from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
from snowflake.snowpark.types import StructType
from snowflake.snowpark_connect.config import global_config
from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
from snowflake.snowpark_connect.relation.io_utils import (
convert_file_prefix_path,
+ get_compression_for_source_and_options,
is_cloud_path,
)
from snowflake.snowpark_connect.relation.read.map_read_table import map_read_table
@@ -26,9 +31,12 @@ from snowflake.snowpark_connect.relation.read.reader_config import (
ParquetReaderConfig,
)
from snowflake.snowpark_connect.relation.stage_locator import get_paths_from_stage
- from snowflake.snowpark_connect.type_mapping import
+ from snowflake.snowpark_connect.type_mapping import (
+ _parse_ddl_with_spark_scala,
+ map_json_schema_to_snowpark,
+ )
from snowflake.snowpark_connect.utils.cache import df_cache_map_put_if_absent
- from snowflake.snowpark_connect.utils.context import
+ from snowflake.snowpark_connect.utils.context import get_spark_session_id
from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
from snowflake.snowpark_connect.utils.telemetry import (
SnowparkConnectNotImplementedError,
@@ -46,6 +54,7 @@ def map_read(

Currently, the supported read formats are `csv`, `json` and `parquet`.
"""
+
match rel.read.WhichOneof("read_type"):
case "named_table":
return map_read_table_or_file(rel)
@@ -74,28 +83,26 @@ def map_read(
try:
parsed_schema = json.loads(rel.read.data_source.schema)
except json.JSONDecodeError:
- #
-
-
-
-
-
- ), f"Schema's definition {name_and_type} is invalid"
- parsed_schema["fields"].append(
- {
- "name": name_and_type[0],
- "nullable": True,
- "type": name_and_type[1],
- }
- )
+ # Scala clients send DDL-formatted strings like
+ # "billing_account_id STRING, cost STRING" or "struct<id:bigint>"
+ spark_datatype = _parse_ddl_with_spark_scala(
+ rel.read.data_source.schema
+ )
+ parsed_schema = json.loads(spark_datatype.json())
schema = map_json_schema_to_snowpark(parsed_schema)
options = dict(rel.read.data_source.options)
telemetry.report_io_read(read_format)
session: snowpark.Session = get_or_create_snowpark_session()
if len(rel.read.data_source.paths) > 0:
+ if options.get("path"):
+ raise AnalysisException(
+ "There is a 'path' or 'paths' option set and load() is called with path parameters. "
+ "Either remove the path option if it's the same as the path parameter, "
+ "or add it to the load() parameter if you do want to read multiple paths."
+ )
# Normalize paths to ensure consistent behavior
clean_source_paths = [
- path
+ path if is_cloud_path(path) else str(Path(path))
for path in rel.read.data_source.paths
]
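In the hunk above, a schema string that is not valid JSON is now treated as a Spark DDL string (what Scala clients send) and parsed via _parse_ddl_with_spark_scala; the resulting Spark DataType is serialized back to JSON so the existing map_json_schema_to_snowpark path can consume it. For reference, a DDL string like "billing_account_id STRING, cost STRING" round-trips to roughly the standard Spark StructType.json() shape (the dict below is an approximation, not output captured from this code):

import json

# Approximate .json() output for the StructType parsed from
# "billing_account_id STRING, cost STRING".
ddl_as_json = json.dumps(
    {
        "type": "struct",
        "fields": [
            {"name": "billing_account_id", "type": "string", "nullable": True, "metadata": {}},
            {"name": "cost", "type": "string", "nullable": True, "metadata": {}},
        ],
    }
)
parsed_schema = json.loads(ddl_as_json)  # roughly what map_json_schema_to_snowpark receives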
@@ -121,23 +128,67 @@ def map_read(
options = {k.lower(): v for k, v in options.items()}
QUERY_OPTION = "query"
DBTABLE_OPTION = "dbtable"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+ def _identifiers_match(
+ desired: str, current: str | None
+ ) -> bool:
+ if current is None:
+ return False
+
+ desired_unquoted = unquote_if_quoted(desired)
+ current_unquoted = unquote_if_quoted(current)
+ desired_was_quoted = desired != desired_unquoted
+
+ # If both are quoted, exact match required. session.get* always returns quoted identifier
+ # name.
+ if desired_was_quoted:
+ return desired == current
+
+ return desired_unquoted.upper() == current_unquoted
+
+ if "sfrole" in options:
+ desired_role = options["sfrole"]
+ current_role = session.get_current_role()
+ if not _identifiers_match(desired_role, current_role):
+ logger.warning(
+ f"Changing Role from {current_role} to {desired_role} via "
+ "options. This will change the role for the entire session."
+ )
+ session.use_role(desired_role)
+
+ if "sfwarehouse" in options:
+ desired_warehouse = options["sfwarehouse"]
+ current_warehouse = session.get_current_warehouse()
+ if not _identifiers_match(
+ desired_warehouse, current_warehouse
+ ):
+ logger.warning(
+ f"Changing Warehouse from {current_warehouse} to {desired_warehouse} via "
+ "options. This will change the warehouse for the entire session."
+ )
+ session.use_warehouse(desired_warehouse)
+
+ if "sfdatabase" in options:
+ desired_database = options["sfdatabase"]
+ current_database = session.get_current_database()
+ if not _identifiers_match(
+ desired_database, current_database
+ ):
+ logger.warning(
+ f"Changing Database from {current_database} to {desired_database} via "
+ "options. This will change the database for the entire session."
+ )
+ session.use_database(desired_database)
+
+ if "sfschema" in options:
+ desired_schema = options["sfschema"]
+ current_schema = session.get_current_schema()
+ if not _identifiers_match(desired_schema, current_schema):
+ logger.warning(
+ f"Changing Schema from {current_schema} to {desired_schema} via "
+ "options. This will change the schema for the entire session."
+ )
+ session.use_schema(desired_schema)
if QUERY_OPTION in options.keys():
from .map_read_table import get_table_from_query
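The new option handling above lets the sfrole / sfwarehouse / sfdatabase / sfschema reader options switch the session context, warning whenever the requested value actually differs from the current one. The comparison in _identifiers_match is the subtle part: a double-quoted identifier must match exactly, while an unquoted one is upper-cased and compared against the quoted name Snowpark returns. A self-contained sketch of that rule, with a simplified stand-in for Snowpark's unquote_if_quoted (the real helper lives in snowflake.snowpark._internal.analyzer.analyzer_utils):

def unquote_if_quoted(ident: str) -> str:
    # Simplified stand-in: strip one pair of surrounding double quotes if present.
    if len(ident) >= 2 and ident[0] == '"' and ident[-1] == '"':
        return ident[1:-1]
    return ident


def identifiers_match(desired: str, current: str | None) -> bool:
    if current is None:
        return False
    desired_unquoted = unquote_if_quoted(desired)
    current_unquoted = unquote_if_quoted(current)
    desired_was_quoted = desired != desired_unquoted
    # Quoted identifiers are case-sensitive: require an exact match.
    if desired_was_quoted:
        return desired == current
    # Unquoted identifiers fold to upper case in Snowflake.
    return desired_unquoted.upper() == current_unquoted


print(identifiers_match("analyst", '"ANALYST"'))    # True: unquoted name folds to upper case
print(identifiers_match('"Analyst"', '"ANALYST"'))  # False: quoted, so exact match required
print(identifiers_match('"ANALYST"', '"ANALYST"'))  # True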
@@ -151,19 +202,27 @@ def map_read(
options[DBTABLE_OPTION], session, rel.common.plan_id
)
case other:
-
+ exception = SnowparkConnectNotImplementedError(
f"UNSUPPORTED FORMAT {other} WITH NO PATH"
)
+ attach_custom_error_code(
+ exception, ErrorCodes.UNSUPPORTED_OPERATION
+ )
+ raise exception
case other:
# TODO: Empty data source
-
+ exception = SnowparkConnectNotImplementedError(
+ f"Unsupported read type: {other}"
+ )
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+ raise exception

return df_cache_map_put_if_absent(
- (
+ (get_spark_session_id(), rel.common.plan_id), lambda: result
)


- def map_read_table_or_file(rel):
+ def map_read_table_or_file(rel) -> DataFrameContainer:
read_named_table_from_file = (
rel.read.named_table.unparsed_identifier
and _get_supported_read_file_format(rel.read.named_table.unparsed_identifier)
|
-
def map_read_table_or_file(rel):
|
|
225
|
+
def map_read_table_or_file(rel) -> DataFrameContainer:
|
|
167
226
|
read_named_table_from_file = (
|
|
168
227
|
rel.read.named_table.unparsed_identifier
|
|
169
228
|
and _get_supported_read_file_format(rel.read.named_table.unparsed_identifier)
|
|
@@ -205,6 +264,23 @@ def _get_supported_read_file_format(unparsed_identifier: str) -> str | None:
|
|
|
205
264
|
return None
|
|
206
265
|
|
|
207
266
|
|
|
267
|
+
# TODO: [SNOW-2465948] Remove this once Snowpark fixes the issue with stage paths.
|
|
268
|
+
class StagePathStr(str):
|
|
269
|
+
def partition(self, __sep):
|
|
270
|
+
if str(self)[0] == "'":
|
|
271
|
+
return str(self)[1:].partition(__sep)
|
|
272
|
+
return str(self).partition(__sep)
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def _quote_stage_path(stage_path: str) -> str:
|
|
276
|
+
"""
|
|
277
|
+
Quote stage paths to escape any special characters.
|
|
278
|
+
"""
|
|
279
|
+
if stage_path.startswith("@"):
|
|
280
|
+
return StagePathStr(f"'{stage_path}'")
|
|
281
|
+
return stage_path
|
|
282
|
+
|
|
283
|
+
|
|
208
284
|
def _read_file(
|
|
209
285
|
clean_source_paths: list[str],
|
|
210
286
|
options: dict,
|
|
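The StagePathStr / _quote_stage_path workaround above single-quotes stage paths so special characters (for example spaces) survive in generated SQL, and it overrides str.partition, presumably so code that later splits the path on a separator does not trip over the leading quote. Both pieces are plain Python, so the behaviour can be checked in isolation; the snippet below copies them from the hunk and adds example calls (the stage name is made up):

# TODO: [SNOW-2465948] Remove this once Snowpark fixes the issue with stage paths.
class StagePathStr(str):
    def partition(self, __sep):
        # Skip the leading single quote added by _quote_stage_path when splitting.
        if str(self)[0] == "'":
            return str(self)[1:].partition(__sep)
        return str(self).partition(__sep)


def _quote_stage_path(stage_path: str) -> str:
    """Quote stage paths to escape any special characters."""
    if stage_path.startswith("@"):
        return StagePathStr(f"'{stage_path}'")
    return stage_path


p = _quote_stage_path("@my_stage/dir with spaces/data.csv")
print(p)                 # '@my_stage/dir with spaces/data.csv'  (quotes added)
print(p.partition("/"))  # ('@my_stage', '/', "dir with spaces/data.csv'")  (leading quote skipped)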
@@ -212,12 +288,21 @@ def _read_file(
rel: relation_proto.Relation,
schema: StructType | None,
session: snowpark.Session,
- ) ->
+ ) -> DataFrameContainer:
paths = get_paths_from_stage(
clean_source_paths,
session,
)
upload_files_if_needed(paths, clean_source_paths, session, read_format)
+ paths = [_quote_stage_path(path) for path in paths]
+
+ if read_format in ("csv", "text", "json", "parquet"):
+ compression = get_compression_for_source_and_options(
+ read_format, options, from_read=True
+ )
+ if compression is not None:
+ options["compression"] = compression
+
match read_format:
case "csv":
from snowflake.snowpark_connect.relation.read.map_read_csv import (
@@ -230,7 +315,11 @@ def _read_file(
map_read_json,
)

-
+ # JSON already materializes the table internally
+ return map_read_json(
+ rel, schema, session, paths, JsonReaderConfig(options)
+ ).without_materialization()
+
case "parquet":
from snowflake.snowpark_connect.relation.read.map_read_parquet import (
map_read_parquet,
@@ -246,9 +335,11 @@ def _read_file(

return map_read_text(rel, schema, session, paths)
case _:
-
+ exception = SnowparkConnectNotImplementedError(
f"Unsupported format: {read_format}"
)
+ attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+ raise exception


def _skip_upload(path: str, read_format: str):
@@ -285,8 +376,8 @@ def upload_files_if_needed(

def _upload_dir(target: str, source: str) -> None:
# overwrite=True will not remove all stale files in the target prefix
-
- remove_command = f"REMOVE {target}/"
+ # Quote the target path to allow special characters.
+ remove_command = f"REMOVE '{target}/'"
assert (
"//" not in remove_command
), f"Remove command {remove_command} contains double slash"