snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +717 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +309 -26
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/error_utils.py +28 -0
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +224 -15
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +86 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
- snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
- snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +171 -48
- snowflake/snowpark_connect/server.py +528 -473
- snowflake/snowpark_connect/server_common/__init__.py +503 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/type_support.py +130 -0
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +195 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +192 -40
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
@@ -5,6 +5,7 @@
 import collections
 import re
 from collections.abc import Callable
+from typing import Any
 
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
@@ -20,12 +21,31 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     quote_name_without_upper_casing,
 )
 from snowflake.snowpark.column import METADATA_FILENAME
-from snowflake.snowpark.types import
+from snowflake.snowpark.types import (
+    DataType,
+    DoubleType,
+    IntegerType,
+    StringType,
+    StructType,
+)
+from snowflake.snowpark_connect.config import external_table_location
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
+from snowflake.snowpark_connect.relation.read.map_read_partitioned_parquet import (
+    read_partitioned_parquet_from_external_table,
+    use_external_table,
+)
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+)
 from snowflake.snowpark_connect.relation.read.reader_config import ReaderWriterConfig
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     rename_columns_as_snowflake_standard,
 )
+from snowflake.snowpark_connect.type_support import emulate_integral_types
+from snowflake.snowpark_connect.utils.io_utils import cached_file_format
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
@@ -33,7 +53,7 @@ from snowflake.snowpark_connect.utils.telemetry import (
 
 def map_read_parquet(
     rel: relation_proto.Relation,
-    schema:
+    schema: StructType | None,
     session: snowpark.Session,
     paths: list[str],
     options: ReaderWriterConfig,
@@ -41,28 +61,62 @@ def map_read_parquet(
     """Read a Parquet file into a Snowpark DataFrame."""
 
     if rel.read.is_streaming is True:
-
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for Parquet files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
 
-
-
+    converted_snowpark_options = options.convert_to_snowpark_args()
+    file_format_options = _parse_parquet_snowpark_options(converted_snowpark_options)
+    raw_options = rel.read.data_source.options
     assert len(paths) > 0, "Read PARQUET expects at least one path"
 
-
+    snowpark_options = {
+        # Setting these two options prevents a significant number of additional CREATE TEMPORARY
+        # FILE FORMAT and DROP FILE FORMAT queries. If FORMAT_NAME is not set, the Snowpark DF reader
+        # will eagerly issue a CREATE TEMPORARY FILE FORMAT when inferring the schema of the result;
+        # if ENFORCE_EXISTING_FILE_FORMAT is not set, an additional CREATE ... command will be
+        # issued when the lazy DF is materialized by a cache_result call.
+        "FORMAT_NAME": converted_snowpark_options.get(
+            "FORMAT_NAME",
+            cached_file_format(session, "parquet", file_format_options),
+        ),
+        "ENFORCE_EXISTING_FILE_FORMAT": True,
+    }
+
+    if "PATTERN" in converted_snowpark_options:
+        snowpark_options["PATTERN"] = converted_snowpark_options.get("PATTERN")
+
+    apply_metadata_exclusion_pattern(snowpark_options)
+
+    reader = add_filename_metadata_to_reader(
+        session.read.options(snowpark_options), raw_options
+    )
 
     if len(paths) == 1:
-        df = _read_parquet_with_partitions(
+        df, read_using_external_table = _read_parquet_with_partitions(
+            session, reader, paths[0], schema, snowpark_options
+        )
+        can_be_cached = not read_using_external_table
     else:
         is_merge_schema = options.config.get("mergeschema")
-        df = _read_parquet_with_partitions(
+        df, read_using_external_table = _read_parquet_with_partitions(
+            session, reader, paths[0], schema, snowpark_options
+        )
+        can_be_cached = not read_using_external_table
         schema_cols = df.columns
         for p in paths[1:]:
             reader._user_schema = None
+            partition_df, read_using_external_table = _read_parquet_with_partitions(
+                session, reader, p, schema, snowpark_options
+            )
             df = df.union_all_by_name(
-
+                partition_df,
                 allow_missing_columns=True,
             )
+            can_be_cached = can_be_cached and not read_using_external_table
+
         if not is_merge_schema:
             df = df.select(*schema_cols)
 
@@ -73,34 +127,92 @@ def map_read_parquet(
         dataframe=renamed_df,
         spark_column_names=[analyzer_utils.unquote_if_quoted(c) for c in df.columns],
         snowpark_column_names=snowpark_column_names,
-        snowpark_column_types=[
+        snowpark_column_types=[
+            emulate_integral_types(f.datatype) for f in df.schema.fields
+        ],
+        can_be_cached=can_be_cached,
     )
 
 
 def _read_parquet_with_partitions(
-    session: Session,
-
-
+    session: Session,
+    reader: DataFrameReader,
+    path: str,
+    schema: StructType | None,
+    snowpark_options: dict[str, Any],
+) -> tuple[DataFrame, bool]:
+    """
+    Reads parquet files and adds partition columns from subdirectories.
+    Returns a tuple of read DataFrame and a boolean indicating if DataFrame was read from external table.
+    """
 
     partition_columns, inferred_types = _discover_partition_columns(session, path)
-    df = reader.with_metadata(METADATA_FILENAME).parquet(path)
 
-
-
+    def _get_df() -> DataFrame:
+        if not partition_columns:
+            return reader.parquet(path)
+        else:
+            # In case of too big overhead we can always optimize by using option: MAX_FILE_COUNT and allow user to define how many files should be scanned
+            df = reader.with_metadata(METADATA_FILENAME).parquet(path)
+
+            for col_name in partition_columns:
+                quoted_col_name = quote_name_without_upper_casing(col_name)
+                escaped_col_name = re.escape(col_name)
+                regex_pattern = rf"{escaped_col_name}=([^/]+)"
+
+                raw_value = snowpark_fn.regexp_extract(
+                    METADATA_FILENAME, regex_pattern, 1
+                )
+                value_or_null = snowpark_fn.when(raw_value == "", None).otherwise(
+                    raw_value
+                )
+
+                df = df.with_column(
+                    quoted_col_name,
+                    snowpark_fn.cast(value_or_null, inferred_types[col_name]),
+                )
+            return df.drop(METADATA_FILENAME)
+
+    if use_external_table(session, path):
+        if schema is None:
+            schema = _get_df().schema
+        return (
+            read_partitioned_parquet_from_external_table(
+                session,
+                schema,
+                external_table_location(),
+                path[1:-1],
+                partition_columns,
+                inferred_types,
+                snowpark_options,
+            ),
+            True,
+        )
+    else:
+        # TODO: SNOW-2736756 support user schema
+        assert schema is None, "Read PARQUET does not support user schema"
+        return _get_df(), False
 
-    for col_name in partition_columns:
-        quoted_col_name = quote_name_without_upper_casing(col_name)
-        escaped_col_name = re.escape(col_name)
-        regex_pattern = rf"{escaped_col_name}=([^/]+)"
 
-
-
+_parquet_file_format_allowed_options = {
+    "COMPRESSION",
+    "SNAPPY_COMPRESSION",
+    "BINARY_AS_TEXT",
+    "TRIM_SPACE",
+    "USE_LOGICAL_TYPE",
+    "USE_VECTORIZED_SCANNER",
+    "REPLACE_INVALID_CHARACTERS",
+    "NULL_IF",
+}
 
-        df = df.with_column(
-            quoted_col_name, snowpark_fn.cast(value_or_null, inferred_types[col_name])
-        )
 
-
+def _parse_parquet_snowpark_options(snowpark_options: dict[str, Any]) -> dict[str, Any]:
+    file_format_options = dict()
+    for key, value in snowpark_options.items():
+        upper_key = key.upper()
+        if upper_key in _parquet_file_format_allowed_options:
+            file_format_options[upper_key] = value
+    return file_format_options
 
 
 def _extract_partitions_from_path(path: str) -> dict[str, str]:
@@ -149,10 +261,14 @@ def _discover_partition_columns(
         if i not in dir_level_to_column_name:
             dir_level_to_column_name[i] = key
         elif dir_level_to_column_name[i] != key:
-
+            exception = ValueError(
                 f"Conflicting partition column names detected: '{dir_level_to_column_name[i]}' and '{key}' "
                 f"at the same directory level"
             )
+            attach_custom_error_code(
+                exception, ErrorCodes.INVALID_OPERATION
+            )
+            raise exception
 
         partition_columns_values[key].add(value)
 
@@ -160,10 +276,12 @@ def _discover_partition_columns(
     for level in sorted(dir_level_to_column_name.keys()):
         col_name = dir_level_to_column_name[level]
         if col_name in seen_columns:
-
+            exception = ValueError(
                 f"Found partition column '{col_name}' at multiple directory levels. "
                 f"A partition column can only appear at a single level."
             )
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
         seen_columns.add(col_name)
 
     ordered_columns = [
@@ -0,0 +1,142 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+import re
+from copy import deepcopy
+from typing import Any
+
+from snowflake import snowpark
+from snowflake.snowpark import Session
+from snowflake.snowpark._internal.analyzer.analyzer_utils import (
+    quote_name_without_upper_casing,
+    unquote_if_quoted,
+)
+from snowflake.snowpark.functions import col, lit
+from snowflake.snowpark.types import ArrayType, DataType, MapType, StructType
+from snowflake.snowpark_connect.config import external_table_location
+from snowflake.snowpark_connect.utils.context import (
+    get_spark_session_id,
+    register_request_external_table,
+)
+from snowflake.snowpark_connect.utils.io_utils import cached_file_format
+from snowflake.snowpark_connect.utils.scala_udf_utils import map_type_to_snowflake_type
+
+STRUCTURED_TYPE_PATTERN = re.compile(r"\([^)]*\)")
+
+
+def use_external_table(session: Session, path: str) -> bool:
+    external_table_path = external_table_location()
+    stripped_path = path[1:-1]
+
+    is_external_table_path_defined = external_table_path is not None
+    is_stage = stripped_path.startswith("@")
+
+    return (
+        is_external_table_path_defined
+        and is_stage
+        and _is_external_stage(session, stripped_path)
+    )
+
+
+def _is_external_stage(session: Session, path: str) -> bool:
+    try:
+        stage_description = (
+            session.sql(f"DESCRIBE STAGE {path.split('/')[0][1:]}")
+            .filter(col('"property"') == lit("URL"))
+            .collect()
+        )
+        return stage_description[0]["property_value"] != ""
+    except Exception:
+        return False
+
+
+def _get_count_of_non_partition_path_parts(path: str) -> int:
+    count = 0
+    # First element of a path is a stage identifier we need to ignore it to count relative path parts
+    for element in path.split("/")[1:]:
+        if "=" in element:
+            break
+        count += 1
+    return count
+
+
+def read_partitioned_parquet_from_external_table(
+    session: Session,
+    schema: StructType,
+    external_table_path: str,
+    path: str,
+    partition_columns: list[str],
+    inferred_types: dict[str, DataType],
+    snowpark_options: dict[str, Any],
+) -> snowpark.DataFrame:
+    skip_path_parts = _get_count_of_non_partition_path_parts(path)
+    snowpark_partition_columns = ", ".join(
+        [quote_name_without_upper_casing(col) for col in partition_columns]
+    )
+    snowpark_typed_partition_columns = ", ".join(
+        [
+            f"{quote_name_without_upper_casing(col)} {map_type_to_snowflake_type(inferred_types[col])} as (split_part(split_part(METADATA$FILENAME, '/', {i + skip_path_parts}), '=', 2)::{map_type_to_snowflake_type(inferred_types[col])})"
+            for col, i in zip(partition_columns, range(len(partition_columns)))
+        ]
+    )
+    snowpark_schema_columns = ",".join(
+        [
+            f"{field.name} {_map_snowpark_type_to_simplified_snowflake_type(field.datatype)} as (value:{field.name}::{_map_snowpark_type_to_simplified_snowflake_type(field.datatype)})"
+            for field in schema.fields
+            if unquote_if_quoted(field.name) not in snowpark_partition_columns
+        ]
+    )
+
+    table_name = f"{external_table_path}.{quote_name_without_upper_casing(path + get_spark_session_id())}"
+    snowpark_options_copy = deepcopy(snowpark_options)
+    # These options are only used in the Snowpark Python reader, but not the actual emitted SQL.
+    snowpark_options_copy.pop("PATTERN")
+    snowpark_options_copy.pop("FORMAT_NAME")
+    snowpark_options_copy.pop("ENFORCE_EXISTING_FILE_FORMAT")
+    file_format_name = cached_file_format(session, "parquet", snowpark_options_copy)
+    session.sql(
+        f"""
+        CREATE OR REPLACE EXTERNAL TABLE {table_name} (
+            {snowpark_typed_partition_columns},
+            {snowpark_schema_columns}
+        )
+        PARTITION BY ({snowpark_partition_columns})
+        WITH LOCATION = {path}
+        FILE_FORMAT = {file_format_name}
+        PATTERN = '{snowpark_options.get('PATTERN', '.*')}'
+        AUTO_REFRESH = false
+        """
+    ).collect()
+    register_request_external_table(table_name)
+    map_fields = ", ".join(
+        [
+            f"{field.name}::{_map_snowpark_type_to_snowflake(field.datatype)} as {field.name}"
+            if isinstance(field.datatype, (StructType, MapType, ArrayType))
+            else field.name
+            for field in schema.fields
+        ]
+    )
+    return session.sql(f"SELECT {map_fields} FROM {table_name}")
+
+
+def _map_snowpark_type_to_simplified_snowflake_type(datatype: DataType) -> str:
+    if isinstance(datatype, StructType):
+        return "OBJECT"
+    elif isinstance(datatype, MapType):
+        return "VARIANT"
+    else:
+        return STRUCTURED_TYPE_PATTERN.sub("", map_type_to_snowflake_type(datatype))
+
+
+def _map_snowpark_type_to_snowflake(datatype: DataType) -> str:
+    if isinstance(datatype, StructType):
+        object_fields = ", ".join(
+            [
+                f"{field.name} { _map_snowpark_type_to_snowflake(field.datatype)}"
+                for field in datatype.fields
+            ]
+        )
+        return f"OBJECT({object_fields})"
+    else:
+        return map_type_to_snowflake_type(datatype)
@@ -9,6 +9,9 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
 from snowflake import snowpark
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
+from snowflake.snowpark_connect.type_support import emulate_integral_types
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
@@ -30,7 +33,9 @@ def map_read_socket(
         host = options.get("host", None)
         port = options.get("port", None)
         if not host or not port:
-
+            exception = ValueError("Host and port must be provided in options.")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+            raise exception
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
             try:
                 s.connect((host, int(port)))
@@ -54,10 +59,17 @@ def map_read_socket(
                     dataframe=df,
                     spark_column_names=[spark_cname],
                     snowpark_column_names=[snowpark_cname],
+                    snowpark_column_types=[
+                        emulate_integral_types(f.datatype) for f in df.schema.fields
+                    ],
                 )
             except OSError as e:
-
+                exception = Exception(f"Error connecting to {host}:{port} - {e}")
+                attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+                raise exception
     else:
-
+        exception = SnowparkConnectNotImplementedError(
             "Socket reads are only supported in streaming mode."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
@@ -11,11 +11,21 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     unquote_if_quoted,
 )
 from snowflake.snowpark.exceptions import SnowparkSQLException
+from snowflake.snowpark.types import StructField, StructType
+from snowflake.snowpark_connect.column_name_handler import (
+    ColumnNameMap,
+    make_column_names_snowpark_compatible,
+)
+from snowflake.snowpark_connect.column_qualifier import ColumnQualifier
 from snowflake.snowpark_connect.config import auto_uppercase_non_column_identifiers
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.utils import (
     rename_columns_as_snowflake_standard,
 )
+from snowflake.snowpark_connect.type_support import emulate_integral_types
+from snowflake.snowpark_connect.utils.context import get_processed_views
 from snowflake.snowpark_connect.utils.identifiers import (
     split_fully_qualified_spark_name,
 )
@@ -23,6 +33,7 @@ from snowflake.snowpark_connect.utils.session import _get_current_snowpark_sessi
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
 )
+from snowflake.snowpark_connect.utils.temporary_view_helper import get_temp_view
 
 
 def post_process_df(
@@ -48,8 +59,10 @@ def post_process_df(
             dataframe=renamed_df,
             spark_column_names=true_names,
             snowpark_column_names=snowpark_column_names,
-            snowpark_column_types=[
-
+            snowpark_column_types=[
+                emulate_integral_types(f.datatype) for f in df.schema.fields
+            ],
+            column_qualifiers=[{ColumnQualifier(tuple(name_parts))} for _ in true_names]
             if source_table_name
             else None,
         )
@@ -57,22 +70,85 @@ def post_process_df(
         # Check if this is a table/view not found error
         # Snowflake error codes: 002003 (42S02) - Object does not exist or not authorized
         if hasattr(e, "sql_error_code") and e.sql_error_code == 2003:
-
+            exception = AnalysisException(
                 f"[TABLE_OR_VIEW_NOT_FOUND] The table or view cannot be found. {source_table_name}"
-            )
+            )
+            attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+            raise exception from None  # Suppress original exception to reduce message size
         # Re-raise if it's not a table not found error
         raise
 
 
+def _get_temporary_view(
+    temp_view: DataFrameContainer, table_name: str, plan_id: int
+) -> DataFrameContainer:
+    fields_names = [field.name for field in temp_view.dataframe.schema.fields]
+    fields_types = [field.datatype for field in temp_view.dataframe.schema.fields]
+
+    snowpark_column_names = make_column_names_snowpark_compatible(
+        temp_view.column_map.get_spark_columns(), plan_id
+    )
+    # Rename columns in dataframe to prevent conflicting names during joins
+    renamed_df = temp_view.dataframe.select(
+        *(
+            temp_view.dataframe.col(orig).alias(alias)
+            for orig, alias in zip(fields_names, snowpark_column_names)
+        )
+    )
+    # do not flatten initial rename when reading table
+    # TODO: remove once SNOW-2203826 is done
+    if renamed_df._select_statement is not None:
+        renamed_df._select_statement.flatten_disabled = True
+
+    new_column_map = ColumnNameMap(
+        spark_column_names=temp_view.column_map.get_spark_columns(),
+        snowpark_column_names=snowpark_column_names,
+        column_metadata=temp_view.column_map.column_metadata,
+        column_qualifiers=[
+            {ColumnQualifier(tuple(split_fully_qualified_spark_name(table_name)))}
+            for _ in range(len(temp_view.column_map.get_spark_columns()))
+        ],
+        parent_column_name_map=temp_view.column_map.get_parent_column_name_map(),
+    )
+
+    schema = StructType(
+        [
+            StructField(name, type, _is_column=False)
+            for name, type in zip(snowpark_column_names, fields_types)
+        ]
+    )
+    return DataFrameContainer(
+        dataframe=renamed_df,
+        column_map=new_column_map,
+        table_name=temp_view.table_name,
+        alias=temp_view.alias,
+        partition_hint=temp_view.partition_hint,
+        cached_schema_getter=lambda: schema,
+    )
+
+
 def get_table_from_name(
     table_name: str, session: snowpark.Session, plan_id: int
 ) -> DataFrameContainer:
     """Get table from name returning a container."""
+
+    # Verify if recursive view read is not attempted
+    if table_name in get_processed_views():
+        exception = AnalysisException(
+            f"[RECURSIVE_VIEW] Recursive view `{table_name}` detected (cycle: `{table_name}` -> `{table_name}`)"
+        )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+        raise exception
+
     snowpark_name = ".".join(
         quote_name_without_upper_casing(part)
         for part in split_fully_qualified_spark_name(table_name)
     )
 
+    temp_view = get_temp_view(snowpark_name)
+    if temp_view:
+        return _get_temporary_view(temp_view, table_name, plan_id)
+
     if auto_uppercase_non_column_identifiers():
         snowpark_name = snowpark_name.upper()
 
@@ -101,10 +177,14 @@ def map_read_table(
         and rel.read.data_source.format.lower() == "iceberg"
     ):
         if len(rel.read.data_source.paths) != 1:
-
+            exception = SnowparkConnectNotImplementedError(
                 f"Unexpected paths: {rel.read.data_source.paths}"
             )
+            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+            raise exception
         table_identifier = rel.read.data_source.paths[0]
     else:
-
+        exception = ValueError("The relation must have a table identifier.")
+        attach_custom_error_code(exception, ErrorCodes.INVALID_INPUT)
+        raise exception
     return get_table_from_name(table_identifier, session, rel.common.plan_id)
@@ -8,10 +8,13 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
 from snowflake import snowpark
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.relation.read.utils import (
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
+from snowflake.snowpark_connect.type_support import emulate_integral_types
 from snowflake.snowpark_connect.utils.io_utils import file_format
 from snowflake.snowpark_connect.utils.telemetry import (
     SnowparkConnectNotImplementedError,
@@ -24,11 +27,17 @@ def get_file_paths_from_stage(
 ) -> typing.List[str]:
     files_paths = []
     for listed_path_row in session.sql(f"LIST {path}").collect():
+        # Skip _SUCCESS marker files
+        if listed_path_row[0].endswith("_SUCCESS"):
+            continue
+
         listed_path = listed_path_row[0].split("/")
         if listed_path_row[0].startswith("s3://") or listed_path_row[0].startswith(
             "s3a://"
         ):
             listed_path = listed_path[3:]
+        elif listed_path_row[0].startswith("azure://"):
+            listed_path = listed_path[4:]
         else:
             listed_path = listed_path[1:]
         files_paths.append("/".join(listed_path))
@@ -43,7 +52,12 @@ def read_text(
 ) -> snowpark.DataFrame:
     # TODO: handle stage name with double quotes
     files_paths = get_file_paths_from_stage(path, session)
-
+    # Remove matching quotes from both ends of the path to get the stage name, if present.
+    if path and len(path) > 1 and path[0] == path[-1] and path[0] in ('"', "'"):
+        unquoted_path = path[1:-1]
+    else:
+        unquoted_path = path
+    stage_name = unquoted_path.split("/")[0]
     line_sep = options.get("lineSep") or "\n"
     column_name = (
         schema[0].name if schema is not None and len(schema.fields) > 0 else '"value"'
@@ -59,7 +73,7 @@ def read_text(
     )
     for fp in files_paths:
         content = session.sql(
-            f"SELECT T.$1 AS {default_column_name} FROM {stage_name}/{fp} (FILE_FORMAT => {text_file_format}) AS T"
+            f"SELECT T.$1 AS {default_column_name} FROM '{stage_name}/{fp}' (FILE_FORMAT => {text_file_format}) AS T"
         ).collect()
         for row in content:
             result.append(row[0])
@@ -77,9 +91,11 @@ def map_read_text(
     """
     if rel.read.is_streaming is True:
         # TODO: Structured streaming implementation.
-
+        exception = SnowparkConnectNotImplementedError(
             "Streaming is not supported for CSV files."
         )
+        attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+        raise exception
 
     df = read_text(paths[0], schema, session, rel.read.data_source.options)
     if len(paths) > 1:
@@ -102,5 +118,7 @@ def map_read_text(
         dataframe=renamed_df,
         spark_column_names=spark_column_names,
         snowpark_column_names=snowpark_column_names,
-        snowpark_column_types=[
+        snowpark_column_types=[
+            emulate_integral_types(f.datatype) for f in df.schema.fields
+        ],
     )