snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +680 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +237 -23
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +123 -5
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +85 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
- snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
- snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +110 -48
- snowflake/snowpark_connect/server.py +546 -456
- snowflake/snowpark_connect/server_common/__init__.py +500 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +187 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +163 -22
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0

snowflake/snowpark_connect/column_qualifier.py (new file)
@@ -0,0 +1,43 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from snowflake.snowpark._internal.analyzer.analyzer_utils import (
+    quote_name_without_upper_casing,
+)
+
+
+@dataclass(frozen=True)
+class ColumnQualifier:
+    parts: tuple[str, ...]
+
+    def __post_init__(self) -> None:
+        if not all(isinstance(x, str) for x in self.parts):
+            raise TypeError("ColumnQualifier.parts must be strings")
+
+    @property
+    def is_empty(self) -> bool:
+        return len(self.parts) == 0
+
+    def all_qualified_names(self, name: str) -> list[str]:
+        qualifier_parts = self.parts
+        qualifier_prefixes = [
+            ".".join(quote_name_without_upper_casing(x) for x in qualifier_parts[i:])
+            for i in range(len(qualifier_parts))
+        ]
+        return [f"{prefix}.{name}" for prefix in qualifier_prefixes]
+
+    def to_upper(self):
+        return ColumnQualifier(tuple(part.upper() for part in self.parts))
+
+    def matches(self, target: ColumnQualifier) -> bool:
+        if self.is_empty or target.is_empty:
+            return False
+        # If the column has fewer qualifiers than the target, it cannot match
+        if len(self.parts) < len(target.parts):
+            return False
+        return self.parts[-len(target.parts) :] == target.parts
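
Note: the new ColumnQualifier resolves qualified column references by suffix matching, and all_qualified_names expands a column name under every suffix of the qualifier chain. An illustrative check of that behavior (assumes the 1.6.0 wheel is installed so the internal module is importable; printed output is indicative):

from snowflake.snowpark_connect.column_qualifier import ColumnQualifier

col = ColumnQualifier(parts=("db", "schema", "tbl"))

# A reference qualified by any suffix of the chain matches the column.
assert col.matches(ColumnQualifier(("tbl",)))
assert col.matches(ColumnQualifier(("schema", "tbl")))
assert not col.matches(ColumnQualifier(("other", "tbl")))
assert not col.matches(ColumnQualifier(()))  # empty qualifiers never match

# Every qualifier suffix becomes a valid prefix for the quoted column name.
print(col.all_qualified_names('"ID"'))
# e.g. ['"db"."schema"."tbl"."ID"', '"schema"."tbl"."ID"', '"tbl"."ID"']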

snowflake/snowpark_connect/config.py
@@ -8,7 +8,7 @@ import re
 import sys
 from collections import defaultdict
 from copy import copy, deepcopy
-from typing import Any
+from typing import Any, Dict, Optional

 import jpype
 import pyspark.sql.connect.proto.base_pb2 as proto_base
@@ -17,11 +17,17 @@ from tzlocal import get_localzone_name
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     quote_name_without_upper_casing,
+    unquote_if_quoted,
 )
 from snowflake.snowpark.exceptions import SnowparkSQLException
 from snowflake.snowpark.types import TimestampTimeZone, TimestampType
+from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
 from snowflake.snowpark_connect.utils.concurrent import SynchronizedDict
-from snowflake.snowpark_connect.utils.context import
+from snowflake.snowpark_connect.utils.context import (
+    get_jpype_jclass_lock,
+    get_spark_session_id,
+)
 from snowflake.snowpark_connect.utils.external_udxf_cache import (
     clear_external_udxf_cache,
 )
@@ -139,9 +145,21 @@ class GlobalConfig:
         "spark.sql.parser.quotedRegexColumnNames": "false",
         # custom configs
         "snowpark.connect.version": ".".join(map(str, sas_version)),
+        "snowpark.connect.temporary.views.create_in_snowflake": "false",
         # Control whether repartition(n) on a DataFrame forces splitting into n files during writes
         # This matches spark behavior more closely, but introduces overhead.
         "snowflake.repartition.for.writes": "false",
+        "snowpark.connect.structured_types.fix": "true",
+        # Local relation optimization: Use List[Row] for small data, PyArrow for large data
+        # Enabled in production by default to improve performance for createDataFrame on small local relations.
+        # Disabled in tests by default unless explicitly enabled to stabilize flaky tests that are not applying row ordering.
+        # SNOW-2719980: Remove this flag after test fragility issues are resolved
+        "snowpark.connect.localRelation.optimizeSmallData": "true",
+        "spark.sql.execution.arrow.maxRecordsPerBatch": "10000",  # TODO: no-op
+        # USE_VECTORIZED_SCANNER will become the default in a future BCR; Snowflake recommends setting it to TRUE for new workloads.
+        # This significantly reduces latency for loading Parquet files by downloading only relevant columnar sections into memory.
+        "snowpark.connect.parquet.useVectorizedScanner": "true",
+        "spark.sql.legacy.dataset.nameNonStructGroupingKeyAsValue": "false",
     }

     boolean_config_list = [
@@ -150,11 +168,14 @@ class GlobalConfig:
         "spark.sql.repl.eagerEval.enabled",
         "spark.sql.crossJoin.enabled",
         "spark.sql.caseSensitive",
+        "snowpark.connect.localRelation.optimizeSmallData",
+        "snowpark.connect.parquet.useVectorizedScanner",
         "spark.sql.ansi.enabled",
         "spark.sql.legacy.allowHashOnMapType",
         "spark.Catalog.databaseFilterInformationSchema",
         "spark.sql.parser.quotedRegexColumnNames",
         "snowflake.repartition.for.writes",
+        "spark.sql.legacy.dataset.nameNonStructGroupingKeyAsValue",
     ]

     int_config_list = [
@@ -257,21 +278,34 @@ SESSION_CONFIG_KEY_WHITELIST = {
     "spark.sql.execution.pythonUDTF.arrow.enabled",
     "spark.sql.tvf.allowMultipleTableArguments.enabled",
     "snowpark.connect.sql.passthrough",
+    "snowpark.connect.cte.optimization_enabled",
     "snowpark.connect.iceberg.external_volume",
     "snowpark.connect.sql.identifiers.auto-uppercase",
+    "snowpark.connect.sql.partition.external_table_location",
     "snowpark.connect.udtf.compatibility_mode",
     "snowpark.connect.views.duplicate_column_names_handling_mode",
-    "
+    "snowpark.connect.temporary.views.create_in_snowflake",
+    "snowpark.connect.enable_snowflake_extension_behavior",
+    "spark.hadoop.fs.s3a.server-side-encryption.key",
+    "spark.hadoop.fs.s3a.assumed.role.arn",
+    "snowpark.connect.describe_cache_ttl_seconds",
+    "mapreduce.fileoutputcommitter.marksuccessfuljobs",
+    "spark.sql.parquet.enable.summary-metadata",
+    "parquet.enable.summary-metadata",
 }
-
+AZURE_ACCOUNT_KEY = re.compile(
     r"^fs\.azure\.sas\.[^\.]+\.[^\.]+\.blob\.core\.windows\.net$"
 )
+AZURE_SAS_KEY = re.compile(
+    r"^fs\.azure\.sas\.fixed\.token\.[^\.]+\.dfs\.core\.windows\.net$"
+)


 def valid_session_config_key(key: str):
     return (
         key in SESSION_CONFIG_KEY_WHITELIST  # AWS session keys
         or AZURE_SAS_KEY.match(key)  # Azure session keys
+        or AZURE_ACCOUNT_KEY.match(key)  # Azure account keys
     )

@@ -279,17 +313,23 @@ class SessionConfig:
     """This class contains the session configuration for the Spark Server."""

     default_session_config = {
-        "snowpark.connect.sql.identifiers.auto-uppercase": "all_except_columns",
         "snowpark.connect.sql.passthrough": "false",
+        "snowpark.connect.cte.optimization_enabled": "false",
         "snowpark.connect.udtf.compatibility_mode": "false",
         "snowpark.connect.views.duplicate_column_names_handling_mode": "rename",
         "spark.sql.execution.pythonUDTF.arrow.enabled": "false",
         "spark.sql.tvf.allowMultipleTableArguments.enabled": "true",
-        "enable_snowflake_extension_behavior": "false",
+        "snowpark.connect.enable_snowflake_extension_behavior": "false",
+        "snowpark.connect.describe_cache_ttl_seconds": "300",
+        "snowpark.connect.sql.partition.external_table_location": None,
+        "mapreduce.fileoutputcommitter.marksuccessfuljobs": "false",
+        "spark.sql.parquet.enable.summary-metadata": "false",
+        "parquet.enable.summary-metadata": "false",
     }

     def __init__(self) -> None:
         self.config = deepcopy(self.default_session_config)
+        self.table_metadata: Dict[str, Dict[str, Any]] = {}

     def __getitem__(self, item: str) -> str:
         return self.get(item)
@@ -344,9 +384,11 @@ def route_config_proto(
             if not pair.HasField("value"):
                 from pyspark.errors import IllegalArgumentException

-
+                exception = IllegalArgumentException(
                     f"Cannot set config '{pair.key}' to None"
                 )
+                attach_custom_error_code(exception, ErrorCodes.INVALID_CONFIG_VALUE)
+                raise exception

             set_config_param(
                 config.session_id, pair.key, pair.value, snowpark_session
@@ -429,7 +471,11 @@ def route_config_proto(
                 pair.value = str(global_config.is_modifiable(key)).lower()
             return res
         case _:
-
+            exception = SnowparkConnectNotImplementedError(
+                f"Unexpected request {config}"
+            )
+            attach_custom_error_code(exception, ErrorCodes.UNSUPPORTED_OPERATION)
+            raise exception


 def set_config_param(
@@ -469,19 +515,27 @@ def _verify_static_config_not_modified(key: str) -> None:
     # https://github.com/apache/spark/blob/v3.5.3/sql/core/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala#L161
     # Spark does not allow to modify static configurations at runtime.
     if global_config.is_static_config(key) and global_config.is_set(key):
-
+        exception = ValueError(f"Cannot modify the value of a static config: {key}")
+        attach_custom_error_code(exception, ErrorCodes.CONFIG_CHANGE_NOT_ALLOWED)
+        raise exception


 def _verify_is_valid_config_value(key: str, value: Any) -> None:
     if key in CONFIG_ALLOWED_VALUES and value not in CONFIG_ALLOWED_VALUES[key]:
-
+        exception = ValueError(
             f"Invalid value '{value}' for key '{key}'. Allowed values: {', '.join(CONFIG_ALLOWED_VALUES[key])}."
         )
+        attach_custom_error_code(exception, ErrorCodes.INVALID_CONFIG_VALUE)
+        raise exception


 def _verify_is_not_readonly_config(key):
     if key in global_config.readonly_config_list:
-
+        exception = ValueError(
+            f"Config with key {key} is read-only and cannot be modified."
+        )
+        attach_custom_error_code(exception, ErrorCodes.CONFIG_CHANGE_NOT_ALLOWED)
+        raise exception


 def set_jvm_timezone(timezone_id: str):
@@ -498,10 +552,13 @@ def set_jvm_timezone(timezone_id: str):
         RuntimeError: If JVM is not started
     """
     if not jpype.isJVMStarted():
-
+        exception = RuntimeError("JVM must be started before setting timezone")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception

     try:
-
+        with get_jpype_jclass_lock():
+            TimeZone = jpype.JClass("java.util.TimeZone")
         new_timezone = TimeZone.getTimeZone(timezone_id)
         TimeZone.setDefault(new_timezone)

@@ -513,7 +570,9 @@ def set_jvm_timezone(timezone_id: str):
 def reset_jvm_timezone_to_system_default():
     """Reset JVM timezone to the system's default timezone"""
     if not jpype.isJVMStarted():
-
+        exception = RuntimeError("JVM must be started first")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception

     try:
         TimeZone = jpype.JClass("java.util.TimeZone")
@@ -522,9 +581,13 @@ def reset_jvm_timezone_to_system_default():
             f"Reset JVM timezone to system default: {TimeZone.getDefault().getID()}"
         )
     except jpype.JException as e:
-
+        exception = RuntimeError(f"Java exception while resetting timezone: {e}")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception
     except Exception as e:
-
+        exception = RuntimeError(f"Unexpected error resetting JVM timezone: {e}")
+        attach_custom_error_code(exception, ErrorCodes.INTERNAL_ERROR)
+        raise exception


 def set_snowflake_parameters(
@@ -569,27 +632,98 @@
                 snowpark_session.use_database(db)
         case (prev, curr) if prev != curr:
             snowpark_session.use_schema(prev)
+        case "snowpark.connect.cte.optimization_enabled":
+            # Set CTE optimization on the snowpark session
+            cte_enabled = str_to_bool(value)
+            snowpark_session.cte_optimization_enabled = cte_enabled
+            logger.info(f"Updated snowpark session CTE optimization: {cte_enabled}")
+        case "snowpark.connect.structured_types.fix":
+            # TODO: SNOW-2367714 Remove this once the fix is automatically enabled in Snowpark
+            snowpark.context._enable_fix_2360274 = str_to_bool(value)
+            logger.info(f"Updated snowpark session structured types fix: {value}")
         case _:
             pass


 def get_boolean_session_config_param(name: str) -> bool:
-    session_config = sessions_config[
+    session_config = sessions_config[get_spark_session_id()]
     return str_to_bool(session_config[name])


+def get_string_session_config_param(name: str) -> str:
+    session_config = sessions_config[get_spark_session_id()]
+    return str(session_config[name])
+
+
+def get_cte_optimization_enabled() -> bool:
+    """Get the CTE optimization configuration setting."""
+    return get_boolean_session_config_param("snowpark.connect.cte.optimization_enabled")
+
+
+def get_success_file_generation_enabled() -> bool:
+    """Get the _SUCCESS file generation configuration setting."""
+    return get_boolean_session_config_param(
+        "mapreduce.fileoutputcommitter.marksuccessfuljobs"
+    )
+
+
+def get_parquet_metadata_generation_enabled() -> bool:
+    """
+    Get the Parquet metadata file generation configuration setting.
+    """
+    return get_boolean_session_config_param(
+        "spark.sql.parquet.enable.summary-metadata"
+    ) or get_boolean_session_config_param("parquet.enable.summary-metadata")
+
+
+def get_describe_cache_ttl_seconds() -> int:
+    """Get the describe query cache TTL from session config, with a default fallback."""
+    session_config: SessionConfig = sessions_config[get_spark_session_id()]
+    default_ttl: str = SessionConfig.default_session_config[
+        "snowpark.connect.describe_cache_ttl_seconds"
+    ]
+    try:
+        ttl_str = session_config.get(
+            "snowpark.connect.describe_cache_ttl_seconds", default_ttl
+        )
+        return int(ttl_str)
+    except ValueError:  # fallback to default ttl
+        return int(default_ttl)
+
+
+def should_create_temporary_view_in_snowflake() -> bool:
+    return str_to_bool(
+        global_config["snowpark.connect.temporary.views.create_in_snowflake"]
+    )
+
+
 def auto_uppercase_column_identifiers() -> bool:
-    session_config = sessions_config[
-
+    session_config = sessions_config[get_spark_session_id()]
+    auto_upper_case_config = session_config[
         "snowpark.connect.sql.identifiers.auto-uppercase"
-    ]
+    ]
+    if auto_upper_case_config:
+        return auto_upper_case_config.lower() in ("all", "only_columns")
+
+    return not global_config.spark_sql_caseSensitive


 def auto_uppercase_non_column_identifiers() -> bool:
-    session_config = sessions_config[
-
+    session_config = sessions_config[get_spark_session_id()]
+    auto_upper_case_config = session_config[
         "snowpark.connect.sql.identifiers.auto-uppercase"
-    ]
+    ]
+    if auto_upper_case_config:
+        return auto_upper_case_config.lower() in ("all", "all_except_columns")
+
+    return not global_config.spark_sql_caseSensitive
+
+
+def external_table_location() -> Optional[str]:
+    session_config = sessions_config[get_spark_session_id()]
+    return session_config.get(
+        "snowpark.connect.sql.partition.external_table_location", None
+    )


 def parse_imports(session: snowpark.Session, imports: str | None) -> None:
@@ -613,3 +747,83 @@ def get_timestamp_type():
         # shouldn't happen since `spark.sql.timestampType` is always defined, and `spark.conf.unset` sets it to default (TIMESTAMP_LTZ)
         timestamp_type = TimestampType(TimestampTimeZone.LTZ)
     return timestamp_type
+
+
+def record_table_metadata(
+    table_identifier: str,
+    table_type: str,
+    data_source: str,
+    supports_column_rename: bool = True,
+) -> None:
+    """
+    Record metadata about a table for Spark compatibility checks.
+
+    Args:
+        table_identifier: Full table identifier (catalog.database.table)
+        table_type: "v1" or "v2"
+        data_source: Source format (parquet, csv, iceberg, etc.)
+        supports_column_rename: Whether the table supports RENAME COLUMN
+    """
+    session_id = get_spark_session_id()
+    session_config = sessions_config[session_id]
+
+    # Normalize table identifier for consistent lookup
+    # Use the full catalog.database.table identifier to avoid conflicts
+    normalized_identifier = table_identifier.upper().strip('"')
+
+    session_config.table_metadata[normalized_identifier] = {
+        "table_type": table_type,
+        "data_source": data_source,
+        "supports_column_rename": supports_column_rename,
+    }
+
+
+def get_table_metadata(table_identifier: str) -> Dict[str, Any] | None:
+    """
+    Get stored metadata for a table.
+
+    Args:
+        table_identifier: Full table identifier (catalog.database.table)
+
+    Returns:
+        Table metadata dict or None if not found
+    """
+    session_id = get_spark_session_id()
+    session_config = sessions_config[session_id]
+
+    normalized_identifier = unquote_if_quoted(table_identifier).upper()
+
+    return session_config.table_metadata.get(normalized_identifier)
+
+
+def check_table_supports_operation(table_identifier: str, operation: str) -> bool:
+    """
+    Check if a table supports a given operation based on metadata and config.
+
+    Args:
+        table_identifier: Full table identifier (catalog.database.table)
+        operation: Operation to check (e.g., "rename_column")
+
+    Returns:
+        True if operation is supported, False if should be blocked
+    """
+    table_metadata = get_table_metadata(table_identifier)
+
+    if not table_metadata:
+        return True
+
+    session_id = get_spark_session_id()
+    session_config = sessions_config[session_id]
+    enable_extensions = str_to_bool(
+        session_config.get(
+            "snowpark.connect.enable_snowflake_extension_behavior", "false"
+        )
+    )
+
+    if enable_extensions:
+        return True
+
+    if operation == "rename_column":
+        return table_metadata.get("supports_column_rename", True)
+
+    return True
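
Note: several of the newly whitelisted keys are ordinary session configs that a Spark Connect client can set at runtime; route_config_proto validates them against SESSION_CONFIG_KEY_WHITELIST and set_config_param applies the Snowpark-side effects (for example, toggling CTE optimization on the Snowpark session). A hedged client-side sketch, assuming a snowpark-connect server is already reachable on the default Spark Connect port:

from pyspark.sql import SparkSession

# Assumption: a snowpark-connect server is listening at this endpoint.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# Enable Snowpark CTE optimization for this session.
spark.conf.set("snowpark.connect.cte.optimization_enabled", "true")

# Shorten the describe-query cache TTL (seconds); bad values fall back to the default of 300.
spark.conf.set("snowpark.connect.describe_cache_ttl_seconds", "60")

# Emit _SUCCESS markers and Parquet summary metadata on writes.
spark.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "true")
spark.conf.set("spark.sql.parquet.enable.summary-metadata", "true")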

snowflake/snowpark_connect/constants.py
@@ -16,3 +16,5 @@ MAP_IN_ARROW_EVAL_TYPE = 207 # eval_type for mapInArrow operations
 COLUMN_METADATA_COLLISION_KEY = "{expr_id}_{key}"

 DUPLICATE_KEY_FOUND_ERROR_TEMPLATE = "Duplicate key found: {key}. You can set spark.sql.mapKeyDedupPolicy to LAST_WIN to deduplicate map keys with last wins policy."
+
+SPARK_VERSION = "3.5.3"

snowflake/snowpark_connect/dataframe_container.py
@@ -4,14 +4,40 @@

 from __future__ import annotations

+from dataclasses import dataclass
 from typing import TYPE_CHECKING, Callable

 from snowflake import snowpark
 from snowflake.snowpark.types import StructField, StructType
-from snowflake.snowpark_connect.
+from snowflake.snowpark_connect.column_qualifier import ColumnQualifier

 if TYPE_CHECKING:
+    import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto
+
     from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
+    from snowflake.snowpark_connect.typed_column import TypedColumn
+
+
+@dataclass
+class AggregateMetadata:
+    """
+    Metadata about aggregation for resolving expressions in ORDER BY.
+
+    When a Sort operation follows an Aggregate operation, ORDER BY expressions
+    may reference:
+    1. Grouping columns from the GROUP BY clause
+    2. Aggregate result columns (aliases)
+    3. Expressions on pre-aggregation columns (e.g., year(date) where date existed before GROUP BY)
+
+    This metadata enables hybrid resolution similar to HAVING clause.
+    """
+
+    input_column_map: ColumnNameMap
+    input_dataframe: snowpark.DataFrame
+    grouping_expressions: list[expressions_proto.Expression]
+    aggregate_expressions: list[expressions_proto.Expression]
+    spark_columns: list[str]
+    raw_aggregations: list[tuple[str, TypedColumn]]


 class DataFrameContainer:
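
Note: the AggregateMetadata docstring above describes the ORDER BY shapes it has to resolve. The following Spark SQL sketch (hypothetical table and column names, assuming a SparkSession named spark) mixes all three reference kinds in one query:

result = spark.sql("""
    SELECT region, SUM(amount) AS total
    FROM orders
    GROUP BY region, order_date
    ORDER BY region,            -- 1. grouping column
             total DESC,        -- 2. aggregate result alias from the SELECT list
             year(order_date)   -- 3. expression on a pre-aggregation column
""")
result.show()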
@@ -30,6 +56,9 @@ class DataFrameContainer:
         alias: str | None = None,
         cached_schema_getter: Callable[[], StructType] | None = None,
         partition_hint: int | None = None,
+        can_be_cached: bool = True,
+        can_be_materialized: bool = True,
+        aggregate_metadata: AggregateMetadata | None = None,
     ) -> None:
         """
         Initialize a new DataFrameContainer.
@@ -41,12 +70,16 @@ class DataFrameContainer:
             alias: Optional alias for the DataFrame
             cached_schema_getter: Optional function to get cached schema
             partition_hint: Optional partition count from repartition() operations
+            aggregate_metadata: Optional metadata about aggregation for ORDER BY resolution
         """
         self._dataframe = dataframe
         self._column_map = self._create_default_column_map(column_map)
         self._table_name = table_name
         self._alias = alias
         self._partition_hint = partition_hint
+        self._can_be_cached = can_be_cached
+        self._can_be_materialized = can_be_materialized
+        self._aggregate_metadata = aggregate_metadata

         if cached_schema_getter is not None:
             self._apply_cached_schema_getter(cached_schema_getter)
@@ -59,13 +92,16 @@ class DataFrameContainer:
         snowpark_column_names: list[str],
         snowpark_column_types: list | None = None,
         column_metadata: dict | None = None,
-        column_qualifiers: list[
+        column_qualifiers: list[set[ColumnQualifier]] | None = None,
         parent_column_name_map: ColumnNameMap | None = None,
-        hidden_columns: set[HiddenColumn] | None = None,
         table_name: str | None = None,
         alias: str | None = None,
         cached_schema_getter: Callable[[], StructType] | None = None,
         partition_hint: int | None = None,
+        equivalent_snowpark_names: list[set[str]] | None = None,
+        column_is_hidden: list[bool] | None = None,
+        can_be_cached: bool = True,
+        aggregate_metadata: AggregateMetadata | None = None,
     ) -> DataFrameContainer:
         """
         Create a new container with complete column mapping configuration.
|
|
|
78
114
|
column_metadata: Optional metadata dictionary
|
|
79
115
|
column_qualifiers: Optional column qualifiers
|
|
80
116
|
parent_column_name_map: Optional parent column name map
|
|
81
|
-
hidden_columns: Optional list of hidden column names
|
|
82
117
|
table_name: Optional table name
|
|
83
118
|
alias: Optional alias
|
|
84
119
|
cached_schema_getter: Optional function to get cached schema
|
|
85
120
|
partition_hint: Optional partition count from repartition() operations
|
|
121
|
+
equivalent_snowpark_names: list of sets with old snowpark names that can be resolved with an existing column
|
|
122
|
+
column_is_hidden: Optional list of booleans indicating whether each column is hidden
|
|
123
|
+
can_be_cached: Optional boolean indicating if the dataframe can be cached
|
|
124
|
+
aggregate_metadata: Optional metadata about aggregation for ORDER BY resolution
|
|
86
125
|
|
|
87
126
|
Returns:
|
|
88
127
|
A new DataFrameContainer instance
|
|
@@ -101,7 +140,8 @@ class DataFrameContainer:
             column_metadata,
             column_qualifiers,
             parent_column_name_map,
-
+            equivalent_snowpark_names,
+            column_is_hidden,
         )

         # Determine the schema getter to use
@@ -129,8 +169,25 @@ class DataFrameContainer:
             alias=alias,
             cached_schema_getter=final_schema_getter,
             partition_hint=partition_hint,
+            can_be_cached=can_be_cached,
+            aggregate_metadata=aggregate_metadata,
         )

+    @property
+    def can_be_cached(self) -> bool:
+        """Indicate if the DataFrame can be cached in df_cache"""
+        return self._can_be_cached
+
+    @property
+    def can_be_materialized(self) -> bool:
+        """Indicate if the DataFrame can be materialized in df_cache"""
+        return self._can_be_materialized
+
+    def without_materialization(self):
+        """Prevent the DataFrame from being materialized in df_cache"""
+        self._can_be_materialized = False
+        return self
+
     @property
     def dataframe(self) -> snowpark.DataFrame:
         """Get the underlying Snowpark DataFrame."""
@@ -224,9 +281,10 @@ class DataFrameContainer:
         spark_column_names: list[str],
         snowpark_column_names: list[str],
         column_metadata: dict | None = None,
-        column_qualifiers: list[
+        column_qualifiers: list[set[ColumnQualifier]] | None = None,
         parent_column_name_map: ColumnNameMap | None = None,
-
+        equivalent_snowpark_names: list[set[str]] | None = None,
+        column_is_hidden: list[bool] | None = None,
     ) -> ColumnNameMap:
         """Create a ColumnNameMap with the provided configuration."""
         from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
@@ -237,7 +295,8 @@ class DataFrameContainer:
             column_metadata=column_metadata,
             column_qualifiers=column_qualifiers,
             parent_column_name_map=parent_column_name_map,
-
+            equivalent_snowpark_names=equivalent_snowpark_names,
+            column_is_hidden=column_is_hidden,
         )

     @staticmethod
@@ -262,3 +321,38 @@ class DataFrameContainer:
             )
         ]
     )
+
+    def without_hidden_columns(self) -> DataFrameContainer:
+        from snowflake.snowpark_connect.column_name_handler import ColumnNameMap
+
+        if not any(c.is_hidden for c in self._column_map.columns):
+            return self
+
+        hidden_column_names = [
+            c.snowpark_name for c in self._column_map.columns if c.is_hidden
+        ]
+        visible_columns = [c for c in self._column_map.columns if not c.is_hidden]
+
+        filtered_df = self._dataframe.drop(hidden_column_names)
+        filtered_column_map = ColumnNameMap(
+            spark_column_names=[c.spark_name for c in visible_columns],
+            snowpark_column_names=[c.snowpark_name for c in visible_columns],
+            column_metadata=self._column_map.column_metadata,
+            column_qualifiers=[c.qualifiers for c in visible_columns],
+            parent_column_name_map=self._column_map._parent_column_name_map,
+        )
+
+        return DataFrameContainer(
+            dataframe=filtered_df,
+            column_map=filtered_column_map,
+            table_name=self._table_name,
+            alias=self._alias,
+            cached_schema_getter=lambda: StructType(
+                [
+                    field
+                    for field in self._dataframe.schema.fields
+                    if field.name not in hidden_column_names
+                ]
+            ),
+            partition_hint=self._partition_hint,
+        )