snowpark-connect 0.27.0__py3-none-any.whl → 1.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/__init__.py +1 -0
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client/__init__.py +15 -0
- snowflake/snowpark_connect/client/error_utils.py +30 -0
- snowflake/snowpark_connect/client/exceptions.py +36 -0
- snowflake/snowpark_connect/client/query_results.py +90 -0
- snowflake/snowpark_connect/client/server.py +717 -0
- snowflake/snowpark_connect/client/utils/__init__.py +10 -0
- snowflake/snowpark_connect/client/utils/session.py +85 -0
- snowflake/snowpark_connect/column_name_handler.py +404 -243
- snowflake/snowpark_connect/column_qualifier.py +43 -0
- snowflake/snowpark_connect/config.py +309 -26
- snowflake/snowpark_connect/constants.py +2 -0
- snowflake/snowpark_connect/dataframe_container.py +102 -8
- snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
- snowflake/snowpark_connect/error/error_codes.py +50 -0
- snowflake/snowpark_connect/error/error_utils.py +172 -23
- snowflake/snowpark_connect/error/exceptions.py +13 -4
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
- snowflake/snowpark_connect/execute_plan/utils.py +5 -1
- snowflake/snowpark_connect/expression/error_utils.py +28 -0
- snowflake/snowpark_connect/expression/function_defaults.py +9 -2
- snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
- snowflake/snowpark_connect/expression/integral_types_support.py +219 -0
- snowflake/snowpark_connect/expression/literal.py +37 -13
- snowflake/snowpark_connect/expression/map_cast.py +224 -15
- snowflake/snowpark_connect/expression/map_expression.py +80 -27
- snowflake/snowpark_connect/expression/map_extension.py +322 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
- snowflake/snowpark_connect/expression/map_udf.py +86 -20
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
- snowflake/snowpark_connect/expression/map_unresolved_function.py +2964 -829
- snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
- snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
- snowflake/snowpark_connect/expression/map_window_function.py +18 -3
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.13-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.13-0.2.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.13.16.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.13-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-connect-client-jvm_2.12-3.5.6.jar → spark-connect-client-jvm_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/jars/{spark-sql_2.12-3.5.6.jar → spark-sql_2.13-3.5.6.jar} +0 -0
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
- snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
- snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
- snowflake/snowpark_connect/relation/io_utils.py +110 -10
- snowflake/snowpark_connect/relation/map_aggregate.py +239 -256
- snowflake/snowpark_connect/relation/map_catalog.py +5 -1
- snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
- snowflake/snowpark_connect/relation/map_extension.py +263 -29
- snowflake/snowpark_connect/relation/map_join.py +683 -442
- snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
- snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
- snowflake/snowpark_connect/relation/map_relation.py +48 -19
- snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
- snowflake/snowpark_connect/relation/map_show_string.py +13 -6
- snowflake/snowpark_connect/relation/map_sql.py +1233 -222
- snowflake/snowpark_connect/relation/map_stats.py +48 -9
- snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
- snowflake/snowpark_connect/relation/map_udtf.py +14 -4
- snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
- snowflake/snowpark_connect/relation/read/map_read.py +134 -43
- snowflake/snowpark_connect/relation/read/map_read_csv.py +326 -47
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +21 -6
- snowflake/snowpark_connect/relation/read/map_read_json.py +324 -86
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +146 -28
- snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
- snowflake/snowpark_connect/relation/read/map_read_socket.py +15 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +86 -6
- snowflake/snowpark_connect/relation/read/map_read_text.py +22 -4
- snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +42 -3
- snowflake/snowpark_connect/relation/read/utils.py +50 -5
- snowflake/snowpark_connect/relation/stage_locator.py +91 -55
- snowflake/snowpark_connect/relation/utils.py +128 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
- snowflake/snowpark_connect/relation/write/map_write.py +929 -319
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
- snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
- snowflake/snowpark_connect/resources_initializer.py +171 -48
- snowflake/snowpark_connect/server.py +528 -473
- snowflake/snowpark_connect/server_common/__init__.py +503 -0
- snowflake/snowpark_connect/snowflake_session.py +65 -0
- snowflake/snowpark_connect/start_server.py +53 -5
- snowflake/snowpark_connect/type_mapping.py +349 -27
- snowflake/snowpark_connect/type_support.py +130 -0
- snowflake/snowpark_connect/typed_column.py +9 -7
- snowflake/snowpark_connect/utils/artifacts.py +9 -8
- snowflake/snowpark_connect/utils/cache.py +49 -27
- snowflake/snowpark_connect/utils/concurrent.py +36 -1
- snowflake/snowpark_connect/utils/context.py +195 -37
- snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
- snowflake/snowpark_connect/utils/env_utils.py +5 -1
- snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
- snowflake/snowpark_connect/utils/identifiers.py +137 -3
- snowflake/snowpark_connect/utils/io_utils.py +57 -1
- snowflake/snowpark_connect/utils/java_stored_procedure.py +151 -0
- snowflake/snowpark_connect/utils/java_udaf_utils.py +321 -0
- snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
- snowflake/snowpark_connect/utils/jvm_udf_utils.py +281 -0
- snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
- snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
- snowflake/snowpark_connect/utils/profiling.py +25 -8
- snowflake/snowpark_connect/utils/scala_udf_utils.py +185 -340
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +64 -28
- snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
- snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
- snowflake/snowpark_connect/utils/telemetry.py +192 -40
- snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
- snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
- snowflake/snowpark_connect/utils/udf_cache.py +117 -41
- snowflake/snowpark_connect/utils/udf_helper.py +39 -37
- snowflake/snowpark_connect/utils/udf_utils.py +133 -14
- snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
- snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
- snowflake/snowpark_connect/utils/udxf_import_utils.py +9 -2
- snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
- snowflake/snowpark_connect/version.py +1 -1
- snowflake/snowpark_decoder/dp_session.py +6 -2
- snowflake/snowpark_decoder/spark_decoder.py +12 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-submit +14 -4
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/METADATA +16 -7
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/RECORD +139 -168
- snowflake/snowpark_connect/hidden_column.py +0 -39
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.27.0.data → snowpark_connect-1.7.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.7.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/typed_column.py

@@ -8,6 +8,7 @@ from functools import cached_property
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
 from snowflake.snowpark.column import Column
+from snowflake.snowpark_connect.column_qualifier import ColumnQualifier

 _EMPTY_COLUMN = Column("")

@@ -19,7 +20,7 @@ class TypedColumn:
         type_resolver: Callable[[], list[snowpark.types.DataType] | None],
     ) -> None:
         self.col = col
-        self._regex_matched_columns: list
+        self._regex_matched_columns: list = list()
         self._type_resolver = type_resolver
         self._catalog_database_info: dict[str, str] = {}

@@ -44,11 +45,11 @@ class TypedColumn:
     def alias(self, alias_name: str):
         return TypedColumn(self.col.alias(alias_name), self._type_resolver)

-    def set_qualifiers(self, qualifiers:
+    def set_qualifiers(self, qualifiers: set[ColumnQualifier]) -> None:
         self.qualifiers = qualifiers

-    def get_qualifiers(self) ->
-        return getattr(self, "qualifiers",
+    def get_qualifiers(self) -> set[ColumnQualifier]:
+        return getattr(self, "qualifiers", set())

     def set_catalog_database_info(self, catalog_database_info: dict[str, str]) -> None:
         self._catalog_database_info = catalog_database_info
@@ -63,12 +64,13 @@ class TypedColumn:
     def get_database(self) -> str | None:
         return self._catalog_database_info.get("database")

-    def set_multi_col_qualifiers(self, qualifiers: list[
+    def set_multi_col_qualifiers(self, qualifiers: list[set[ColumnQualifier]]) -> None:
         self.multi_col_qualifiers = qualifiers

-    def get_multi_col_qualifiers(self, num_columns) -> list[
+    def get_multi_col_qualifiers(self, num_columns) -> list[set[ColumnQualifier]]:
         if not hasattr(self, "multi_col_qualifiers"):
-
+
+            return [set() for i in range(num_columns)]
         assert (
             len(self.multi_col_qualifiers) == num_columns
         ), f"Expected {num_columns} multi-column qualifiers, got {len(self.multi_col_qualifiers)}"
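The qualifier accessors above now traffic in sets of ColumnQualifier objects instead of loosely typed values. A minimal illustrative round-trip, assuming ColumnQualifier can be built from a simple qualifier name (its real constructor is not shown in this diff):

    from snowflake.snowpark.column import Column

    tc = TypedColumn(Column("C1"), lambda: None)     # type_resolver that resolves to None
    quals = {ColumnQualifier("orders")}              # constructor signature is an assumption
    tc.set_qualifiers(quals)
    assert tc.get_qualifiers() == quals
    # A column that never had qualifiers set falls back to an empty set.
    assert TypedColumn(Column("C2"), lambda: None).get_qualifiers() == set()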
snowflake/snowpark_connect/utils/artifacts.py

@@ -50,21 +50,22 @@ def write_temporary_artifact(
     return filepath


-def write_class_files_to_stage(
-
-) -> None:
+def write_class_files_to_stage(session: snowpark.Session, files: dict[str, str]) -> str:
+    jar_name = f'{hashlib.sha256(str(files).encode("utf-8")).hexdigest()[:10]}.jar'
     if os.name != "nt":
         filepath = f"/tmp/sas-{session.session_id}"
-
+        jar_path = f"{filepath}/{jar_name}"
     else:
         filepath = f"{tempfile.gettempdir()}\\sas-{session.session_id}"
-
-    with zipfile.ZipFile(
+        jar_path = f"{filepath}\\{jar_name}"
+    with zipfile.ZipFile(jar_path, "w", zipfile.ZIP_DEFLATED) as jar:
         for name, path in files.items():
             jar.write(path, name)
+    stage_path = f"{session.get_session_stage()}/class_jars/"
     session.file.put(
-
-
+        jar_path,
+        stage_path,
         auto_compress=False,
         overwrite=True,
     )
+    return stage_path + jar_name
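With this rework, write_class_files_to_stage packages the supplied class files into a content-hash-named jar, uploads it to the session stage under class_jars/, and returns the jar's stage path instead of None. A hedged usage sketch (the file paths and the session object are placeholders):

    # `session` is an existing snowpark.Session; paths are placeholders.
    class_files = {
        "com/example/MyUdf.class": "/tmp/build/com/example/MyUdf.class",
        "com/example/MyUdf$Helper.class": "/tmp/build/com/example/MyUdf$Helper.class",
    }
    jar_stage_path = write_class_files_to_stage(session, class_files)
    # e.g. "<session stage>/class_jars/ab12cd34ef.jar", ready to be referenced as a UDF import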
snowflake/snowpark_connect/utils/cache.py

@@ -26,7 +26,6 @@ def df_cache_map_get(key: Tuple[str, any]) -> DataFrameContainer | None:
 def df_cache_map_put_if_absent(
     key: Tuple[str, any],
     compute_fn: Callable[[], DataFrameContainer | pandas.DataFrame],
-    materialize: bool,
 ) -> DataFrameContainer | pandas.DataFrame:
     """
     Put a DataFrame container into the cache map if the key is absent. Optionally, as side effect, materialize
@@ -35,7 +34,6 @@ def df_cache_map_put_if_absent(
     Args:
         key (Tuple[str, int]): The key to insert into the cache map (session_id, plan_id).
         compute_fn (Callable[[], DataFrameContainer | pandas.DataFrame]): A function to compute the DataFrame container if the key is absent.
-        materialize (bool): Whether to materialize the DataFrame.

     Returns:
         DataFrameContainer | pandas.DataFrame: The cached or newly computed DataFrame container.
@@ -45,7 +43,7 @@ def df_cache_map_put_if_absent(
         container: DataFrameContainer,
     ) -> DataFrameContainer:

-        if
+        if container.can_be_materialized:
             df = container.dataframe
             cached_result = df.cache_result()
             return DataFrameContainer(
@@ -58,30 +56,54 @@ def df_cache_map_put_if_absent(
         return container

     with _cache_map_lock:
-        if key
[content of removed lines 62-84 not preserved in the source view]
+        if key in df_cache_map:
+            return df_cache_map[key]
+
+    # the compute_fn is not guaranteed to be called only once, but it's acceptable based upon the following analysis:
+    # there are in total 5 occurrences of passing compute_fn callback falling into two categories:
+    # 2 occurrences as lambda that needs to be computed:
+    #   1) server::AnalyzePlan case "persist"
+    #   2) server::AddArtifacts case "read"
+    # 3 occurrences as lambda that simply returns pre-computed dataframe without any computation:
+    #   1) map_relation case "local_relation"
+    #   2) map_relation case "sample"
+    #   3) map_read case "data_source"
+    # based upon the analysis of the code, the chance of concurrently calling compute_fn for the same key is very low and if it happens
+    # repeating the computation will not affect the result.
+    # This is a trade-off between implementation simplicity and fine-grained locking.
+    result = compute_fn()
+
+    if isinstance(result, DataFrameContainer) and not result.can_be_cached:
+        return result
+
+    # check cache again, since recursive call in compute_fn could've already cached the result.
+    # we want return it, instead of saving it again. This is important if materialize = True
+    # because materialization is expensive operation that we don't want to do twice.
+    with _cache_map_lock:
+        if key in df_cache_map:
+            return df_cache_map[key]
+
+    # only cache DataFrameContainer, but not pandas result.
+    # Pandas result is only returned when df.show() is called, where we convert
+    # a dataframe to a string representation.
+    # We don't expect map_relation would return pandas df here because that would
+    # be equivalent to calling df.show().cache(), which is not allowed.
+    if isinstance(result, DataFrameContainer):
+        # The _object_to_cache function is not guaranteed to be called only once.
+        # In rare multithreading cases, this may result in duplicate temporary table
+        # creation because df.cache_result() materializes the DataFrame into a temp table each time.
+        # This is acceptable because correctness is not affected, the likelihood is very low, and
+        # it simplifies the implementation by avoiding fine-grained locking.
+        cached_result = _object_to_cache(result)
+        with _cache_map_lock:
+            df_cache_map[key] = cached_result
+            return df_cache_map[key]
+    else:
+        # This is not expected, but we will just log a warning
+        logger.warning(
+            "Unexpected pandas dataframe returned for caching. Ignoring the cache call."
+        )
+        return result


 def df_cache_map_pop(key: Tuple[str, any]) -> None:
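The rewritten body follows a check, compute-outside-the-lock, re-check pattern. A standalone sketch of that pattern (not the package's code; the names are illustrative):

    import threading
    from typing import Callable, Dict, Tuple

    _lock = threading.Lock()
    _cache: Dict[Tuple[str, int], object] = {}

    def put_if_absent(key: Tuple[str, int], compute_fn: Callable[[], object]) -> object:
        with _lock:                      # fast path: key already cached
            if key in _cache:
                return _cache[key]
        result = compute_fn()            # computed outside the lock; may run more than once
        with _lock:                      # re-check before publishing the result
            if key in _cache:
                return _cache[key]
            _cache[key] = result
            return result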
snowflake/snowpark_connect/utils/concurrent.py

@@ -5,10 +5,11 @@
 import threading
 from collections.abc import Mapping
 from copy import copy
-from typing import TypeVar
+from typing import Callable, TypeVar

 K = TypeVar("K")
 V = TypeVar("V")
+T = TypeVar("T")


 class SynchronizedDict(Mapping[K, V]):
@@ -52,6 +53,10 @@ class SynchronizedDict(Mapping[K, V]):
         with self._lock.writer():
             self._dict[key] = value

+    def __delitem__(self, key: K) -> None:
+        with self._lock.writer():
+            del self._dict[key]
+
     def __contains__(self, key: K) -> bool:
         with self._lock.reader():
             return key in self._dict
@@ -69,6 +74,36 @@ class SynchronizedDict(Mapping[K, V]):
         self._dict.clear()


+class SynchronizedList:
+    def __init__(self, in_list: list[T] | None = None) -> None:
+        self._lock = ReadWriteLock()
+        self._list = in_list if in_list is not None else []
+
+    def append(self, item: T) -> None:
+        with self._lock.writer():
+            self._list.append(item)
+
+    def clear(self) -> None:
+        with self._lock.writer():
+            self._list.clear()
+
+    def copy(self) -> list[T]:
+        with self._lock.reader():
+            return self._list.copy()
+
+    def filter(self, predicate: Callable[[T], bool]) -> None:
+        with self._lock.writer():
+            self._list = [item for item in self._list if predicate(item)]
+
+    def __len__(self) -> int:
+        with self._lock.reader():
+            return len(self._list)
+
+    def __iter__(self):
+        with self._lock.reader():
+            return iter(self._list.copy())
+
+
 class ReadWriteLock:
     class _Reader:
         def __init__(self, lock) -> None:
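A short sketch of how the new SynchronizedList could be used; every operation takes the internal ReadWriteLock, and iteration works over a snapshot copy:

    listeners = SynchronizedList()
    listeners.append("listener_a")
    listeners.append("listener_b")
    listeners.filter(lambda name: name != "listener_a")   # keep only items matching the predicate
    assert len(listeners) == 1
    for name in listeners:        # iterates over a copy taken under the reader lock
        print(name)               # -> listener_b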
snowflake/snowpark_connect/utils/context.py

@@ -2,37 +2,46 @@
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #

+import os
 import re
+import threading
 from contextlib import contextmanager
 from contextvars import ContextVar
-from typing import Mapping, Optional
+from typing import Iterator, Mapping, Optional

 import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto

 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.type_support import (
+    set_integral_types_for_client_default,
+)
 from snowflake.snowpark_connect.typed_column import TypedColumn

 # TODO: remove session id from context when we host SAS in Snowflake server

-
+_spark_session_id = ContextVar[str]("_spark_session_id")
 _plan_id_map = ContextVar[Mapping[int, DataFrameContainer]]("_plan_id_map")
 _alias_map = ContextVar[Mapping[str, DataFrameContainer | None]]("_alias_map")
 _spark_version = ContextVar[str]("_spark_version")
 _is_aggregate_function = ContextVar(
     "_is_aggregate_function", default=("default", False)
 )
+_grouping_by_scala_udf_key = ContextVar[bool](
+    "_grouping_by_scala_udf_key", default=False
+)
 _is_evaluating_sql = ContextVar[bool]("_is_evaluating_sql", default=False)
 _is_evaluating_join_condition = ContextVar(
     "_is_evaluating_join_condition", default=("default", False, [], [])
 )
+_is_processing_order_by = ContextVar[bool]("_is_processing_order_by", default=False)
+_is_processing_aliased_relation = ContextVar[bool](
+    "_is_processing_aliased_relation", default=False
+)

 _sql_aggregate_function_count = ContextVar[int](
     "_contains_aggregate_function", default=0
 )

-# Context for parsing map_partitions
-_map_partitions_stack = ContextVar[int]("_map_partitions_stack", default=0)
-
 # We have to generate our own plan IDs that are different from Spark's.
 # Spark plan IDs start at 0, so pick a "big enough" number to avoid overlaps.
 _STARTING_SQL_PLAN_ID = 0x80000000
@@ -55,10 +64,26 @@ _resolving_lambda_fun = ContextVar[bool]("_resolving_lambdas", default=False)
 _current_lambda_params = ContextVar[list[str]]("_current_lambda_params", default=[])

 _is_window_enabled = ContextVar[bool]("_is_window_enabled", default=False)
-_is_in_pivot = ContextVar[bool]("_is_in_pivot", default=False)
 _is_in_udtf_context = ContextVar[bool]("_is_in_udtf_context", default=False)
 _accessing_temp_object = ContextVar[bool]("_accessing_temp_object", default=False)

+# Thread-safe lock for JPype JClass creation to prevent access violations
+_jpype_jclass_lock = threading.Lock()
+
+
+@contextmanager
+def get_jpype_jclass_lock() -> Iterator[None]:
+    """
+    Context manager that acquires the JPype JClass lock on Windows platforms.
+    On non-Windows (os.name != 'nt'), it yields without acquiring the lock.
+    """
+    if os.name == "nt":
+        with _jpype_jclass_lock:
+            yield
+    else:
+        yield
+
+
 # Lateral Column Alias helpers
 # We keep a thread-local mapping from alias name -> TypedColumn that is
 # populated incrementally while the projection list is being processed.
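A sketch of the intended call pattern for the new helper; the class being created is a placeholder, and the jpype import and a started JVM are assumed to be available in the caller:

    import jpype

    with get_jpype_jclass_lock():                        # only actually locks when os.name == "nt"
        string_cls = jpype.JClass("java.lang.String")    # placeholder class; JClass creation is serialized on Windows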
@@ -70,12 +95,60 @@ _lca_alias_map: ContextVar[dict[str, TypedColumn]] = ContextVar(
     default={},
 )

+_view_process_context = ContextVar("_view_process_context", default=[])
+
+
+@contextmanager
+def push_processed_view(name: str):
+    _view_process_context.set(_view_process_context.get() + [name])
+    yield
+    _view_process_context.set(_view_process_context.get()[:-1])
+
+
+def get_processed_views() -> list[str]:
+    return _view_process_context.get()
+
+
+def register_processed_view(name: str) -> None:
+    context = _view_process_context.get()
+    context.append(name)
+    _view_process_context.set(context)
+
+
+_request_external_tables = ContextVar[list[str]]("_used_external_tables", default=[])
+
+
+def register_request_external_table(name: str) -> None:
+    _request_external_tables.set(_request_external_tables.get() + [name])
+
+
+def get_request_external_tables() -> list[str]:
+    return _request_external_tables.get()
+
+
+def clean_request_external_tables() -> None:
+    _request_external_tables.set([])
+
+
 # Context variable to track current grouping columns for grouping_id() function
 _current_grouping_columns: ContextVar[list[str]] = ContextVar(
     "_current_grouping_columns",
     default=[],
 )

+# Context variable to capture all original_attr_name values during subquery resolution
+# This is a stack of lists to handle nested subqueries correctly
+_captured_attribute_names: ContextVar[list[list[str]]] = ContextVar(
+    "_captured_attribute_names",
+    default=[],
+)
+
+# Context variable to track if we're resolving a subquery expression
+_is_resolving_subquery_exp: ContextVar[bool] = ContextVar(
+    "_is_resolving_subquery_exp",
+    default=False,
+)
+

 def clear_lca_alias_map() -> None:
     _lca_alias_map.set({})
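A small sketch of how the new view-processing and external-table helpers compose, assuming a fresh request context (the view and table names are placeholders):

    with push_processed_view("sales_view"):              # view name visible while it is processed
        assert get_processed_views()[-1] == "sales_view"
    assert "sales_view" not in get_processed_views()     # popped on exit

    register_request_external_table("db.schema.events")  # placeholder table name
    assert get_request_external_tables() == ["db.schema.events"]
    clean_request_external_tables()                       # reset for the next request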
@@ -112,14 +185,56 @@ def get_current_grouping_columns() -> list[str]:
     return _current_grouping_columns.get()


-def
-    """
-
+def capture_attribute_name(attr_name: str) -> None:
+    """Capture an original_attr_name during expression resolution."""
+    stack = _captured_attribute_names.get()
+    if stack:
+        stack[-1].append(attr_name)
+    _captured_attribute_names.set(stack)
+
+
+def get_captured_attribute_names() -> list[str]:
+    """Get the list of captured attribute names from the current top of the stack."""
+    stack = _captured_attribute_names.get()
+    return stack[-1] if stack else []
+
+
+def is_resolving_subquery_exp() -> bool:
+    """
+    Returns True if currently resolving a subquery expression.
+    """
+    return _is_resolving_subquery_exp.get()
+
+
+@contextmanager
+def resolving_subquery_exp():
+    """
+    Context manager that captures all original_attr_name values during subquery expression resolution.
+    Sets a flag to indicate we're in a subquery context and pushes a new list onto the stack.
+    When the context exits, pops the list from the stack.
+    """
+    stack = _captured_attribute_names.get()
+    stack.append([])
+    _captured_attribute_names.set(stack)
+    token = _is_resolving_subquery_exp.set(True)
+    try:
+        yield
+    finally:
+        stack = _captured_attribute_names.get()
+        if stack:
+            stack.pop()
+        _captured_attribute_names.set(stack)
+        _is_resolving_subquery_exp.reset(token)
+
+
+def set_spark_session_id(value: str) -> None:
+    """Set the Spark session ID for the current context"""
+    _spark_session_id.set(value)


-def
-    """Get the session ID for the current context."""
-    return
+def get_spark_session_id() -> str:
+    """Get the Spark session ID for the current context."""
+    return _spark_session_id.get(None)


 def set_plan_id_map(plan_id: int, container: DataFrameContainer) -> None:
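A sketch of the capture stack added above, showing how attribute names are collected only while a subquery expression is being resolved (the attribute names are placeholders):

    assert not is_resolving_subquery_exp()
    with resolving_subquery_exp():             # pushes a fresh capture list and sets the flag
        capture_attribute_name("o_orderkey")
        capture_attribute_name("o_custkey")
        assert get_captured_attribute_names() == ["o_orderkey", "o_custkey"]
    assert not is_resolving_subquery_exp()     # flag restored and capture list popped on exit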
@@ -155,6 +270,11 @@ def set_spark_version(client_type: str) -> None:
     version = match.group("spark_version") if match else ""
     _spark_version.set(version)

+    # enable integral types (only if config is "client_default")
+
+    is_python_client = "_SPARK_CONNECT_PYTHON" in client_type
+    set_integral_types_for_client_default(is_python_client)
+

 def get_is_aggregate_function() -> tuple[str, bool]:
     """
@@ -190,6 +310,66 @@ def push_evaluating_sql_scope():
         _is_evaluating_sql.set(prev)


+def get_grouping_by_scala_udf_key() -> bool:
+    """
+    Gets the value of _grouping_by_scala_udf_key for the current context, defaults to False.
+    """
+    return _grouping_by_scala_udf_key.get()
+
+
+@contextmanager
+def grouping_by_scala_udf_key(value: bool):
+    """
+    Context manager that conditionally sets a flag indicating grouping by scala_udf key.
+    Only activates the flag when value=True, otherwise leaves the current context unchanged
+    """
+    prev = _grouping_by_scala_udf_key.get()
+    try:
+        if value:
+            _grouping_by_scala_udf_key.set(True)
+        yield
+    finally:
+        _grouping_by_scala_udf_key.set(prev)
+
+
+def get_is_processing_order_by() -> bool:
+    """
+    Gets the value of _is_processing_order_by for the current context, defaults to False.
+    """
+    return _is_processing_order_by.get()
+
+
+@contextmanager
+def push_processing_order_by_scope():
+    """
+    Context manager that sets a flag indicating if ORDER BY expressions are being evaluated.
+    This enables optimizations like reusing already-computed UDF columns.
+    """
+    prev = _is_processing_order_by.get()
+    try:
+        _is_processing_order_by.set(True)
+        yield
+    finally:
+        _is_processing_order_by.set(prev)
+
+
+def get_is_processing_aliased_relation() -> bool:
+    return _is_processing_aliased_relation.get()
+
+
+@contextmanager
+def push_processing_aliased_relation_scope(process_aliased_relation: bool):
+    """
+    Context manager that sets a flag indicating if an aliased relation is being resolved.
+    """
+    prev = _is_processing_aliased_relation.get()
+    try:
+        _is_processing_aliased_relation.set(process_aliased_relation)
+        yield
+    finally:
+        _is_processing_aliased_relation.set(prev)
+
+
 def get_is_evaluating_join_condition() -> tuple[str, bool, list, list]:
     """
     Gets the value of _is_evaluating_join_condition for the current context, defaults to False.
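A sketch of the scope helpers defined above; each flag is raised only inside its context manager and restored afterwards:

    assert not get_is_processing_order_by()
    with push_processing_order_by_scope():
        assert get_is_processing_order_by()     # ORDER BY handling sees the flag
    assert not get_is_processing_order_by()

    with grouping_by_scala_udf_key(False):      # value=False leaves the context unchanged
        assert not get_grouping_by_scala_udf_key()
    with grouping_by_scala_udf_key(True):
        assert get_grouping_by_scala_udf_key()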
@@ -210,16 +390,6 @@ def push_evaluating_join_condition(join_type, left_keys, right_keys):
         _is_evaluating_join_condition.set(prev)


-@contextmanager
-def push_map_partitions():
-    _map_partitions_stack.set(_map_partitions_stack.get() + 1)
-    yield
-
-
-def map_partitions_depth() -> int:
-    return _map_partitions_stack.get()
-
-
 @contextmanager
 def push_sql_scope():
     """
@@ -383,13 +553,14 @@ def get_outer_dataframes() -> list[DataFrameContainer]:


 def clear_context_data() -> None:
-
+    _spark_session_id.set(None)
     _plan_id_map.set({})
     _alias_map.set({})

+    _request_external_tables.set([])
+    _view_process_context.set([])
     _next_sql_plan_id.set(_STARTING_SQL_PLAN_ID)
     _sql_plan_name_map.set({})
-    _map_partitions_stack.set(0)
     _sql_aggregate_function_count.set(0)
     _sql_named_args.set({})
     _sql_pos_args.set({})
@@ -419,19 +590,6 @@ def is_window_enabled():
     return _is_window_enabled.get()


-@contextmanager
-def temporary_pivot_expression(value: bool):
-    token = _is_in_pivot.set(value)
-    try:
-        yield
-    finally:
-        _is_in_pivot.reset(token)
-
-
-def is_in_pivot() -> bool:
-    return _is_in_pivot.get()
-
-
 def get_is_in_udtf_context() -> bool:
     """
     Gets the value of _is_in_udtf_context for the current context, defaults to False.