snowpark-connect 0.29.0__py3-none-any.whl → 0.30.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
- snowflake/snowpark_connect/client.py +65 -0
- snowflake/snowpark_connect/column_name_handler.py +6 -0
- snowflake/snowpark_connect/config.py +25 -3
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +21 -19
- snowflake/snowpark_connect/expression/map_extension.py +277 -1
- snowflake/snowpark_connect/expression/map_sql_expression.py +107 -2
- snowflake/snowpark_connect/expression/map_unresolved_function.py +253 -59
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +4 -0
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +4 -0
- snowflake/snowpark_connect/relation/io_utils.py +61 -4
- snowflake/snowpark_connect/relation/map_column_ops.py +9 -4
- snowflake/snowpark_connect/relation/map_join.py +8 -0
- snowflake/snowpark_connect/relation/map_row_ops.py +129 -17
- snowflake/snowpark_connect/relation/map_show_string.py +14 -6
- snowflake/snowpark_connect/relation/map_sql.py +39 -5
- snowflake/snowpark_connect/relation/map_stats.py +21 -6
- snowflake/snowpark_connect/relation/read/map_read.py +9 -0
- snowflake/snowpark_connect/relation/read/map_read_csv.py +17 -6
- snowflake/snowpark_connect/relation/read/map_read_json.py +12 -2
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -1
- snowflake/snowpark_connect/relation/read/metadata_utils.py +159 -0
- snowflake/snowpark_connect/relation/utils.py +19 -2
- snowflake/snowpark_connect/relation/write/map_write.py +44 -29
- snowflake/snowpark_connect/server.py +11 -3
- snowflake/snowpark_connect/type_mapping.py +75 -3
- snowflake/snowpark_connect/utils/describe_query_cache.py +6 -3
- snowflake/snowpark_connect/utils/telemetry.py +105 -23
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/METADATA +1 -1
- {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/RECORD +41 -37
- {snowpark_connect-0.29.0.data → snowpark_connect-0.30.1.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.29.0.data → snowpark_connect-0.30.1.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.29.0.data → snowpark_connect-0.30.1.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/top_level.txt +0 -0
@@ -13,6 +13,10 @@ from snowflake.snowpark.dataframe_reader import DataFrameReader
 from snowflake.snowpark.types import StringType, StructField, StructType
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.read.map_read import CsvReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+    get_non_metadata_fields,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
@@ -57,12 +61,17 @@ def map_read_csv(
     snowpark_read_options["PATTERN"] = snowpark_options.get("PATTERN", None)

     raw_options = rel.read.data_source.options
+
     if schema is None or (
         parse_header and raw_options.get("enforceSchema", "True").lower() == "false"
     ):  # Schema has to equals to header's format
-        reader =
+        reader = add_filename_metadata_to_reader(
+            session.read.options(snowpark_options), raw_options
+        )
     else:
-        reader =
+        reader = add_filename_metadata_to_reader(
+            session.read.options(snowpark_options).schema(schema), raw_options
+        )
     df = read_data(
         reader,
         schema,
@@ -175,14 +184,16 @@ def read_data(
 ) -> snowpark.DataFrame:
     df = reader.csv(path)
     filename = path.strip("/").split("/")[-1]
+    non_metadata_fields = get_non_metadata_fields(df.schema.fields)
+
     if schema is not None:
-        if len(schema.fields) != len(
+        if len(schema.fields) != len(non_metadata_fields):
             raise Exception(f"csv load from {filename} failed.")
         if raw_options.get("enforceSchema", "True").lower() == "false":
             for i in range(len(schema.fields)):
                 if (
-                    schema.fields[i].name !=
-                    and f'"{schema.fields[i].name}"' !=
+                    schema.fields[i].name != non_metadata_fields[i].name
+                    and f'"{schema.fields[i].name}"' != non_metadata_fields[i].name
                 ):
                     raise Exception("CSV header does not conform to the schema")
     return df
@@ -191,7 +202,7 @@ def read_data(
         session, path, file_format_options, snowpark_read_options
     )

-    df_schema_fields =
+    df_schema_fields = non_metadata_fields
     if len(headers) == len(df_schema_fields) and parse_header:
         return df.select(
             [
@@ -29,6 +29,9 @@ from snowflake.snowpark.types import (
 )
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.read.map_read import JsonReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
@@ -66,19 +69,26 @@ def map_read_json(
         )
     else:
         snowpark_options = options.convert_to_snowpark_args()
+    raw_options = rel.read.data_source.options
     snowpark_options["infer_schema"] = True

     rows_to_infer_schema = snowpark_options.pop("rowstoinferschema", 1000)
     dropFieldIfAllNull = snowpark_options.pop("dropfieldifallnull", False)
     batch_size = snowpark_options.pop("batchsize", 1000)

-    reader =
+    reader = add_filename_metadata_to_reader(
+        session.read.options(snowpark_options), raw_options
+    )

     df = reader.json(paths[0])
     if len(paths) > 1:
         # TODO: figure out if this is what Spark does.
         for p in paths[1:]:
-            df = df.union_all(
+            df = df.union_all(
+                add_filename_metadata_to_reader(
+                    session.read.options(snowpark_options), raw_options
+                ).json(p)
+            )

     if schema is None:
         schema = copy.deepcopy(df.schema)
@@ -22,6 +22,9 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
 from snowflake.snowpark.column import METADATA_FILENAME
 from snowflake.snowpark.types import DataType, DoubleType, IntegerType, StringType
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+)
 from snowflake.snowpark_connect.relation.read.reader_config import ReaderWriterConfig
 from snowflake.snowpark_connect.relation.read.utils import (
     rename_columns_as_snowflake_standard,
@@ -46,10 +49,13 @@ def map_read_parquet(
     )

     snowpark_options = options.convert_to_snowpark_args()
+    raw_options = rel.read.data_source.options
     assert schema is None, "Read PARQUET does not support user schema"
     assert len(paths) > 0, "Read PARQUET expects at least one path"

-    reader =
+    reader = add_filename_metadata_to_reader(
+        session.read.options(snowpark_options), raw_options
+    )

     if len(paths) == 1:
         df = _read_parquet_with_partitions(session, reader, paths[0])
@@ -0,0 +1,159 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+"""
+Utilities for handling internal metadata columns in file-based DataFrames.
+"""
+
+import os
+
+import pandas
+from pyspark.errors.exceptions.base import AnalysisException
+
+from snowflake import snowpark
+from snowflake.snowpark.column import METADATA_FILENAME
+from snowflake.snowpark.functions import col
+from snowflake.snowpark.types import StructField
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+
+# Constant for the metadata filename column name
+METADATA_FILENAME_COLUMN = "METADATA$FILENAME"
+
+
+def add_filename_metadata_to_reader(
+    reader: snowpark.DataFrameReader,
+    options: dict | None = None,
+) -> snowpark.DataFrameReader:
+    """
+    Add filename metadata to a DataFrameReader based on configuration.
+
+    Args:
+        reader: Snowpark DataFrameReader instance
+        options: Dictionary of options to check for metadata configuration
+
+    Returns:
+        DataFrameReader with filename metadata enabled if configured, otherwise unchanged
+    """
+    # NOTE: SNOWPARK_POPULATE_FILE_METADATA_DEFAULT is an internal environment variable
+    # used only for CI testing to verify no metadata columns leak in regular file operations.
+    # This environment variable should NOT be exposed to end users. Users should only use snowpark.populateFileMetadata
+    # to enable metadata population.
+    metadata_default = os.environ.get(
+        "SNOWPARK_POPULATE_FILE_METADATA_DEFAULT", "false"
+    )
+
+    populate_metadata = (
+        options.get("snowpark.populateFileMetadata", metadata_default)
+        if options
+        else metadata_default
+    ).lower() == "true"
+
+    if populate_metadata:
+        return reader.with_metadata(METADATA_FILENAME)
+    else:
+        return reader
+
+
+def get_non_metadata_fields(schema_fields: list[StructField]) -> list[StructField]:
+    """
+    Filter out METADATA$FILENAME fields from a list of schema fields.
+
+    Args:
+        schema_fields: List of StructField objects from a DataFrame schema
+
+    Returns:
+        List of StructField objects excluding METADATA$FILENAME
+    """
+    return [field for field in schema_fields if field.name != METADATA_FILENAME_COLUMN]
+
+
+def get_non_metadata_column_names(schema_fields: list[StructField]) -> list[str]:
+    """
+    Get column names from schema fields, excluding METADATA$FILENAME.
+
+    Args:
+        schema_fields: List of StructField objects from a DataFrame schema
+
+    Returns:
+        List of column names (strings) excluding METADATA$FILENAME
+    """
+    return [
+        field.name for field in schema_fields if field.name != METADATA_FILENAME_COLUMN
+    ]
+
+
+def filter_metadata_column_name(column_names: list[str]) -> list[str]:
+    """
+    Get column names from column_names, excluding METADATA$FILENAME.
+
+    Returns:
+        List of column names (strings) excluding METADATA$FILENAME
+    """
+    return [
+        col_name for col_name in column_names if col_name != METADATA_FILENAME_COLUMN
+    ]
+
+
+def filter_metadata_columns(
+    result_container: DataFrameContainer | pandas.DataFrame | None,
+) -> DataFrameContainer | pandas.DataFrame | None:
+    """
+    Filter METADATA$FILENAME from DataFrame container for execution and write operations.
+
+    Args:
+        result_container: DataFrameContainer or pandas DataFrame to filter
+
+    Returns:
+        Filtered container (callers can access dataframe via container.dataframe)
+    """
+    # Handle pandas DataFrame case - return as-is
+    if isinstance(result_container, pandas.DataFrame):
+        return result_container
+
+    if result_container is None:
+        return None
+
+    result_df = result_container.dataframe
+    if not isinstance(result_df, snowpark.DataFrame):
+        return result_container
+
+    df_columns = result_container.column_map.get_snowpark_columns()
+    has_metadata_filename = any(name == METADATA_FILENAME_COLUMN for name in df_columns)
+
+    if not has_metadata_filename:
+        return result_container
+
+    non_metadata_columns = filter_metadata_column_name(df_columns)
+
+    if len(non_metadata_columns) == 0:
+        # DataFrame contains only metadata columns (METADATA$FILENAME), no actual data columns remaining.
+        # We don't have a way to return an empty dataframe.
+        raise AnalysisException(
+            "[DATAFRAME_MISSING_DATA_COLUMNS] Cannot perform operation on DataFrame that contains no data columns."
+        )
+
+    filtered_df = result_df.select([col(name) for name in non_metadata_columns])
+
+    original_spark_columns = result_container.column_map.get_spark_columns()
+    original_snowpark_columns = result_container.column_map.get_snowpark_columns()
+
+    filtered_spark_columns = []
+    filtered_snowpark_columns = []
+
+    for i, colname in enumerate(df_columns):
+        if colname != METADATA_FILENAME_COLUMN:
+            filtered_spark_columns.append(original_spark_columns[i])
+            filtered_snowpark_columns.append(original_snowpark_columns[i])
+
+    new_container = DataFrameContainer.create_with_column_mapping(
+        dataframe=filtered_df,
+        spark_column_names=filtered_spark_columns,
+        snowpark_column_names=filtered_snowpark_columns,
+        column_metadata=result_container.column_map.column_metadata,
+        table_name=result_container.table_name,
+        alias=result_container.alias,
+        partition_hint=result_container.partition_hint,
+    )
+
+    return new_container
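
Note: the new metadata_utils module gates the METADATA$FILENAME column behind the snowpark.populateFileMetadata read option (default off; the SNOWPARK_POPULATE_FILE_METADATA_DEFAULT environment variable is CI-only). Below is a minimal sketch of how a client might opt in, assuming an existing Spark Connect session backed by this server and a hypothetical stage path; it is not taken from the package.

    # Assumptions: "spark" is a Spark Connect session served by snowpark-connect,
    # and "@my_stage/data/" is a hypothetical input location.
    df = (
        spark.read.option("snowpark.populateFileMetadata", "true")  # opt in to METADATA$FILENAME
        .option("header", "true")
        .csv("@my_stage/data/")
    )
    # Per the map_write and server.py hunks later in this diff, filter_metadata_columns()
    # strips METADATA$FILENAME again before schema analysis and table/file writes.
    df.show()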
@@ -92,6 +92,21 @@ TYPE_MAP_FOR_TO_SCHEMA = {
 }


+# This mapping is used to map the compression type to the extension of the file.
+FILE_COMPRESSION_TO_EXTENSION = {
+    "GZIP": "gz",
+    "BZ2": "bz2",
+    "BROTLI": "br",
+    "ZSTD": "zst",
+    "DEFLATE": "deflate",
+    "RAW_DEFLATE": "raw_deflate",
+    "SNAPPY": "snappy",
+    "LZO": "lzo",
+    "LZ4": "lz4",
+    "BZIP2": "bz2",
+}
+
+
 def get_df_with_partition_row_number(
     container: DataFrameContainer,
     plan_id: int | None,
@@ -186,13 +201,15 @@ def generate_spark_compatible_filename(

     # Add compression if specified and not 'none'
     if compression and compression.lower() not in ("none", "uncompressed"):
-        compression_part = f".{compression.lower()}"
+        compression_part = f".{FILE_COMPRESSION_TO_EXTENSION.get(compression.upper(), compression.lower())}"
     else:
         compression_part = ""

     # Add format extension if specified
-    if format_ext:
+    if format_ext == "parquet":
         return f"{base_name}{compression_part}.{format_ext}"
+    elif format_ext is not None and format_ext != "":
+        return f"{base_name}.{format_ext}{compression_part}"
     else:
         return f"{base_name}{compression_part}"

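
Note: with this change, compression codes are mapped to conventional file-name extensions and, for non-parquet formats, the format extension now precedes the compression suffix. A small self-contained sketch of the new naming rule follows; the helper name and base name are illustrative, while the branch order mirrors the diff.

    FILE_COMPRESSION_TO_EXTENSION = {"GZIP": "gz", "SNAPPY": "snappy", "BZIP2": "bz2"}

    def spark_style_suffix(base_name: str, compression: str | None, format_ext: str) -> str:
        # Mirror of the diff's logic: map the codec to its usual extension,
        # keep parquet's "<codec>.parquet" ordering, and use "<ext>.<codec>" otherwise.
        if compression and compression.lower() not in ("none", "uncompressed"):
            codec = FILE_COMPRESSION_TO_EXTENSION.get(compression.upper(), compression.lower())
            compression_part = f".{codec}"
        else:
            compression_part = ""
        if format_ext == "parquet":
            return f"{base_name}{compression_part}.{format_ext}"
        elif format_ext:
            return f"{base_name}.{format_ext}{compression_part}"
        return f"{base_name}{compression_part}"

    assert spark_style_suffix("part-00000", "GZIP", "csv") == "part-00000.csv.gz"
    assert spark_style_suffix("part-00000", "SNAPPY", "parquet") == "part-00000.snappy.parquet"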
@@ -35,11 +35,13 @@ from snowflake.snowpark_connect.config import (
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.io_utils import (
     convert_file_prefix_path,
+    get_compression_for_source_and_options,
     is_cloud_path,
-    is_supported_compression,
-    supported_compressions_for_format,
 )
 from snowflake.snowpark_connect.relation.map_relation import map_relation
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    filter_metadata_columns,
+)
 from snowflake.snowpark_connect.relation.read.reader_config import CsvWriterConfig
 from snowflake.snowpark_connect.relation.stage_locator import get_paths_from_stage
 from snowflake.snowpark_connect.relation.utils import (
@@ -129,8 +131,26 @@ def map_write(request: proto_base.ExecutePlanRequest):

     result = map_relation(write_op.input)
     input_df: snowpark.DataFrame = handle_column_names(result, write_op.source)
+
+    # Create updated container with transformed dataframe, then filter METADATA$FILENAME columns
+    # Update the container to use the transformed dataframe from handle_column_names
+    updated_result = DataFrameContainer(
+        dataframe=input_df,
+        column_map=result.column_map,
+        table_name=result.table_name,
+        alias=result.alias,
+        partition_hint=result.partition_hint,
+    )
+    updated_result = filter_metadata_columns(updated_result)
+    input_df = updated_result.dataframe
+
     session: snowpark.Session = get_or_create_snowpark_session()

+    # Check for partition hint early to determine precedence over single option
+    partition_hint = (
+        result.partition_hint if hasattr(result, "partition_hint") else None
+    )
+
     # Snowflake saveAsTable doesn't support format
     if (
         write_op.HasField("table")
@@ -160,8 +180,11 @@ def map_write(request: proto_base.ExecutePlanRequest):
         # Generate Spark-compatible filename with proper extension
         extension = write_op.source if write_op.source != "text" else "txt"

-
-
+        compression = get_compression_for_source_and_options(
+            write_op.source, write_op.options, from_read=False
+        )
+        if compression is not None:
+            write_op.options["compression"] = compression

         # Generate Spark-compatible filename or prefix
         # we need a random prefix to support "append" mode
@@ -187,12 +210,12 @@ def map_write(request: proto_base.ExecutePlanRequest):
         except Exception as e:
             logger.warning(f"Could not clear directory {write_path}: {e}")

-        if should_write_to_single_file:
+        if should_write_to_single_file and partition_hint is None:
             # Single file: generate complete filename with extension
             spark_filename = generate_spark_compatible_filename(
                 task_id=0,
                 attempt_number=0,
-                compression=
+                compression=compression,
                 format_ext=extension,
             )
             temp_file_prefix_on_stage = f"{write_path}/{spark_filename}"
@@ -201,29 +224,11 @@ def map_write(request: proto_base.ExecutePlanRequest):
             spark_filename_prefix = generate_spark_compatible_filename(
                 task_id=0,
                 attempt_number=0,
-                compression=
+                compression=None,
                 format_ext="",  # No extension for prefix
             )
             temp_file_prefix_on_stage = f"{write_path}/{spark_filename_prefix}"

-        default_compression = "NONE" if write_op.source != "parquet" else "snappy"
-        compression = write_op.options.get(
-            "compression", default_compression
-        ).upper()
-
-        if not is_supported_compression(write_op.source, compression):
-            supported_compressions = supported_compressions_for_format(
-                write_op.source
-            )
-            raise AnalysisException(
-                f"Compression {compression} is not supported for {write_op.source} format. "
-                + (
-                    f"Supported compressions: {sorted(supported_compressions)}"
-                    if supported_compressions
-                    else "No compression supported for this format."
-                )
-            )
-
         parameters = {
             "location": temp_file_prefix_on_stage,
             "file_format_type": write_op.source
@@ -238,9 +243,6 @@ def map_write(request: proto_base.ExecutePlanRequest):
         # Using the base avoids coupling to exact filenames/prefixes.
         download_stage_path = write_path

-        # Check for partition hint early to determine precedence over single option
-        partition_hint = result.partition_hint
-
         # Apply max_file_size for both single and multi-file scenarios
         # This helps control when Snowflake splits files into multiple parts
         if max_file_size:
@@ -298,7 +300,7 @@ def map_write(request: proto_base.ExecutePlanRequest):
                 per_part_prefix = generate_spark_compatible_filename(
                     task_id=part_idx,
                     attempt_number=0,
-                    compression=
+                    compression=None,
                     format_ext="",  # prefix only; Snowflake appends extension/counters
                 )
                 part_params["location"] = f"{write_path}/{per_part_prefix}"
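
Note: in the write path above, compression is now resolved up front via get_compression_for_source_and_options, and a partition hint takes precedence over single-file output. A tiny sketch of just that precedence rule; the function name is a placeholder, only the condition mirrors the diff.

    def choose_write_layout(should_write_to_single_file: bool, partition_hint: int | None) -> str:
        # A single complete filename is used only when no partition hint is present;
        # otherwise a prefix is used and Snowflake appends extensions/counters per part.
        if should_write_to_single_file and partition_hint is None:
            return "single-file"
        return "multi-file-prefix"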
@@ -537,6 +539,19 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):
     snowpark_table_name = _spark_to_snowflake(write_op.table_name)
     result = map_relation(write_op.input)
     input_df: snowpark.DataFrame = handle_column_names(result, "table")
+
+    # Create updated container with transformed dataframe, then filter METADATA$FILENAME columns
+    # Update the container to use the transformed dataframe from handle_column_names
+    updated_result = DataFrameContainer(
+        dataframe=input_df,
+        column_map=result.column_map,
+        table_name=result.table_name,
+        alias=result.alias,
+        partition_hint=result.partition_hint,
+    )
+    updated_result = filter_metadata_columns(updated_result)
+    input_df = updated_result.dataframe
+
     session: snowpark.Session = get_or_create_snowpark_session()

     if write_op.table_name is None or write_op.table_name == "":
@@ -232,12 +232,20 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
         match request.WhichOneof("analyze"):
             case "schema":
                 result = map_relation(request.schema.plan.root)
-
-
+
+                from snowflake.snowpark_connect.relation.read.metadata_utils import (
+                    filter_metadata_columns,
+                )
+
+                filtered_result = filter_metadata_columns(result)
+                filtered_df = filtered_result.dataframe
+
                 schema = proto_base.AnalyzePlanResponse.Schema(
                     schema=types_proto.DataType(
                         **snowpark_to_proto_type(
-
+                            filtered_df.schema,
+                            filtered_result.column_map,
+                            filtered_df,
                         )
                     )
                 )
@@ -30,6 +30,10 @@ from snowflake.snowpark_connect.date_time_format_mapping import (
     convert_spark_format_to_snowflake,
 )
 from snowflake.snowpark_connect.expression.literal import get_literal_field_and_name
+from snowflake.snowpark_connect.expression.map_sql_expression import (
+    _INTERVAL_DAYTIME_PATTERN_RE,
+    _INTERVAL_YEARMONTH_PATTERN_RE,
+)
 from snowflake.snowpark_connect.utils.context import get_is_evaluating_sql
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import (
@@ -274,6 +278,18 @@ def snowpark_to_proto_type(
         case snowpark.types.VariantType:
             # For now we are returning a string type for variant types.
             return {"string": types_proto.DataType.String()}
+        case snowpark.types.YearMonthIntervalType:
+            return {
+                "year_month_interval": types_proto.DataType.YearMonthInterval(
+                    start_field=data_type.start_field, end_field=data_type.end_field
+                )
+            }
+        case snowpark.types.DayTimeIntervalType:
+            return {
+                "day_time_interval": types_proto.DataType.DayTimeInterval(
+                    start_field=data_type.start_field, end_field=data_type.end_field
+                )
+            }
         case _:
             raise SnowparkConnectNotImplementedError(
                 f"Unsupported snowpark data type: {data_type}"
@@ -328,6 +344,24 @@ def cast_to_match_snowpark_type(
             return str(content)
         case snowpark.types.TimestampType:
             return str(content)
+        case snowpark.types.YearMonthIntervalType:
+            if isinstance(content, (int, float)):
+                total_months = int(content)
+                years = total_months // 12
+                months = total_months % 12
+                return f"INTERVAL '{years}-{months}' YEAR TO MONTH"
+            elif isinstance(content, str) and content.startswith(("+", "-")):
+                # Handle Snowflake's native interval format (e.g., "+11-08" or "-2-3")
+                # Convert to Spark's format: "INTERVAL 'Y-M' YEAR TO MONTH"
+                sign = content[0]
+                interval_part = content[1:]  # Remove sign
+                if sign == "-":
+                    return f"INTERVAL '-{interval_part}' YEAR TO MONTH"
+                else:
+                    return f"INTERVAL '{interval_part}' YEAR TO MONTH"
+            return str(content)
+        case snowpark.types.DayTimeIntervalType:
+            return str(content)
         case _:
             raise SnowparkConnectNotImplementedError(
                 f"Unsupported snowpark data type in casting: {data_type}"
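
Note: the year-month branch above renders either a raw month count or Snowflake's signed "Y-M" string as a Spark interval literal. A sketch of just that conversion; the function name is illustrative, and the formatting follows the diff.

    def to_spark_year_month_literal(content) -> str:
        if isinstance(content, (int, float)):
            total_months = int(content)  # e.g. 140 months -> 11 years, 8 months
            return f"INTERVAL '{total_months // 12}-{total_months % 12}' YEAR TO MONTH"
        if isinstance(content, str) and content.startswith(("+", "-")):
            sign, body = content[0], content[1:]
            return f"INTERVAL '{'-' if sign == '-' else ''}{body}' YEAR TO MONTH"
        return str(content)

    assert to_spark_year_month_literal(140) == "INTERVAL '11-8' YEAR TO MONTH"
    assert to_spark_year_month_literal("+11-08") == "INTERVAL '11-08' YEAR TO MONTH"
    assert to_spark_year_month_literal("-2-3") == "INTERVAL '-2-3' YEAR TO MONTH"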
@@ -411,6 +445,18 @@ def proto_to_snowpark_type(
             # For UDT types, return the underlying SQL type
             logger.debug("Returning underlying sql type for udt")
             return proto_to_snowpark_type(data_type.udt.sql_type)
+        case "year_month_interval":
+            # Preserve start_field and end_field from protobuf
+            return snowpark.types.YearMonthIntervalType(
+                start_field=data_type.year_month_interval.start_field,
+                end_field=data_type.year_month_interval.end_field,
+            )
+        case "day_time_interval":
+            # Preserve start_field and end_field from protobuf
+            return snowpark.types.DayTimeIntervalType(
+                start_field=data_type.day_time_interval.start_field,
+                end_field=data_type.day_time_interval.end_field,
+            )
         case _:
             return map_simple_types(data_type.WhichOneof("kind"))

@@ -523,6 +569,12 @@ def map_snowpark_types_to_pyarrow_types(
             return pa.timestamp(unit, tz=tz)
         case snowpark.types.VariantType:
             return pa.string()
+        case snowpark.types.YearMonthIntervalType:
+            # Return string type so formatted intervals are preserved in display
+            return pa.string()
+        case snowpark.types.DayTimeIntervalType:
+            # Return string type so formatted intervals are preserved in display
+            return pa.string()
         case _:
             raise SnowparkConnectNotImplementedError(
                 f"Unsupported snowpark data type: {snowpark_type}"
@@ -676,6 +728,14 @@ def map_pyspark_types_to_snowpark_types(
         return snowpark.types.TimestampType()
     if isinstance(type_to_map, pyspark.sql.types.TimestampNTZType):
         return snowpark.types.TimestampType(timezone=TimestampTimeZone.NTZ)
+    if isinstance(type_to_map, pyspark.sql.types.YearMonthIntervalType):
+        return snowpark.types.YearMonthIntervalType(
+            type_to_map.startField, type_to_map.endField
+        )
+    if isinstance(type_to_map, pyspark.sql.types.DayTimeIntervalType):
+        return snowpark.types.DayTimeIntervalType(
+            type_to_map.startField, type_to_map.endField
+        )
     raise SnowparkConnectNotImplementedError(
         f"Unsupported spark data type: {type_to_map}"
     )
@@ -743,6 +803,14 @@ def map_snowpark_to_pyspark_types(
         if type_to_map.tz == snowpark.types.TimestampTimeZone.NTZ:
             return pyspark.sql.types.TimestampNTZType()
         return pyspark.sql.types.TimestampType()
+    if isinstance(type_to_map, snowpark.types.YearMonthIntervalType):
+        return pyspark.sql.types.YearMonthIntervalType(
+            type_to_map.start_field, type_to_map.end_field
+        )
+    if isinstance(type_to_map, snowpark.types.DayTimeIntervalType):
+        return pyspark.sql.types.DayTimeIntervalType(
+            type_to_map.start_field, type_to_map.end_field
+        )
     raise SnowparkConnectNotImplementedError(f"Unsupported data type: {type_to_map}")


@@ -785,10 +853,14 @@ def map_simple_types(simple_type: str) -> snowpark.types.DataType:
             return snowpark.types.TimestampType(snowpark.types.TimestampTimeZone.NTZ)
         case "timestamp_ltz":
             return snowpark.types.TimestampType(snowpark.types.TimestampTimeZone.LTZ)
+        case "year_month_interval":
+            return snowpark.types.YearMonthIntervalType()
         case "day_time_interval":
-
-
-            return snowpark.types.
+            return snowpark.types.DayTimeIntervalType()
+        case type_name if _INTERVAL_YEARMONTH_PATTERN_RE.match(type_name):
+            return snowpark.types.YearMonthIntervalType()
+        case type_name if _INTERVAL_DAYTIME_PATTERN_RE.match(type_name):
+            return snowpark.types.DayTimeIntervalType()
         case _:
             if simple_type.startswith("decimal"):
                 precision = int(simple_type.split("(")[1].split(",")[0])
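
Note: the hunks above round-trip interval types between PySpark and Snowpark while preserving the start and end fields, and map both interval types to strings on the Arrow side. A minimal sketch of the round trip, assuming the type classes and attributes referenced in the diff are available in the installed pyspark and snowpark packages.

    import pyspark.sql.types as pst
    from snowflake import snowpark

    spark_type = pst.YearMonthIntervalType(
        pst.YearMonthIntervalType.YEAR, pst.YearMonthIntervalType.MONTH
    )
    # PySpark -> Snowpark keeps startField/endField ...
    snow_type = snowpark.types.YearMonthIntervalType(spark_type.startField, spark_type.endField)
    # ... and Snowpark -> PySpark restores them.
    round_tripped = pst.YearMonthIntervalType(snow_type.start_field, snow_type.end_field)
    assert (round_tripped.startField, round_tripped.endField) == (spark_type.startField, spark_type.endField)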
@@ -16,7 +16,6 @@ from snowflake.snowpark_connect.utils.concurrent import SynchronizedDict
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import telemetry

-DESCRIBE_CACHE_TTL_SECONDS = 15
 USE_DESCRIBE_QUERY_CACHE = True

 DDL_DETECTION_PATTERN = re.compile(r"\s*(CREATE|ALTER|DROP)\b", re.IGNORECASE)
@@ -51,6 +50,8 @@ class DescribeQueryCache:
         return sql_query

     def get(self, sql_query: str) -> list[ResultMetadataV2] | None:
+        from snowflake.snowpark_connect.config import get_describe_cache_ttl_seconds
+
         telemetry.report_describe_query_cache_lookup()

         cache_key = self._get_cache_key(sql_query)
@@ -59,7 +60,9 @@ class DescribeQueryCache:

         if key in self._cache:
             result, timestamp = self._cache[key]
-
+
+            expired_by = current_time - (timestamp + get_describe_cache_ttl_seconds())
+            if expired_by < 0:
                 logger.debug(
                     f"Returning query result from cache for query: {sql_query[:20]}"
                 )
@@ -92,7 +95,7 @@ class DescribeQueryCache:
                 telemetry.report_describe_query_cache_hit()
                 return result
             else:
-                telemetry.report_describe_query_cache_expired()
+                telemetry.report_describe_query_cache_expired(expired_by)
                 del self._cache[key]
                 return None

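
Note: the describe-query cache TTL is no longer a module constant; it is read from config at lookup time, and the expiry telemetry now records how far past the TTL the entry was. A sketch of the lookup pattern only; the cache shape and the absence of telemetry are simplifications, and only the expired_by computation mirrors the diff.

    import time

    def cache_get(cache: dict, key: str, ttl_seconds: float):
        entry = cache.get(key)
        if entry is None:
            return None
        result, timestamp = entry
        expired_by = time.time() - (timestamp + ttl_seconds)
        if expired_by < 0:
            return result          # still within the TTL
        del cache[key]             # expired; the real code also reports expired_by to telemetry
        return None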