snowpark-connect 0.29.0__py3-none-any.whl → 0.30.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of snowpark-connect has been flagged as possibly problematic; further details are available on the registry page.
Files changed (41)
  1. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  2. snowflake/snowpark_connect/client.py +65 -0
  3. snowflake/snowpark_connect/column_name_handler.py +6 -0
  4. snowflake/snowpark_connect/config.py +25 -3
  5. snowflake/snowpark_connect/execute_plan/map_execution_root.py +21 -19
  6. snowflake/snowpark_connect/expression/map_extension.py +277 -1
  7. snowflake/snowpark_connect/expression/map_sql_expression.py +107 -2
  8. snowflake/snowpark_connect/expression/map_unresolved_function.py +253 -59
  9. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  10. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  11. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2_grpc.py +4 -0
  12. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2_grpc.py +4 -0
  13. snowflake/snowpark_connect/relation/io_utils.py +61 -4
  14. snowflake/snowpark_connect/relation/map_column_ops.py +9 -4
  15. snowflake/snowpark_connect/relation/map_join.py +8 -0
  16. snowflake/snowpark_connect/relation/map_row_ops.py +129 -17
  17. snowflake/snowpark_connect/relation/map_show_string.py +14 -6
  18. snowflake/snowpark_connect/relation/map_sql.py +39 -5
  19. snowflake/snowpark_connect/relation/map_stats.py +21 -6
  20. snowflake/snowpark_connect/relation/read/map_read.py +9 -0
  21. snowflake/snowpark_connect/relation/read/map_read_csv.py +17 -6
  22. snowflake/snowpark_connect/relation/read/map_read_json.py +12 -2
  23. snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -1
  24. snowflake/snowpark_connect/relation/read/metadata_utils.py +159 -0
  25. snowflake/snowpark_connect/relation/utils.py +19 -2
  26. snowflake/snowpark_connect/relation/write/map_write.py +44 -29
  27. snowflake/snowpark_connect/server.py +11 -3
  28. snowflake/snowpark_connect/type_mapping.py +75 -3
  29. snowflake/snowpark_connect/utils/describe_query_cache.py +6 -3
  30. snowflake/snowpark_connect/utils/telemetry.py +105 -23
  31. snowflake/snowpark_connect/version.py +1 -1
  32. {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/METADATA +1 -1
  33. {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/RECORD +41 -37
  34. {snowpark_connect-0.29.0.data → snowpark_connect-0.30.1.data}/scripts/snowpark-connect +0 -0
  35. {snowpark_connect-0.29.0.data → snowpark_connect-0.30.1.data}/scripts/snowpark-session +0 -0
  36. {snowpark_connect-0.29.0.data → snowpark_connect-0.30.1.data}/scripts/snowpark-submit +0 -0
  37. {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/WHEEL +0 -0
  38. {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/licenses/LICENSE-binary +0 -0
  39. {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/licenses/LICENSE.txt +0 -0
  40. {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/licenses/NOTICE-binary +0 -0
  41. {snowpark_connect-0.29.0.dist-info → snowpark_connect-0.30.1.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/read/map_read_csv.py

@@ -13,6 +13,10 @@ from snowflake.snowpark.dataframe_reader import DataFrameReader
 from snowflake.snowpark.types import StringType, StructField, StructType
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.read.map_read import CsvReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+    get_non_metadata_fields,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
@@ -57,12 +61,17 @@ def map_read_csv(
     snowpark_read_options["PATTERN"] = snowpark_options.get("PATTERN", None)

     raw_options = rel.read.data_source.options
+
     if schema is None or (
         parse_header and raw_options.get("enforceSchema", "True").lower() == "false"
     ):  # Schema has to equals to header's format
-        reader = session.read.options(snowpark_read_options)
+        reader = add_filename_metadata_to_reader(
+            session.read.options(snowpark_options), raw_options
+        )
     else:
-        reader = session.read.options(snowpark_read_options).schema(schema)
+        reader = add_filename_metadata_to_reader(
+            session.read.options(snowpark_options).schema(schema), raw_options
+        )
     df = read_data(
         reader,
         schema,
@@ -175,14 +184,16 @@ def read_data(
 ) -> snowpark.DataFrame:
     df = reader.csv(path)
     filename = path.strip("/").split("/")[-1]
+    non_metadata_fields = get_non_metadata_fields(df.schema.fields)
+
     if schema is not None:
-        if len(schema.fields) != len(df.schema.fields):
+        if len(schema.fields) != len(non_metadata_fields):
             raise Exception(f"csv load from {filename} failed.")
         if raw_options.get("enforceSchema", "True").lower() == "false":
             for i in range(len(schema.fields)):
                 if (
-                    schema.fields[i].name != df.schema.fields[i].name
-                    and f'"{schema.fields[i].name}"' != df.schema.fields[i].name
+                    schema.fields[i].name != non_metadata_fields[i].name
+                    and f'"{schema.fields[i].name}"' != non_metadata_fields[i].name
                 ):
                     raise Exception("CSV header does not conform to the schema")
         return df
@@ -191,7 +202,7 @@ def read_data(
         session, path, file_format_options, snowpark_read_options
     )

-    df_schema_fields = df.schema.fields
+    df_schema_fields = non_metadata_fields
     if len(headers) == len(df_schema_fields) and parse_header:
         return df.select(
             [
snowflake/snowpark_connect/relation/read/map_read_json.py

@@ -29,6 +29,9 @@ from snowflake.snowpark.types import (
 )
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.read.map_read import JsonReaderConfig
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+)
 from snowflake.snowpark_connect.relation.read.utils import (
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
@@ -66,19 +69,26 @@ def map_read_json(
         )
     else:
         snowpark_options = options.convert_to_snowpark_args()
+        raw_options = rel.read.data_source.options
         snowpark_options["infer_schema"] = True

         rows_to_infer_schema = snowpark_options.pop("rowstoinferschema", 1000)
         dropFieldIfAllNull = snowpark_options.pop("dropfieldifallnull", False)
         batch_size = snowpark_options.pop("batchsize", 1000)

-        reader = session.read.options(snowpark_options)
+        reader = add_filename_metadata_to_reader(
+            session.read.options(snowpark_options), raw_options
+        )

         df = reader.json(paths[0])
         if len(paths) > 1:
             # TODO: figure out if this is what Spark does.
             for p in paths[1:]:
-                df = df.union_all(session.read.options(snowpark_options).json(p))
+                df = df.union_all(
+                    add_filename_metadata_to_reader(
+                        session.read.options(snowpark_options), raw_options
+                    ).json(p)
+                )

         if schema is None:
             schema = copy.deepcopy(df.schema)
snowflake/snowpark_connect/relation/read/map_read_parquet.py

@@ -22,6 +22,9 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
 from snowflake.snowpark.column import METADATA_FILENAME
 from snowflake.snowpark.types import DataType, DoubleType, IntegerType, StringType
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    add_filename_metadata_to_reader,
+)
 from snowflake.snowpark_connect.relation.read.reader_config import ReaderWriterConfig
 from snowflake.snowpark_connect.relation.read.utils import (
     rename_columns_as_snowflake_standard,
@@ -46,10 +49,13 @@ def map_read_parquet(
     )

     snowpark_options = options.convert_to_snowpark_args()
+    raw_options = rel.read.data_source.options
     assert schema is None, "Read PARQUET does not support user schema"
     assert len(paths) > 0, "Read PARQUET expects at least one path"

-    reader = session.read.options(snowpark_options)
+    reader = add_filename_metadata_to_reader(
+        session.read.options(snowpark_options), raw_options
+    )

     if len(paths) == 1:
         df = _read_parquet_with_partitions(session, reader, paths[0])
snowflake/snowpark_connect/relation/read/metadata_utils.py (new file)

@@ -0,0 +1,159 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+"""
+Utilities for handling internal metadata columns in file-based DataFrames.
+"""
+
+import os
+
+import pandas
+from pyspark.errors.exceptions.base import AnalysisException
+
+from snowflake import snowpark
+from snowflake.snowpark.column import METADATA_FILENAME
+from snowflake.snowpark.functions import col
+from snowflake.snowpark.types import StructField
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
+
+# Constant for the metadata filename column name
+METADATA_FILENAME_COLUMN = "METADATA$FILENAME"
+
+
+def add_filename_metadata_to_reader(
+    reader: snowpark.DataFrameReader,
+    options: dict | None = None,
+) -> snowpark.DataFrameReader:
+    """
+    Add filename metadata to a DataFrameReader based on configuration.
+
+    Args:
+        reader: Snowpark DataFrameReader instance
+        options: Dictionary of options to check for metadata configuration
+
+    Returns:
+        DataFrameReader with filename metadata enabled if configured, otherwise unchanged
+    """
+    # NOTE: SNOWPARK_POPULATE_FILE_METADATA_DEFAULT is an internal environment variable
+    # used only for CI testing to verify no metadata columns leak in regular file operations.
+    # This environment variable should NOT be exposed to end users. Users should only use snowpark.populateFileMetadata
+    # to enable metadata population.
+    metadata_default = os.environ.get(
+        "SNOWPARK_POPULATE_FILE_METADATA_DEFAULT", "false"
+    )
+
+    populate_metadata = (
+        options.get("snowpark.populateFileMetadata", metadata_default)
+        if options
+        else metadata_default
+    ).lower() == "true"
+
+    if populate_metadata:
+        return reader.with_metadata(METADATA_FILENAME)
+    else:
+        return reader
+
+
+def get_non_metadata_fields(schema_fields: list[StructField]) -> list[StructField]:
+    """
+    Filter out METADATA$FILENAME fields from a list of schema fields.
+
+    Args:
+        schema_fields: List of StructField objects from a DataFrame schema
+
+    Returns:
+        List of StructField objects excluding METADATA$FILENAME
+    """
+    return [field for field in schema_fields if field.name != METADATA_FILENAME_COLUMN]
+
+
+def get_non_metadata_column_names(schema_fields: list[StructField]) -> list[str]:
+    """
+    Get column names from schema fields, excluding METADATA$FILENAME.
+
+    Args:
+        schema_fields: List of StructField objects from a DataFrame schema
+
+    Returns:
+        List of column names (strings) excluding METADATA$FILENAME
+    """
+    return [
+        field.name for field in schema_fields if field.name != METADATA_FILENAME_COLUMN
+    ]
+
+
+def filter_metadata_column_name(column_names: list[str]) -> list[str]:
+    """
+    Get column names from column_names, excluding METADATA$FILENAME.
+
+    Returns:
+        List of column names (strings) excluding METADATA$FILENAME
+    """
+    return [
+        col_name for col_name in column_names if col_name != METADATA_FILENAME_COLUMN
+    ]
+
+
+def filter_metadata_columns(
+    result_container: DataFrameContainer | pandas.DataFrame | None,
+) -> DataFrameContainer | pandas.DataFrame | None:
+    """
+    Filter METADATA$FILENAME from DataFrame container for execution and write operations.
+
+    Args:
+        result_container: DataFrameContainer or pandas DataFrame to filter
+
+    Returns:
+        Filtered container (callers can access dataframe via container.dataframe)
+    """
+    # Handle pandas DataFrame case - return as-is
+    if isinstance(result_container, pandas.DataFrame):
+        return result_container
+
+    if result_container is None:
+        return None
+
+    result_df = result_container.dataframe
+    if not isinstance(result_df, snowpark.DataFrame):
+        return result_container
+
+    df_columns = result_container.column_map.get_snowpark_columns()
+    has_metadata_filename = any(name == METADATA_FILENAME_COLUMN for name in df_columns)
+
+    if not has_metadata_filename:
+        return result_container
+
+    non_metadata_columns = filter_metadata_column_name(df_columns)
+
+    if len(non_metadata_columns) == 0:
+        # DataFrame contains only metadata columns (METADATA$FILENAME), no actual data columns remaining.
+        # We don't have a way to return an empty dataframe.
+        raise AnalysisException(
+            "[DATAFRAME_MISSING_DATA_COLUMNS] Cannot perform operation on DataFrame that contains no data columns."
+        )
+
+    filtered_df = result_df.select([col(name) for name in non_metadata_columns])
+
+    original_spark_columns = result_container.column_map.get_spark_columns()
+    original_snowpark_columns = result_container.column_map.get_snowpark_columns()
+
+    filtered_spark_columns = []
+    filtered_snowpark_columns = []
+
+    for i, colname in enumerate(df_columns):
+        if colname != METADATA_FILENAME_COLUMN:
+            filtered_spark_columns.append(original_spark_columns[i])
+            filtered_snowpark_columns.append(original_snowpark_columns[i])
+
+    new_container = DataFrameContainer.create_with_column_mapping(
+        dataframe=filtered_df,
+        spark_column_names=filtered_spark_columns,
+        snowpark_column_names=filtered_snowpark_columns,
+        column_metadata=result_container.column_map.column_metadata,
+        table_name=result_container.table_name,
+        alias=result_container.alias,
+        partition_hint=result_container.partition_hint,
+    )
+
+    return new_container
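The new module is driven by a single read option, snowpark.populateFileMetadata, plus helpers that keep the resulting METADATA$FILENAME column out of schema analysis and writes. A minimal client-side usage sketch (not part of the diff; the `spark` session variable and stage path are illustrative):

# Hypothetical sketch: `spark` is a Spark Connect session served by snowpark-connect,
# and "@my_stage/input/" is a made-up stage path.
df = (
    spark.read.option("snowpark.populateFileMetadata", "true")  # read by add_filename_metadata_to_reader
    .option("header", "true")
    .csv("@my_stage/input/")
)

# With the option enabled, the Snowpark reader is wrapped with
# reader.with_metadata(METADATA_FILENAME), so rows carry their source file in a
# METADATA$FILENAME column; filter_metadata_columns() strips that column again
# before schema analysis (server.py) and writes (map_write.py) so it does not
# leak into regular results.
df.show()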
snowflake/snowpark_connect/relation/utils.py

@@ -92,6 +92,21 @@ TYPE_MAP_FOR_TO_SCHEMA = {
 }


+# This mapping is used to map the compression type to the extension of the file.
+FILE_COMPRESSION_TO_EXTENSION = {
+    "GZIP": "gz",
+    "BZ2": "bz2",
+    "BROTLI": "br",
+    "ZSTD": "zst",
+    "DEFLATE": "deflate",
+    "RAW_DEFLATE": "raw_deflate",
+    "SNAPPY": "snappy",
+    "LZO": "lzo",
+    "LZ4": "lz4",
+    "BZIP2": "bz2",
+}
+
+
 def get_df_with_partition_row_number(
     container: DataFrameContainer,
     plan_id: int | None,
@@ -186,13 +201,15 @@ def generate_spark_compatible_filename(

     # Add compression if specified and not 'none'
     if compression and compression.lower() not in ("none", "uncompressed"):
-        compression_part = f".{compression.lower()}"
+        compression_part = f".{FILE_COMPRESSION_TO_EXTENSION.get(compression.upper(), compression.lower())}"
     else:
         compression_part = ""

     # Add format extension if specified
-    if format_ext:
+    if format_ext == "parquet":
         return f"{base_name}{compression_part}.{format_ext}"
+    elif format_ext is not None and format_ext != "":
+        return f"{base_name}.{format_ext}{compression_part}"
     else:
         return f"{base_name}{compression_part}"
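A standalone sketch (hypothetical helper, not the module's actual function) of the new naming rules: Parquet keeps Spark's part-00000.snappy.parquet shape, while other formats now place the compression suffix after the format extension, e.g. part-00000.csv.gz:

_COMPRESSION_EXT = {"GZIP": "gz", "BZIP2": "bz2", "SNAPPY": "snappy"}  # subset, for illustration

def compose_filename(base_name: str, compression: str | None, format_ext: str) -> str:
    # Mirrors the ordering logic added in the diff, for illustration only.
    if compression and compression.lower() not in ("none", "uncompressed"):
        compression_part = "." + _COMPRESSION_EXT.get(compression.upper(), compression.lower())
    else:
        compression_part = ""
    if format_ext == "parquet":
        return f"{base_name}{compression_part}.{format_ext}"
    elif format_ext:
        return f"{base_name}.{format_ext}{compression_part}"
    return f"{base_name}{compression_part}"

print(compose_filename("part-00000", "snappy", "parquet"))  # part-00000.snappy.parquet
print(compose_filename("part-00000", "gzip", "csv"))        # part-00000.csv.gz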
 
snowflake/snowpark_connect/relation/write/map_write.py

@@ -35,11 +35,13 @@ from snowflake.snowpark_connect.config import (
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.io_utils import (
     convert_file_prefix_path,
+    get_compression_for_source_and_options,
     is_cloud_path,
-    is_supported_compression,
-    supported_compressions_for_format,
 )
 from snowflake.snowpark_connect.relation.map_relation import map_relation
+from snowflake.snowpark_connect.relation.read.metadata_utils import (
+    filter_metadata_columns,
+)
 from snowflake.snowpark_connect.relation.read.reader_config import CsvWriterConfig
 from snowflake.snowpark_connect.relation.stage_locator import get_paths_from_stage
 from snowflake.snowpark_connect.relation.utils import (
@@ -129,8 +131,26 @@ def map_write(request: proto_base.ExecutePlanRequest):

     result = map_relation(write_op.input)
     input_df: snowpark.DataFrame = handle_column_names(result, write_op.source)
+
+    # Create updated container with transformed dataframe, then filter METADATA$FILENAME columns
+    # Update the container to use the transformed dataframe from handle_column_names
+    updated_result = DataFrameContainer(
+        dataframe=input_df,
+        column_map=result.column_map,
+        table_name=result.table_name,
+        alias=result.alias,
+        partition_hint=result.partition_hint,
+    )
+    updated_result = filter_metadata_columns(updated_result)
+    input_df = updated_result.dataframe
+
     session: snowpark.Session = get_or_create_snowpark_session()

+    # Check for partition hint early to determine precedence over single option
+    partition_hint = (
+        result.partition_hint if hasattr(result, "partition_hint") else None
+    )
+
     # Snowflake saveAsTable doesn't support format
     if (
         write_op.HasField("table")
@@ -160,8 +180,11 @@ def map_write(request: proto_base.ExecutePlanRequest):
     # Generate Spark-compatible filename with proper extension
     extension = write_op.source if write_op.source != "text" else "txt"

-    # Get compression from options for proper filename generation
-    compression_option = write_op.options.get("compression", "none")
+    compression = get_compression_for_source_and_options(
+        write_op.source, write_op.options, from_read=False
+    )
+    if compression is not None:
+        write_op.options["compression"] = compression

     # Generate Spark-compatible filename or prefix
     # we need a random prefix to support "append" mode
@@ -187,12 +210,12 @@ def map_write(request: proto_base.ExecutePlanRequest):
     except Exception as e:
         logger.warning(f"Could not clear directory {write_path}: {e}")

-    if should_write_to_single_file:
+    if should_write_to_single_file and partition_hint is None:
         # Single file: generate complete filename with extension
         spark_filename = generate_spark_compatible_filename(
             task_id=0,
             attempt_number=0,
-            compression=compression_option,
+            compression=compression,
             format_ext=extension,
         )
         temp_file_prefix_on_stage = f"{write_path}/{spark_filename}"
@@ -201,29 +224,11 @@ def map_write(request: proto_base.ExecutePlanRequest):
         spark_filename_prefix = generate_spark_compatible_filename(
             task_id=0,
             attempt_number=0,
-            compression=compression_option,
+            compression=None,
             format_ext="",  # No extension for prefix
         )
         temp_file_prefix_on_stage = f"{write_path}/{spark_filename_prefix}"

-        default_compression = "NONE" if write_op.source != "parquet" else "snappy"
-        compression = write_op.options.get(
-            "compression", default_compression
-        ).upper()
-
-        if not is_supported_compression(write_op.source, compression):
-            supported_compressions = supported_compressions_for_format(
-                write_op.source
-            )
-            raise AnalysisException(
-                f"Compression {compression} is not supported for {write_op.source} format. "
-                + (
-                    f"Supported compressions: {sorted(supported_compressions)}"
-                    if supported_compressions
-                    else "No compression supported for this format."
-                )
-            )
-
     parameters = {
         "location": temp_file_prefix_on_stage,
         "file_format_type": write_op.source
@@ -238,9 +243,6 @@ def map_write(request: proto_base.ExecutePlanRequest):
     # Using the base avoids coupling to exact filenames/prefixes.
     download_stage_path = write_path

-    # Check for partition hint early to determine precedence over single option
-    partition_hint = result.partition_hint
-
     # Apply max_file_size for both single and multi-file scenarios
     # This helps control when Snowflake splits files into multiple parts
     if max_file_size:
@@ -298,7 +300,7 @@ def map_write(request: proto_base.ExecutePlanRequest):
             per_part_prefix = generate_spark_compatible_filename(
                 task_id=part_idx,
                 attempt_number=0,
-                compression=compression_option,
+                compression=None,
                 format_ext="",  # prefix only; Snowflake appends extension/counters
             )
             part_params["location"] = f"{write_path}/{per_part_prefix}"
@@ -537,6 +539,19 @@ def map_write_v2(request: proto_base.ExecutePlanRequest):
     snowpark_table_name = _spark_to_snowflake(write_op.table_name)
     result = map_relation(write_op.input)
     input_df: snowpark.DataFrame = handle_column_names(result, "table")
+
+    # Create updated container with transformed dataframe, then filter METADATA$FILENAME columns
+    # Update the container to use the transformed dataframe from handle_column_names
+    updated_result = DataFrameContainer(
+        dataframe=input_df,
+        column_map=result.column_map,
+        table_name=result.table_name,
+        alias=result.alias,
+        partition_hint=result.partition_hint,
+    )
+    updated_result = filter_metadata_columns(updated_result)
+    input_df = updated_result.dataframe
+
     session: snowpark.Session = get_or_create_snowpark_session()

     if write_op.table_name is None or write_op.table_name == "":
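Both map_write and map_write_v2 now repeat the same rewrap-and-filter step before handing the dataframe to the writer; a condensed sketch (hypothetical helper name, field names as in the diff) of that shared pattern:

def strip_metadata_before_write(result, input_df):
    # result: DataFrameContainer returned by map_relation(); input_df: dataframe
    # returned by handle_column_names(). Hypothetical helper; the diff inlines
    # this block in both write paths rather than sharing it.
    updated = DataFrameContainer(
        dataframe=input_df,
        column_map=result.column_map,
        table_name=result.table_name,
        alias=result.alias,
        partition_hint=result.partition_hint,
    )
    updated = filter_metadata_columns(updated)  # drops METADATA$FILENAME if present
    return updated.dataframe

Passing the full container (rather than just the dataframe) lets filter_metadata_columns rebuild the Spark/Snowpark column mapping without the METADATA$FILENAME entry.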
snowflake/snowpark_connect/server.py

@@ -232,12 +232,20 @@ class SnowflakeConnectServicer(proto_base_grpc.SparkConnectServiceServicer):
         match request.WhichOneof("analyze"):
             case "schema":
                 result = map_relation(request.schema.plan.root)
-                snowpark_df = result.dataframe
-                snowpark_schema: snowpark.types.StructType = snowpark_df.schema
+
+                from snowflake.snowpark_connect.relation.read.metadata_utils import (
+                    filter_metadata_columns,
+                )
+
+                filtered_result = filter_metadata_columns(result)
+                filtered_df = filtered_result.dataframe
+
                 schema = proto_base.AnalyzePlanResponse.Schema(
                     schema=types_proto.DataType(
                         **snowpark_to_proto_type(
-                            snowpark_schema, result.column_map, snowpark_df
+                            filtered_df.schema,
+                            filtered_result.column_map,
+                            filtered_df,
                         )
                     )
                 )
snowflake/snowpark_connect/type_mapping.py

@@ -30,6 +30,10 @@ from snowflake.snowpark_connect.date_time_format_mapping import (
     convert_spark_format_to_snowflake,
 )
 from snowflake.snowpark_connect.expression.literal import get_literal_field_and_name
+from snowflake.snowpark_connect.expression.map_sql_expression import (
+    _INTERVAL_DAYTIME_PATTERN_RE,
+    _INTERVAL_YEARMONTH_PATTERN_RE,
+)
 from snowflake.snowpark_connect.utils.context import get_is_evaluating_sql
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import (
@@ -274,6 +278,18 @@ def snowpark_to_proto_type(
         case snowpark.types.VariantType:
             # For now we are returning a string type for variant types.
             return {"string": types_proto.DataType.String()}
+        case snowpark.types.YearMonthIntervalType:
+            return {
+                "year_month_interval": types_proto.DataType.YearMonthInterval(
+                    start_field=data_type.start_field, end_field=data_type.end_field
+                )
+            }
+        case snowpark.types.DayTimeIntervalType:
+            return {
+                "day_time_interval": types_proto.DataType.DayTimeInterval(
+                    start_field=data_type.start_field, end_field=data_type.end_field
+                )
+            }
         case _:
             raise SnowparkConnectNotImplementedError(
                 f"Unsupported snowpark data type: {data_type}"
@@ -328,6 +344,24 @@ def cast_to_match_snowpark_type(
             return str(content)
         case snowpark.types.TimestampType:
             return str(content)
+        case snowpark.types.YearMonthIntervalType:
+            if isinstance(content, (int, float)):
+                total_months = int(content)
+                years = total_months // 12
+                months = total_months % 12
+                return f"INTERVAL '{years}-{months}' YEAR TO MONTH"
+            elif isinstance(content, str) and content.startswith(("+", "-")):
+                # Handle Snowflake's native interval format (e.g., "+11-08" or "-2-3")
+                # Convert to Spark's format: "INTERVAL 'Y-M' YEAR TO MONTH"
+                sign = content[0]
+                interval_part = content[1:]  # Remove sign
+                if sign == "-":
+                    return f"INTERVAL '-{interval_part}' YEAR TO MONTH"
+                else:
+                    return f"INTERVAL '{interval_part}' YEAR TO MONTH"
+            return str(content)
+        case snowpark.types.DayTimeIntervalType:
+            return str(content)
         case _:
             raise SnowparkConnectNotImplementedError(
                 f"Unsupported snowpark data type in casting: {data_type}"
@@ -411,6 +445,18 @@ def proto_to_snowpark_type(
             # For UDT types, return the underlying SQL type
             logger.debug("Returning underlying sql type for udt")
             return proto_to_snowpark_type(data_type.udt.sql_type)
+        case "year_month_interval":
+            # Preserve start_field and end_field from protobuf
+            return snowpark.types.YearMonthIntervalType(
+                start_field=data_type.year_month_interval.start_field,
+                end_field=data_type.year_month_interval.end_field,
+            )
+        case "day_time_interval":
+            # Preserve start_field and end_field from protobuf
+            return snowpark.types.DayTimeIntervalType(
+                start_field=data_type.day_time_interval.start_field,
+                end_field=data_type.day_time_interval.end_field,
+            )
         case _:
             return map_simple_types(data_type.WhichOneof("kind"))

@@ -523,6 +569,12 @@ def map_snowpark_types_to_pyarrow_types(
             return pa.timestamp(unit, tz=tz)
         case snowpark.types.VariantType:
             return pa.string()
+        case snowpark.types.YearMonthIntervalType:
+            # Return string type so formatted intervals are preserved in display
+            return pa.string()
+        case snowpark.types.DayTimeIntervalType:
+            # Return string type so formatted intervals are preserved in display
+            return pa.string()
         case _:
             raise SnowparkConnectNotImplementedError(
                 f"Unsupported snowpark data type: {snowpark_type}"
@@ -676,6 +728,14 @@ def map_pyspark_types_to_snowpark_types(
         return snowpark.types.TimestampType()
     if isinstance(type_to_map, pyspark.sql.types.TimestampNTZType):
         return snowpark.types.TimestampType(timezone=TimestampTimeZone.NTZ)
+    if isinstance(type_to_map, pyspark.sql.types.YearMonthIntervalType):
+        return snowpark.types.YearMonthIntervalType(
+            type_to_map.startField, type_to_map.endField
+        )
+    if isinstance(type_to_map, pyspark.sql.types.DayTimeIntervalType):
+        return snowpark.types.DayTimeIntervalType(
+            type_to_map.startField, type_to_map.endField
+        )
     raise SnowparkConnectNotImplementedError(
         f"Unsupported spark data type: {type_to_map}"
     )
@@ -743,6 +803,14 @@ def map_snowpark_to_pyspark_types(
         if type_to_map.tz == snowpark.types.TimestampTimeZone.NTZ:
             return pyspark.sql.types.TimestampNTZType()
         return pyspark.sql.types.TimestampType()
+    if isinstance(type_to_map, snowpark.types.YearMonthIntervalType):
+        return pyspark.sql.types.YearMonthIntervalType(
+            type_to_map.start_field, type_to_map.end_field
+        )
+    if isinstance(type_to_map, snowpark.types.DayTimeIntervalType):
+        return pyspark.sql.types.DayTimeIntervalType(
+            type_to_map.start_field, type_to_map.end_field
+        )
     raise SnowparkConnectNotImplementedError(f"Unsupported data type: {type_to_map}")


@@ -785,10 +853,14 @@ def map_simple_types(simple_type: str) -> snowpark.types.DataType:
             return snowpark.types.TimestampType(snowpark.types.TimestampTimeZone.NTZ)
         case "timestamp_ltz":
             return snowpark.types.TimestampType(snowpark.types.TimestampTimeZone.LTZ)
+        case "year_month_interval":
+            return snowpark.types.YearMonthIntervalType()
         case "day_time_interval":
-            # this is not a column type in snowflake so there won't be a dataframe column
-            # with this, for now this type won't make any sense
-            return snowpark.types.StringType()
+            return snowpark.types.DayTimeIntervalType()
+        case type_name if _INTERVAL_YEARMONTH_PATTERN_RE.match(type_name):
+            return snowpark.types.YearMonthIntervalType()
+        case type_name if _INTERVAL_DAYTIME_PATTERN_RE.match(type_name):
+            return snowpark.types.DayTimeIntervalType()
         case _:
             if simple_type.startswith("decimal"):
                 precision = int(simple_type.split("(")[1].split(",")[0])
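A worked sketch (standalone, hypothetical function name) of the YearMonthIntervalType value conversion added to cast_to_match_snowpark_type: a month count and Snowflake's signed "+Y-M" text both become Spark's INTERVAL 'Y-M' YEAR TO MONTH literal form:

def to_spark_year_month_literal(content) -> str:
    # Mirrors the conversion in the diff, for illustration only.
    if isinstance(content, (int, float)):
        total_months = int(content)
        years = total_months // 12
        months = total_months % 12
        return f"INTERVAL '{years}-{months}' YEAR TO MONTH"
    if isinstance(content, str) and content.startswith(("+", "-")):
        sign, interval_part = content[0], content[1:]
        prefix = "-" if sign == "-" else ""
        return f"INTERVAL '{prefix}{interval_part}' YEAR TO MONTH"
    return str(content)

print(to_spark_year_month_literal(140))       # INTERVAL '11-8' YEAR TO MONTH
print(to_spark_year_month_literal("+11-08"))  # INTERVAL '11-08' YEAR TO MONTH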
snowflake/snowpark_connect/utils/describe_query_cache.py

@@ -16,7 +16,6 @@ from snowflake.snowpark_connect.utils.concurrent import SynchronizedDict
 from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
 from snowflake.snowpark_connect.utils.telemetry import telemetry

-DESCRIBE_CACHE_TTL_SECONDS = 15
 USE_DESCRIBE_QUERY_CACHE = True

 DDL_DETECTION_PATTERN = re.compile(r"\s*(CREATE|ALTER|DROP)\b", re.IGNORECASE)
@@ -51,6 +50,8 @@ class DescribeQueryCache:
         return sql_query

     def get(self, sql_query: str) -> list[ResultMetadataV2] | None:
+        from snowflake.snowpark_connect.config import get_describe_cache_ttl_seconds
+
         telemetry.report_describe_query_cache_lookup()

         cache_key = self._get_cache_key(sql_query)
@@ -59,7 +60,9 @@ class DescribeQueryCache:

         if key in self._cache:
             result, timestamp = self._cache[key]
-            if current_time < timestamp + DESCRIBE_CACHE_TTL_SECONDS:
+
+            expired_by = current_time - (timestamp + get_describe_cache_ttl_seconds())
+            if expired_by < 0:
                 logger.debug(
                     f"Returning query result from cache for query: {sql_query[:20]}"
                 )
@@ -92,7 +95,7 @@ class DescribeQueryCache:
                 telemetry.report_describe_query_cache_hit()
                 return result
             else:
-                telemetry.report_describe_query_cache_expired()
+                telemetry.report_describe_query_cache_expired(expired_by)
                 del self._cache[key]
                 return None
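A small sketch (hypothetical standalone function) of the reworked expiry check: a negative expired_by means the entry is still within the TTL now read from get_describe_cache_ttl_seconds(), and the non-negative overshoot is what gets passed to report_describe_query_cache_expired() when an entry is evicted:

import time

def check_cache_entry(timestamp: float, ttl_seconds: float) -> tuple[bool, float]:
    # Returns (is_fresh, expired_by). expired_by < 0 while the entry is valid;
    # once non-negative, it measures how many seconds past its TTL the entry sat in cache.
    expired_by = time.time() - (timestamp + ttl_seconds)
    return expired_by < 0, expired_by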