snowpark-connect 0.20.2__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (67)
  1. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +3 -2
  2. snowflake/snowpark_connect/column_name_handler.py +6 -65
  3. snowflake/snowpark_connect/config.py +28 -14
  4. snowflake/snowpark_connect/dataframe_container.py +242 -0
  5. snowflake/snowpark_connect/execute_plan/map_execution_command.py +13 -23
  6. snowflake/snowpark_connect/execute_plan/map_execution_root.py +9 -5
  7. snowflake/snowpark_connect/expression/map_extension.py +2 -1
  8. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +8 -7
  9. snowflake/snowpark_connect/expression/map_unresolved_function.py +279 -43
  10. snowflake/snowpark_connect/expression/map_unresolved_star.py +8 -8
  11. snowflake/snowpark_connect/expression/map_update_fields.py +1 -1
  12. snowflake/snowpark_connect/expression/typer.py +6 -6
  13. snowflake/snowpark_connect/proto/control_pb2.py +17 -16
  14. snowflake/snowpark_connect/proto/control_pb2.pyi +17 -17
  15. snowflake/snowpark_connect/proto/control_pb2_grpc.py +12 -63
  16. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +15 -14
  17. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +19 -14
  18. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +27 -26
  19. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +74 -68
  20. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +5 -5
  21. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +25 -17
  22. snowflake/snowpark_connect/relation/map_aggregate.py +72 -47
  23. snowflake/snowpark_connect/relation/map_catalog.py +2 -2
  24. snowflake/snowpark_connect/relation/map_column_ops.py +207 -144
  25. snowflake/snowpark_connect/relation/map_crosstab.py +25 -6
  26. snowflake/snowpark_connect/relation/map_extension.py +81 -56
  27. snowflake/snowpark_connect/relation/map_join.py +72 -63
  28. snowflake/snowpark_connect/relation/map_local_relation.py +35 -20
  29. snowflake/snowpark_connect/relation/map_map_partitions.py +21 -16
  30. snowflake/snowpark_connect/relation/map_relation.py +22 -16
  31. snowflake/snowpark_connect/relation/map_row_ops.py +232 -146
  32. snowflake/snowpark_connect/relation/map_sample_by.py +15 -8
  33. snowflake/snowpark_connect/relation/map_show_string.py +42 -5
  34. snowflake/snowpark_connect/relation/map_sql.py +155 -78
  35. snowflake/snowpark_connect/relation/map_stats.py +88 -39
  36. snowflake/snowpark_connect/relation/map_subquery_alias.py +13 -14
  37. snowflake/snowpark_connect/relation/map_udtf.py +6 -9
  38. snowflake/snowpark_connect/relation/read/map_read.py +8 -3
  39. snowflake/snowpark_connect/relation/read/map_read_csv.py +7 -7
  40. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +7 -7
  41. snowflake/snowpark_connect/relation/read/map_read_json.py +7 -7
  42. snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -7
  43. snowflake/snowpark_connect/relation/read/map_read_socket.py +7 -3
  44. snowflake/snowpark_connect/relation/read/map_read_table.py +25 -16
  45. snowflake/snowpark_connect/relation/read/map_read_text.py +7 -7
  46. snowflake/snowpark_connect/relation/utils.py +11 -5
  47. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +15 -12
  48. snowflake/snowpark_connect/relation/write/map_write.py +199 -40
  49. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +3 -2
  50. snowflake/snowpark_connect/server.py +34 -4
  51. snowflake/snowpark_connect/type_mapping.py +2 -23
  52. snowflake/snowpark_connect/utils/cache.py +27 -22
  53. snowflake/snowpark_connect/utils/context.py +33 -17
  54. snowflake/snowpark_connect/utils/{attribute_handling.py → identifiers.py} +47 -0
  55. snowflake/snowpark_connect/utils/session.py +41 -34
  56. snowflake/snowpark_connect/utils/telemetry.py +1 -2
  57. snowflake/snowpark_connect/version.py +1 -1
  58. {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/METADATA +5 -3
  59. {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/RECORD +67 -64
  60. snowpark_connect-0.21.0.dist-info/licenses/LICENSE-binary +568 -0
  61. snowpark_connect-0.21.0.dist-info/licenses/NOTICE-binary +1533 -0
  62. {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-connect +0 -0
  63. {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-session +0 -0
  64. {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-submit +0 -0
  65. {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/WHEEL +0 -0
  66. {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
  67. {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/map_local_relation.py

@@ -11,12 +11,13 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
+from snowflake.snowpark._internal.utils import is_in_stored_procedure
 from snowflake.snowpark.types import LongType, StructField, StructType
 from snowflake.snowpark_connect import tcm
 from snowflake.snowpark_connect.column_name_handler import (
     make_column_names_snowpark_compatible,
-    with_column_map,
 )
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.type_mapping import (
     map_json_schema_to_snowpark,
     map_pyarrow_to_snowpark_types,
@@ -126,10 +127,8 @@ def map_pylist_cell_to_python_object(cell, type: pa.lib.DataType):
                 map_pylist_cell_to_python_object(obj, list_type.value_type)
                 for obj in cell
             ]
-        case map_type if (
-            cell is not None
-            and isinstance(type, pa.lib.MapType)
-            and all(isinstance(obj, tuple) and len(obj) == 2 for obj in cell)
+        case map_type if cell is not None and isinstance(type, pa.lib.MapType) and all(
+            isinstance(obj, tuple) and len(obj) == 2 for obj in cell
         ):
             # the MapType in arrow becomes list in pylist_df,
             # e.g. {"Car": "Honda", "Bike": "Yamaha"} --> [("Car", "Honda"), ("Bike", "Yamaha")] , and causes some
@@ -171,7 +170,9 @@ def map_pandas_cell_to_python_object(cell):
     return res if res == res else None
 
 
-def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
+def map_local_relation(
+    rel: relation_proto.Relation,
+) -> DataFrameContainer:
     if rel.local_relation.HasField("data"):
         data = pa.BufferReader(rel.local_relation.data)
         with pa.ipc.open_stream(data) as reader:
@@ -204,9 +205,9 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
             # Only create the pandas dataframe for empty dataframe cases.
             pandas_df = table.to_pandas()
            snowpark_df: snowpark.DataFrame = session.create_dataframe(pandas_df)
-            return with_column_map(
-                snowpark_df,
-                spark_column_names,
+            return DataFrameContainer.create_with_column_mapping(
+                dataframe=snowpark_df,
+                spark_column_names=spark_column_names,
                 snowpark_column_names=new_columns,
                 column_metadata=column_metadata,
             )
@@ -230,16 +231,22 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
         # Special characters in the schema currently break create_dataframe with arrow
         # https://snowflakecomputing.atlassian.net/browse/SNOW-2199291
         current_schema = session.get_current_schema()
+
+        # _create_temp_stage() changes were not ported to the internal connector, leading to this
+        # error on TCM and in notebooks (sproc):
+        # TypeError: _create_temp_stage() takes 7 positional arguments but 8 were given
         use_pyarrow = (
-            re.match(
+            not is_in_stored_procedure()
+            # TODO: SNOW-2220726 investigate why use_pyarrow failed in TCM:
+            and not tcm.TCM_MODE
+            and re.match(
                 # See https://docs.snowflake.com/en/sql-reference/identifiers-syntax
                 r"[A-Za-z_][A-Za-z0-9_\$]*",
                 # Schema may be double-quoted.
                 current_schema.strip('"') if current_schema is not None else "",
             )
             is not None
-        ) and not tcm.TCM_MODE  # TODO: SNOW-2220726 investigate why use_pyarrow failed in TCM:
-        # TypeError: _create_temp_stage() takes 7 positional arguments but 8 were given
+        )
 
         if use_pyarrow:
             snowpark_df: snowpark.DataFrame = session.create_dataframe(
@@ -257,6 +264,7 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
             ]
 
             snowpark_df = snowpark_df.select(*casted_columns)
+
         else:
             pylist_df = [
                 list(row)
@@ -285,9 +293,9 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
                 snowpark_schema,
             )
 
-            return with_column_map(
-                snowpark_df,
-                spark_column_names,
+            return DataFrameContainer.create_with_column_mapping(
+                dataframe=snowpark_df,
+                spark_column_names=spark_column_names,
                 snowpark_column_names=new_columns,
                 column_metadata=column_metadata,
                 snowpark_column_types=[f.datatype for f in snowpark_schema.fields],
@@ -305,9 +313,9 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
                 [],
                 snowpark_schema,
             )
-            return with_column_map(
-                snowpark_df,
-                spark_column_names,
+            return DataFrameContainer.create_with_column_mapping(
+                dataframe=snowpark_df,
+                spark_column_names=spark_column_names,
                 snowpark_column_names=new_columns,
                 column_metadata=column_metadata,
             )
@@ -317,10 +325,17 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
     )
 
 
-def map_range(rel: relation_proto.Relation) -> snowpark.DataFrame:
+def map_range(
+    rel: relation_proto.Relation,
+) -> DataFrameContainer:
     session = get_or_create_snowpark_session()
     new_columns = make_column_names_snowpark_compatible(["id"], rel.common.plan_id)
     result = session.range(
         rel.range.start, rel.range.end, rel.range.step
     ).with_column_renamed("ID", new_columns[0])
-    return with_column_map(result, ["id"], new_columns, [LongType()])
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=result,
+        spark_column_names=["id"],
+        snowpark_column_names=new_columns,
+        snowpark_column_types=[LongType()],
+    )
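
Every with_column_map call site in this file now goes through DataFrameContainer.create_with_column_mapping. A minimal sketch of the migration, assuming only the keyword signature visible in the hunks above; the helper name wrap_range_result and the standalone framing are illustrative, not part of the package:

# Sketch only: signature inferred from the call sites in this diff.
from snowflake.snowpark import DataFrame
from snowflake.snowpark.types import LongType

from snowflake.snowpark_connect.dataframe_container import DataFrameContainer


def wrap_range_result(result: DataFrame, new_columns: list[str]) -> DataFrameContainer:
    # 0.20.2 shape: with_column_map(result, ["id"], new_columns, [LongType()])
    # 0.21.0 shape: an explicit container built with keyword arguments.
    return DataFrameContainer.create_with_column_mapping(
        dataframe=result,                    # underlying snowpark.DataFrame
        spark_column_names=["id"],           # names the Spark client sees
        snowpark_column_names=new_columns,   # renamed, Snowpark-compatible names
        snowpark_column_types=[LongType()],  # optional; other call sites pass column_metadata
    )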
snowflake/snowpark_connect/relation/map_map_partitions.py

@@ -8,9 +8,9 @@ from pyspark.sql.connect.proto.expressions_pb2 import CommonInlineUserDefinedFun
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
 from snowflake.snowpark.types import StructType
-from snowflake.snowpark_connect.column_name_handler import with_column_map
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.constants import MAP_IN_ARROW_EVAL_TYPE
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 from snowflake.snowpark_connect.type_mapping import proto_to_snowpark_type
 from snowflake.snowpark_connect.utils.pandas_udtf_utils import create_pandas_udtf
@@ -30,13 +30,16 @@ from snowflake.snowpark_connect.utils.udtf_helper import (
 )
 
 
-def map_map_partitions(rel: relation_proto.Relation) -> snowpark.DataFrame:
+def map_map_partitions(
+    rel: relation_proto.Relation,
+) -> DataFrameContainer:
     """
     Map a function over the partitions of the input DataFrame.
 
     This is a simple wrapper around the `mapInPandas` method in Snowpark.
     """
-    input_df = map_relation(rel.map_partitions.input)
+    input_container = map_relation(rel.map_partitions.input)
+    input_df = input_container.dataframe
     udf_proto = rel.map_partitions.func
     udf_check(udf_proto)
 
@@ -44,7 +47,7 @@ def map_map_partitions(rel: relation_proto.Relation) -> snowpark.DataFrame:
     eval_type = udf_proto.python_udf.eval_type
 
     if eval_type == MAP_IN_ARROW_EVAL_TYPE:
-        return _map_in_arrow_with_pandas_udtf(input_df, udf_proto)
+        return _map_in_arrow_with_pandas_udtf(input_container, udf_proto)
     else:
         return _map_partitions_with_udf(input_df, udf_proto)
 
@@ -76,22 +79,24 @@ def _call_udtf(
     # Only return the output columns.
     result_df = result_df_with_dummy.select(*output_cols)
 
-    return with_column_map(
-        result_df,
-        output_cols,
-        output_cols,
-        [field.datatype for field in return_type.fields],
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=result_df,
+        spark_column_names=output_cols,
+        snowpark_column_names=output_cols,
+        snowpark_column_types=[field.datatype for field in return_type.fields],
     )
 
 
 def _map_in_arrow_with_pandas_udtf(
-    input_df: snowpark.DataFrame, udf_proto: CommonInlineUserDefinedFunction
+    input_df_container: DataFrameContainer,
+    udf_proto: CommonInlineUserDefinedFunction,
 ) -> snowpark.DataFrame:
     """
     Handle mapInArrow using pandas_udtf for partition-level Arrow processing.
     """
+    input_df = input_df_container.dataframe
     input_schema = input_df.schema
-    spark_column_names = input_df._column_map.get_spark_columns()
+    spark_column_names = input_df_container.column_map.get_spark_columns()
     return_type = proto_to_snowpark_type(udf_proto.python_udf.output_type)
     if require_creating_udtf_in_sproc(udf_proto):
         udtf_name = create_pandas_udtf_in_sproc(
@@ -138,9 +143,9 @@ def _map_partitions_with_udf(
     udf_column_name = "UDF_OUTPUT"
     snowpark_columns = [snowpark_fn.col(name) for name in input_df.columns]
     result = input_df.select(snowpark_fn.call_udf(snowpark_udf.name, *snowpark_columns))
-    return with_column_map(
-        result,
-        [udf_column_name],
-        [udf_column_name],
-        [snowpark_udf.return_type],
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=result,
+        spark_column_names=[udf_column_name],
+        snowpark_column_names=[udf_column_name],
+        snowpark_column_types=[snowpark_udf.return_type],
    )
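
Consumers of map_relation now receive a container rather than a raw DataFrame, as the _map_in_arrow_with_pandas_udtf change above shows. A small sketch of the unwrap pattern, assuming only the attributes used in this file (dataframe and column_map.get_spark_columns()); the helper name unwrap is illustrative:

# Sketch only: attribute names taken from the hunks above, nothing else assumed.
from snowflake.snowpark import DataFrame

from snowflake.snowpark_connect.dataframe_container import DataFrameContainer


def unwrap(container: DataFrameContainer) -> tuple[DataFrame, list[str]]:
    input_df = container.dataframe  # raw Snowpark DataFrame for Snowpark APIs
    # Replaces the former private hook input_df._column_map.get_spark_columns()
    spark_names = container.column_map.get_spark_columns()
    return input_df, spark_names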
snowflake/snowpark_connect/relation/map_relation.py

@@ -7,8 +7,7 @@ import copy
 import pandas
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
-from snowflake import snowpark
-from snowflake.snowpark_connect.column_name_handler import set_schema_getter
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.utils.cache import (
     df_cache_map_get,
     df_cache_map_put_if_absent,
@@ -32,8 +31,8 @@ NATURAL_JOIN_TYPE_BASE = 22
 
 def map_relation(
     rel: relation_proto.Relation, reuse_parsed_plan: bool = True
-) -> snowpark.DataFrame | pandas.DataFrame:
-    """Map a Spark Protobuf Relation message to a Snowpark DataFrame or pandas DataFrame.
+) -> DataFrameContainer | pandas.DataFrame:
+    """Map a Spark Protobuf Relation message to a DataFrameContainer or pandas DataFrame.
 
     NOTE: We return a pandas DataFrame object when the return value of the operation is a
     scalar value. The client expects these as an Arrow buffer with this return value packed
@@ -43,11 +42,11 @@ def map_relation(
 
     Args:
         rel (relation_proto.Relation): The Spark Protobuf Relation message to map.
-        reuse_parsed_plan (bool, optional): If True, reuses previously parsed df from cache
+        reuse_parsed_plan (bool, optional): If True, reuses previously parsed container from cache
             to avoid redundant operations.
 
     Returns:
-        snowpark.DataFrame | pandas.DataFrame: The Snowpark DataFrame or pandas DataFrame
+        DataFrameContainer | pandas.DataFrame: The DataFrameContainer or pandas DataFrame
             that corresponds to the input Spark Protobuf Relation message.
     """
     # TODO: from snowflake_connect_server.relation import map_extension
@@ -75,22 +74,27 @@ def map_relation(
     # Check for cached relation
     cache_entry = df_cache_map_get((get_session_id(), rel.common.plan_id))
     if cache_entry is not None:
-        if isinstance(cache_entry, snowpark.DataFrame):
+        if isinstance(cache_entry, DataFrameContainer):
             set_plan_id_map(rel.common.plan_id, cache_entry)
         return cache_entry
 
     # If df is not cached, check if we have parsed the plan
-    cached_df = get_plan_id_map(rel.common.plan_id)
-    if cached_df is not None:
+    cached_container = get_plan_id_map(rel.common.plan_id)
+    if cached_container is not None:
+        cached_df = cached_container.dataframe
         result = copy.copy(cached_df)
-        result._column_map = copy.deepcopy(cached_df._column_map)
-        result._table_name = copy.deepcopy(cached_df._table_name)
-        set_schema_getter(result, lambda: cached_df.schema)
+        # Create new container without triggering schema access
+        result_container = DataFrameContainer(
+            result,
+            column_map=copy.deepcopy(cached_container.column_map),
+            table_name=copy.deepcopy(cached_container.table_name),
+            alias=cached_container.alias,
+            cached_schema_getter=lambda: cached_df.schema,
+        )
         # If we don't make a copy of the df._output, the expression IDs for attributes in Snowpark DataFrames will differ from those stored in the cache,
         # leading to errors during query execution.
         result._output = cached_df._output
-
-        return result
+        return result_container
 
 
     if rel.WhichOneof("rel_type") is not None:
@@ -99,7 +103,7 @@ def map_relation(
         # type was incorrectly routed here.
         raise SnowparkConnectNotImplementedError("No Relation Type")
 
-    result: snowpark.DataFrame | pandas.DataFrame
+    result: DataFrameContainer | pandas.DataFrame
     operation = rel.WhichOneof("rel_type")
     with push_operation_scope(operation):
         match operation:
@@ -248,6 +252,8 @@ def map_relation(
             case other:
                 raise SnowparkConnectNotImplementedError(f"Other Relation {other}")
 
-    if isinstance(result, snowpark.DataFrame):
+    # Store container in plan cache
+    if isinstance(result, DataFrameContainer):
         set_plan_id_map(rel.common.plan_id, result)
+
     return result
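
The plan-cache path above replaces ad-hoc attributes patched onto the DataFrame (_column_map, _table_name, set_schema_getter) with an explicit container. A condensed sketch of that copy pattern, assuming the constructor arguments shown in the hunk; the function name copy_cached_plan is illustrative, not a verbatim excerpt:

# Sketch only: constructor arguments mirror the hunk above.
import copy

from snowflake.snowpark_connect.dataframe_container import DataFrameContainer


def copy_cached_plan(cached_container: DataFrameContainer) -> DataFrameContainer:
    cached_df = cached_container.dataframe
    result = copy.copy(cached_df)
    result_container = DataFrameContainer(
        result,
        column_map=copy.deepcopy(cached_container.column_map),
        table_name=copy.deepcopy(cached_container.table_name),
        alias=cached_container.alias,
        # Lazy getter: reuses the cached schema instead of re-deriving it,
        # so copying a plan does not trigger schema access.
        cached_schema_getter=lambda: cached_df.schema,
    )
    # Reuse _output so attribute expression IDs match those stored in the cache.
    result._output = cached_df._output
    return result_container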