snowpark-connect 0.20.2__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +3 -2
- snowflake/snowpark_connect/column_name_handler.py +6 -65
- snowflake/snowpark_connect/config.py +28 -14
- snowflake/snowpark_connect/dataframe_container.py +242 -0
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +13 -23
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +9 -5
- snowflake/snowpark_connect/expression/map_extension.py +2 -1
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +8 -7
- snowflake/snowpark_connect/expression/map_unresolved_function.py +279 -43
- snowflake/snowpark_connect/expression/map_unresolved_star.py +8 -8
- snowflake/snowpark_connect/expression/map_update_fields.py +1 -1
- snowflake/snowpark_connect/expression/typer.py +6 -6
- snowflake/snowpark_connect/proto/control_pb2.py +17 -16
- snowflake/snowpark_connect/proto/control_pb2.pyi +17 -17
- snowflake/snowpark_connect/proto/control_pb2_grpc.py +12 -63
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +15 -14
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +19 -14
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +27 -26
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +74 -68
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +5 -5
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +25 -17
- snowflake/snowpark_connect/relation/map_aggregate.py +72 -47
- snowflake/snowpark_connect/relation/map_catalog.py +2 -2
- snowflake/snowpark_connect/relation/map_column_ops.py +207 -144
- snowflake/snowpark_connect/relation/map_crosstab.py +25 -6
- snowflake/snowpark_connect/relation/map_extension.py +81 -56
- snowflake/snowpark_connect/relation/map_join.py +72 -63
- snowflake/snowpark_connect/relation/map_local_relation.py +35 -20
- snowflake/snowpark_connect/relation/map_map_partitions.py +21 -16
- snowflake/snowpark_connect/relation/map_relation.py +22 -16
- snowflake/snowpark_connect/relation/map_row_ops.py +232 -146
- snowflake/snowpark_connect/relation/map_sample_by.py +15 -8
- snowflake/snowpark_connect/relation/map_show_string.py +42 -5
- snowflake/snowpark_connect/relation/map_sql.py +155 -78
- snowflake/snowpark_connect/relation/map_stats.py +88 -39
- snowflake/snowpark_connect/relation/map_subquery_alias.py +13 -14
- snowflake/snowpark_connect/relation/map_udtf.py +6 -9
- snowflake/snowpark_connect/relation/read/map_read.py +8 -3
- snowflake/snowpark_connect/relation/read/map_read_csv.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_json.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_socket.py +7 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +25 -16
- snowflake/snowpark_connect/relation/read/map_read_text.py +7 -7
- snowflake/snowpark_connect/relation/utils.py +11 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +15 -12
- snowflake/snowpark_connect/relation/write/map_write.py +199 -40
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +3 -2
- snowflake/snowpark_connect/server.py +34 -4
- snowflake/snowpark_connect/type_mapping.py +2 -23
- snowflake/snowpark_connect/utils/cache.py +27 -22
- snowflake/snowpark_connect/utils/context.py +33 -17
- snowflake/snowpark_connect/utils/{attribute_handling.py → identifiers.py} +47 -0
- snowflake/snowpark_connect/utils/session.py +41 -34
- snowflake/snowpark_connect/utils/telemetry.py +1 -2
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/METADATA +5 -3
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/RECORD +67 -64
- snowpark_connect-0.21.0.dist-info/licenses/LICENSE-binary +568 -0
- snowpark_connect-0.21.0.dist-info/licenses/NOTICE-binary +1533 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/map_local_relation.py

@@ -11,12 +11,13 @@ import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer.analyzer_utils import unquote_if_quoted
+from snowflake.snowpark._internal.utils import is_in_stored_procedure
 from snowflake.snowpark.types import LongType, StructField, StructType
 from snowflake.snowpark_connect import tcm
 from snowflake.snowpark_connect.column_name_handler import (
     make_column_names_snowpark_compatible,
-    with_column_map,
 )
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.type_mapping import (
     map_json_schema_to_snowpark,
     map_pyarrow_to_snowpark_types,
@@ -126,10 +127,8 @@ def map_pylist_cell_to_python_object(cell, type: pa.lib.DataType):
                 map_pylist_cell_to_python_object(obj, list_type.value_type)
                 for obj in cell
             ]
-        case map_type if (
-            cell is not None
-            and isinstance(type, pa.lib.MapType)
-            and all(isinstance(obj, tuple) and len(obj) == 2 for obj in cell)
+        case map_type if cell is not None and isinstance(type, pa.lib.MapType) and all(
+            isinstance(obj, tuple) and len(obj) == 2 for obj in cell
         ):
             # the MapType in arrow becomes list in pylist_df,
             # e.g. {"Car": "Honda", "Bike": "Yamaha"} --> [("Car", "Honda"), ("Bike", "Yamaha")] , and causes some
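For context on the `case map_type` guard above: PyArrow hands MapType cells back as lists of (key, value) tuples, not dicts, when data is converted to Python objects. A minimal standalone illustration (not part of the package):

import pyarrow as pa

# A MapType cell round-trips through to_pylist() as a list of
# (key, value) tuples per row -- the exact shape the guard detects.
arr = pa.array(
    [{"Car": "Honda", "Bike": "Yamaha"}],
    type=pa.map_(pa.string(), pa.string()),
)
print(arr.to_pylist())  # [[('Car', 'Honda'), ('Bike', 'Yamaha')]]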
@@ -171,7 +170,9 @@ def map_pandas_cell_to_python_object(cell):
     return res if res == res else None
 
 
-def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
+def map_local_relation(
+    rel: relation_proto.Relation,
+) -> DataFrameContainer:
     if rel.local_relation.HasField("data"):
         data = pa.BufferReader(rel.local_relation.data)
         with pa.ipc.open_stream(data) as reader:
@@ -204,9 +205,9 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
             # Only create the pandas dataframe for empty dataframe cases.
             pandas_df = table.to_pandas()
             snowpark_df: snowpark.DataFrame = session.create_dataframe(pandas_df)
-            return with_column_map(
-                snowpark_df,
-                spark_column_names,
+            return DataFrameContainer.create_with_column_mapping(
+                dataframe=snowpark_df,
+                spark_column_names=spark_column_names,
                 snowpark_column_names=new_columns,
                 column_metadata=column_metadata,
             )
@@ -230,16 +231,22 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
     # Special characters in the schema currently break create_dataframe with arrow
     # https://snowflakecomputing.atlassian.net/browse/SNOW-2199291
     current_schema = session.get_current_schema()
+
+    # _create_temp_stage() changes were not ported to the internal connector, leading to this
+    # error on TCM and in notebooks (sproc):
+    # TypeError: _create_temp_stage() takes 7 positional arguments but 8 were given
     use_pyarrow = (
-        re.match(
+        not is_in_stored_procedure()
+        # TODO: SNOW-2220726 investigate why use_pyarrow failed in TCM:
+        and not tcm.TCM_MODE
+        and re.match(
             # See https://docs.snowflake.com/en/sql-reference/identifiers-syntax
             r"[A-Za-z_][A-Za-z0-9_\$]*",
             # Schema may be double-quoted.
             current_schema.strip('"') if current_schema is not None else "",
         )
         is not None
-    )
-    # TypeError: _create_temp_stage() takes 7 positional arguments but 8 were given
+    )
 
     if use_pyarrow:
         snowpark_df: snowpark.DataFrame = session.create_dataframe(
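The reworked `use_pyarrow` gate only takes the Arrow fast path outside stored procedures, outside TCM, and when the current schema looks like an unquoted Snowflake identifier. A small illustration of the identifier test with hypothetical schema names (note that `re.match` anchors only at the start of the string):

import re

# Same pattern as in the hunk above.
IDENTIFIER = r"[A-Za-z_][A-Za-z0-9_\$]*"

for schema in ("MY_SCHEMA", "data$mart", "123_schema"):
    print(schema, re.match(IDENTIFIER, schema) is not None)
# MY_SCHEMA True, data$mart True, 123_schema False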
@@ -257,6 +264,7 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
         ]
 
         snowpark_df = snowpark_df.select(*casted_columns)
+
     else:
         pylist_df = [
             list(row)
@@ -285,9 +293,9 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
             snowpark_schema,
         )
 
-        return with_column_map(
-            snowpark_df,
-            spark_column_names,
+        return DataFrameContainer.create_with_column_mapping(
+            dataframe=snowpark_df,
+            spark_column_names=spark_column_names,
             snowpark_column_names=new_columns,
             column_metadata=column_metadata,
             snowpark_column_types=[f.datatype for f in snowpark_schema.fields],
@@ -305,9 +313,9 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
             [],
             snowpark_schema,
         )
-        return with_column_map(
-            snowpark_df,
-            spark_column_names,
+        return DataFrameContainer.create_with_column_mapping(
+            dataframe=snowpark_df,
+            spark_column_names=spark_column_names,
             snowpark_column_names=new_columns,
             column_metadata=column_metadata,
         )
@@ -317,10 +325,17 @@ def map_local_relation(rel: relation_proto.Relation) -> snowpark.DataFrame:
     )
 
 
-def map_range(rel: relation_proto.Relation) -> snowpark.DataFrame:
+def map_range(
+    rel: relation_proto.Relation,
+) -> DataFrameContainer:
     session = get_or_create_snowpark_session()
     new_columns = make_column_names_snowpark_compatible(["id"], rel.common.plan_id)
     result = session.range(
         rel.range.start, rel.range.end, rel.range.step
     ).with_column_renamed("ID", new_columns[0])
-    return with_column_map(result, ["id"], new_columns, [LongType()])
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=result,
+        spark_column_names=["id"],
+        snowpark_column_names=new_columns,
+        snowpark_column_types=[LongType()],
+    )
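Every mapper in this file now returns a `DataFrameContainer` instead of a bare DataFrame plus column lists. The container lives in the new `dataframe_container.py` (+242 lines), whose body is not shown in this diff; the sketch below is a hypothetical reconstruction inferred only from the call sites visible here (`dataframe`, `column_map`, `table_name`, `alias`, `cached_schema_getter`, and the `create_with_column_mapping` factory) and is not the actual implementation:

# Hypothetical sketch, reconstructed from call sites in this diff only.
from typing import Callable, List, Optional

from snowflake import snowpark
from snowflake.snowpark.types import StructType


class ColumnMap:
    """Pairs Spark-visible column names with the renamed Snowpark columns."""

    def __init__(self, spark_columns: List[str], snowpark_columns: List[str]):
        self.spark_columns = spark_columns
        self.snowpark_columns = snowpark_columns

    def get_spark_columns(self) -> List[str]:
        return self.spark_columns


class DataFrameContainer:
    def __init__(
        self,
        dataframe: snowpark.DataFrame,
        column_map: Optional[ColumnMap] = None,
        table_name: Optional[str] = None,
        alias: Optional[str] = None,
        cached_schema_getter: Optional[Callable[[], StructType]] = None,
    ):
        self.dataframe = dataframe
        self.column_map = column_map
        self.table_name = table_name
        self.alias = alias
        # Deferring schema access avoids triggering a describe query.
        self._cached_schema_getter = cached_schema_getter

    @classmethod
    def create_with_column_mapping(
        cls,
        dataframe: snowpark.DataFrame,
        spark_column_names: List[str],
        snowpark_column_names: List[str],
        column_metadata=None,        # assumed: per-column metadata
        snowpark_column_types=None,  # assumed: cached types to skip a describe
    ) -> "DataFrameContainer":
        return cls(
            dataframe,
            column_map=ColumnMap(spark_column_names, snowpark_column_names),
        )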
snowflake/snowpark_connect/relation/map_map_partitions.py

@@ -8,9 +8,9 @@ from pyspark.sql.connect.proto.expressions_pb2 import CommonInlineUserDefinedFunction
 import snowflake.snowpark.functions as snowpark_fn
 from snowflake import snowpark
 from snowflake.snowpark.types import StructType
-from snowflake.snowpark_connect.column_name_handler import with_column_map
 from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.constants import MAP_IN_ARROW_EVAL_TYPE
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 from snowflake.snowpark_connect.type_mapping import proto_to_snowpark_type
 from snowflake.snowpark_connect.utils.pandas_udtf_utils import create_pandas_udtf
@@ -30,13 +30,16 @@ from snowflake.snowpark_connect.utils.udtf_helper import (
 )
 
 
-def map_map_partitions(rel: relation_proto.Relation) -> snowpark.DataFrame:
+def map_map_partitions(
+    rel: relation_proto.Relation,
+) -> DataFrameContainer:
     """
     Map a function over the partitions of the input DataFrame.
 
     This is a simple wrapper around the `mapInPandas` method in Snowpark.
     """
-    input_df = map_relation(rel.map_partitions.input)
+    input_container = map_relation(rel.map_partitions.input)
+    input_df = input_container.dataframe
     udf_proto = rel.map_partitions.func
     udf_check(udf_proto)
 
@@ -44,7 +47,7 @@ def map_map_partitions(rel: relation_proto.Relation) -> snowpark.DataFrame:
     eval_type = udf_proto.python_udf.eval_type
 
     if eval_type == MAP_IN_ARROW_EVAL_TYPE:
-        return _map_in_arrow_with_pandas_udtf(input_df, udf_proto)
+        return _map_in_arrow_with_pandas_udtf(input_container, udf_proto)
     else:
         return _map_partitions_with_udf(input_df, udf_proto)
 
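For orientation, this dispatch is the server-side handler for Spark Connect's `mapInArrow` (the UDF branch covers the `mapInPandas`-style path). A client-side snippet of the kind of call that lands here, with an illustrative connection URL:

import pyarrow as pa
import pyarrow.compute as pc
from pyspark.sql import SparkSession

# Illustrative Spark Connect endpoint for a snowpark-connect server.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
df = spark.range(10)


def double(batches):
    # mapInArrow receives an iterator of pyarrow.RecordBatch per partition.
    for batch in batches:
        yield pa.RecordBatch.from_arrays(
            [pc.multiply(batch.column("id"), 2)], names=["id"]
        )


df.mapInArrow(double, "id long").show()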
@@ -76,22 +79,24 @@ def _call_udtf(
     # Only return the output columns.
     result_df = result_df_with_dummy.select(*output_cols)
 
-    return with_column_map(
-        result_df,
-        output_cols,
-        output_cols,
-        [field.datatype for field in return_type.fields],
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=result_df,
+        spark_column_names=output_cols,
+        snowpark_column_names=output_cols,
+        snowpark_column_types=[field.datatype for field in return_type.fields],
     )
 
 
 def _map_in_arrow_with_pandas_udtf(
-    input_df: snowpark.DataFrame, udf_proto: CommonInlineUserDefinedFunction
+    input_df_container: DataFrameContainer,
+    udf_proto: CommonInlineUserDefinedFunction,
 ) -> snowpark.DataFrame:
     """
     Handle mapInArrow using pandas_udtf for partition-level Arrow processing.
     """
+    input_df = input_df_container.dataframe
     input_schema = input_df.schema
-    spark_column_names =
+    spark_column_names = input_df_container.column_map.get_spark_columns()
     return_type = proto_to_snowpark_type(udf_proto.python_udf.output_type)
     if require_creating_udtf_in_sproc(udf_proto):
         udtf_name = create_pandas_udtf_in_sproc(
@@ -138,9 +143,9 @@ def _map_partitions_with_udf(
     udf_column_name = "UDF_OUTPUT"
     snowpark_columns = [snowpark_fn.col(name) for name in input_df.columns]
     result = input_df.select(snowpark_fn.call_udf(snowpark_udf.name, *snowpark_columns))
-    return with_column_map(
-        result,
-        [udf_column_name],
-        [udf_column_name],
-        [snowpark_udf.return_type],
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=result,
+        spark_column_names=[udf_column_name],
+        snowpark_column_names=[udf_column_name],
+        snowpark_column_types=[snowpark_udf.return_type],
     )
snowflake/snowpark_connect/relation/map_relation.py

@@ -7,8 +7,7 @@ import copy
 import pandas
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
-from snowflake import snowpark
-from snowflake.snowpark_connect.column_name_handler import set_schema_getter
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.utils.cache import (
     df_cache_map_get,
     df_cache_map_put_if_absent,
@@ -32,8 +31,8 @@ NATURAL_JOIN_TYPE_BASE = 22
 
 def map_relation(
     rel: relation_proto.Relation, reuse_parsed_plan: bool = True
-) -> snowpark.DataFrame | pandas.DataFrame:
-    """Map a Spark Protobuf Relation message to a Snowpark DataFrame or pandas DataFrame.
+) -> DataFrameContainer | pandas.DataFrame:
+    """Map a Spark Protobuf Relation message to a DataFrameContainer or pandas DataFrame.
 
     NOTE: We return a pandas DataFrame object when the return value of the operation is a
     scalar value. The client expects these as an Arrow buffer with this return value packed
@@ -43,11 +42,11 @@ def map_relation(
 
     Args:
         rel (relation_proto.Relation): The Spark Protobuf Relation message to map.
-        reuse_parsed_plan (bool, optional): If True, reuses previously parsed DataFrame from cache
+        reuse_parsed_plan (bool, optional): If True, reuses previously parsed container from cache
             to avoid redundant operations.
 
     Returns:
-        snowpark.DataFrame | pandas.DataFrame: The Snowpark DataFrame or pandas DataFrame
+        DataFrameContainer | pandas.DataFrame: The DataFrameContainer or pandas DataFrame
             that corresponds to the input Spark Protobuf Relation message.
     """
     # TODO: from snowflake_connect_server.relation import map_extension
@@ -75,22 +74,27 @@ def map_relation(
     # Check for cached relation
     cache_entry = df_cache_map_get((get_session_id(), rel.common.plan_id))
     if cache_entry is not None:
-        if isinstance(cache_entry, snowpark.DataFrame):
+        if isinstance(cache_entry, DataFrameContainer):
             set_plan_id_map(rel.common.plan_id, cache_entry)
             return cache_entry
 
     # If df is not cached, check if we have parsed the plan
-    cached_df = get_plan_id_map(rel.common.plan_id)
-    if cached_df is not None:
+    cached_container = get_plan_id_map(rel.common.plan_id)
+    if cached_container is not None:
+        cached_df = cached_container.dataframe
         result = copy.copy(cached_df)
-
-
-
+        # Create new container without triggering schema access
+        result_container = DataFrameContainer(
+            result,
+            column_map=copy.deepcopy(cached_container.column_map),
+            table_name=copy.deepcopy(cached_container.table_name),
+            alias=cached_container.alias,
+            cached_schema_getter=lambda: cached_df.schema,
+        )
         # If we don't make a copy of the df._output, the expression IDs for attributes in Snowpark DataFrames will differ from those stored in the cache,
         # leading to errors during query execution.
         result._output = cached_df._output
-
-        return result
+        return result_container
 
     if rel.WhichOneof("rel_type") is not None:
         logger.info(rel.WhichOneof("rel_type").upper())
@@ -99,7 +103,7 @@ def map_relation(
         # type was incorrectly routed here.
         raise SnowparkConnectNotImplementedError("No Relation Type")
 
-    result: snowpark.DataFrame | pandas.DataFrame
+    result: DataFrameContainer | pandas.DataFrame
     operation = rel.WhichOneof("rel_type")
     with push_operation_scope(operation):
         match operation:
@@ -248,6 +252,8 @@ def map_relation(
             case other:
                 raise SnowparkConnectNotImplementedError(f"Other Relation {other}")
 
-    if isinstance(result, snowpark.DataFrame):
+    # Store container in plan cache
+    if isinstance(result, DataFrameContainer):
         set_plan_id_map(rel.common.plan_id, result)
+
     return result