snowpark-connect 0.20.2__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/analyze_plan/map_tree_string.py +3 -2
- snowflake/snowpark_connect/column_name_handler.py +6 -65
- snowflake/snowpark_connect/config.py +28 -14
- snowflake/snowpark_connect/dataframe_container.py +242 -0
- snowflake/snowpark_connect/execute_plan/map_execution_command.py +13 -23
- snowflake/snowpark_connect/execute_plan/map_execution_root.py +9 -5
- snowflake/snowpark_connect/expression/map_extension.py +2 -1
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +8 -7
- snowflake/snowpark_connect/expression/map_unresolved_function.py +279 -43
- snowflake/snowpark_connect/expression/map_unresolved_star.py +8 -8
- snowflake/snowpark_connect/expression/map_update_fields.py +1 -1
- snowflake/snowpark_connect/expression/typer.py +6 -6
- snowflake/snowpark_connect/proto/control_pb2.py +17 -16
- snowflake/snowpark_connect/proto/control_pb2.pyi +17 -17
- snowflake/snowpark_connect/proto/control_pb2_grpc.py +12 -63
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +15 -14
- snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +19 -14
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +27 -26
- snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +74 -68
- snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +5 -5
- snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +25 -17
- snowflake/snowpark_connect/relation/map_aggregate.py +72 -47
- snowflake/snowpark_connect/relation/map_catalog.py +2 -2
- snowflake/snowpark_connect/relation/map_column_ops.py +207 -144
- snowflake/snowpark_connect/relation/map_crosstab.py +25 -6
- snowflake/snowpark_connect/relation/map_extension.py +81 -56
- snowflake/snowpark_connect/relation/map_join.py +72 -63
- snowflake/snowpark_connect/relation/map_local_relation.py +35 -20
- snowflake/snowpark_connect/relation/map_map_partitions.py +21 -16
- snowflake/snowpark_connect/relation/map_relation.py +22 -16
- snowflake/snowpark_connect/relation/map_row_ops.py +232 -146
- snowflake/snowpark_connect/relation/map_sample_by.py +15 -8
- snowflake/snowpark_connect/relation/map_show_string.py +42 -5
- snowflake/snowpark_connect/relation/map_sql.py +155 -78
- snowflake/snowpark_connect/relation/map_stats.py +88 -39
- snowflake/snowpark_connect/relation/map_subquery_alias.py +13 -14
- snowflake/snowpark_connect/relation/map_udtf.py +6 -9
- snowflake/snowpark_connect/relation/read/map_read.py +8 -3
- snowflake/snowpark_connect/relation/read/map_read_csv.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_jdbc.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_json.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +7 -7
- snowflake/snowpark_connect/relation/read/map_read_socket.py +7 -3
- snowflake/snowpark_connect/relation/read/map_read_table.py +25 -16
- snowflake/snowpark_connect/relation/read/map_read_text.py +7 -7
- snowflake/snowpark_connect/relation/utils.py +11 -5
- snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +15 -12
- snowflake/snowpark_connect/relation/write/map_write.py +199 -40
- snowflake/snowpark_connect/relation/write/map_write_jdbc.py +3 -2
- snowflake/snowpark_connect/server.py +34 -4
- snowflake/snowpark_connect/type_mapping.py +2 -23
- snowflake/snowpark_connect/utils/cache.py +27 -22
- snowflake/snowpark_connect/utils/context.py +33 -17
- snowflake/snowpark_connect/utils/{attribute_handling.py → identifiers.py} +47 -0
- snowflake/snowpark_connect/utils/session.py +41 -34
- snowflake/snowpark_connect/utils/telemetry.py +1 -2
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/METADATA +5 -3
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/RECORD +67 -64
- snowpark_connect-0.21.0.dist-info/licenses/LICENSE-binary +568 -0
- snowpark_connect-0.21.0.dist-info/licenses/NOTICE-binary +1533 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.20.2.data → snowpark_connect-0.21.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.20.2.dist-info → snowpark_connect-0.21.0.dist-info}/top_level.txt +0 -0
snowflake/snowpark_connect/relation/map_sample_by.py

@@ -6,7 +6,7 @@ import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
 from snowflake import snowpark
-from snowflake.snowpark_connect.
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.expression.literal import get_literal_field_and_name
 from snowflake.snowpark_connect.expression.map_expression import (
     map_single_column_expression,
@@ -15,21 +15,28 @@ from snowflake.snowpark_connect.expression.typer import ExpressionTyper
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 
 
-def map_sample_by(
+def map_sample_by(
+    rel: relation_proto.Relation,
+) -> DataFrameContainer:
     """
     Sample by an expression on the input DataFrame.
     """
-
+    input_container = map_relation(rel.sample_by.input)
+    input_df = input_container.dataframe
+
     exp: expressions_proto.Expression = rel.sample_by.col
     _, col_expr = map_single_column_expression(
-        exp,
+        exp, input_container.column_map, ExpressionTyper(input_df)
     )
     fractions = {
         get_literal_field_and_name(frac.stratum)[0]: frac.fraction
         for frac in rel.sample_by.fractions
     }
     result: snowpark.DataFrame = input_df.sampleBy(col_expr.col, fractions)
-
-
-
-
+    return DataFrameContainer(
+        result,
+        column_map=input_container.column_map,
+        table_name=input_container.table_name,
+        alias=input_container.alias,
+        cached_schema_getter=lambda: input_df.schema,
+    )
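Note: DataFrameContainer (new in 0.21.0, dataframe_container.py, +242 lines) is not itself shown in these hunks, but the map_sample_by change above is representative of how the relation mappers now return it instead of a bare snowpark.DataFrame, so the Spark/Snowpark column-name mapping travels with the result. The sketch below only illustrates the container's apparent shape, inferred from the constructor and create_with_column_mapping calls visible in this diff; it is an assumption, not the package's actual implementation.

# Illustrative sketch only -- the real class is defined in
# snowflake/snowpark_connect/dataframe_container.py, which this diff does not show.
from dataclasses import dataclass
from typing import Any, Callable, Optional


@dataclass
class DataFrameContainerSketch:
    dataframe: Any                     # wrapped snowpark.DataFrame
    column_map: Any = None             # Spark <-> Snowpark column-name mapping
    table_name: Optional[str] = None
    alias: Optional[str] = None
    # Lazily evaluated schema, so wrapping a result does not force an extra describe.
    cached_schema_getter: Optional[Callable[[], Any]] = None

    @classmethod
    def create_with_column_mapping(cls, dataframe, spark_column_names, snowpark_column_names):
        # The real implementation builds a ColumnNameMap; a plain dict stands in here.
        return cls(dataframe, column_map=dict(zip(snowpark_column_names, spark_column_names)))


# Example: wrap a result while recording the column mapping, as the new return
# statement in map_sample_by above does with the upstream container's map.
container = DataFrameContainerSketch.create_with_column_mapping(
    dataframe=object(), spark_column_names=["id"], snowpark_column_names=['"ID"']
)
print(container.column_map)  # {'"ID"': 'id'}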
snowflake/snowpark_connect/relation/map_show_string.py

@@ -2,11 +2,17 @@
 # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
 #
 
+import copy
+
 import pandas
 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
 
 from snowflake import snowpark
 from snowflake.snowpark._internal.analyzer import analyzer_utils
+from snowflake.snowpark.functions import col
+from snowflake.snowpark.types import DateType, StringType, StructField, StructType
+from snowflake.snowpark_connect.column_name_handler import set_schema_getter
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.map_relation import map_relation
 
 
@@ -18,12 +24,15 @@ def map_show_string(rel: relation_proto.Relation) -> pandas.DataFrame:
     message creates a string. The client expects this string to be packed into an Arrow
     Buffer object as a single cell.
     """
-
+    input_df_container: DataFrameContainer = map_relation(rel.show_string.input)
+    raw_input_df = input_df_container.dataframe
+    input_df = _handle_datetype_columns(raw_input_df)
+
     show_string = input_df._show_string_spark(
         num_rows=rel.show_string.num_rows,
         truncate=rel.show_string.truncate,
         vertical=rel.show_string.vertical,
-        _spark_column_names=
+        _spark_column_names=input_df_container.column_map.get_spark_columns(),
     )
     return pandas.DataFrame({"show_string": [show_string]})
 
@@ -32,13 +41,15 @@ def map_repr_html(rel: relation_proto.Relation) -> pandas.DataFrame:
     """
     Generate the html string representation of the input dataframe.
    """
-
+    input_df_container: DataFrameContainer = map_relation(rel.html_string.input)
+    input_df = input_df_container.dataframe
+
     input_panda = input_df.toPandas()
     input_panda.rename(
         columns={
             analyzer_utils.unquote_if_quoted(
-
-            ):
+                input_df_container.column_map.get_snowpark_columns()[i]
+            ): input_df_container.column_map.get_spark_columns()[i]
             for i in range(len(input_panda.columns))
         },
         inplace=True,
@@ -48,3 +59,29 @@ def map_repr_html(rel: relation_proto.Relation) -> pandas.DataFrame:
         max_rows=rel.html_string.num_rows,
     )
     return pandas.DataFrame({"html_string": [html_string]})
+
+
+def _handle_datetype_columns(input_df: snowpark.DataFrame) -> snowpark.DataFrame:
+    """
+    Maps DateType columns to strings it aims to allow showing the dates which are out of range of datetime.datetime.
+    """
+    new_column_mapping = []
+    new_fields = []
+    transformation_required = False
+    for field in input_df.schema:
+        if isinstance(field.datatype, DateType):
+            transformation_required = True
+            new_column_mapping.append(col(field.name).cast(StringType()))
+            new_fields.append(StructField(field.name, StringType()))
+        else:
+            new_column_mapping.append(col(field.name))
+            new_fields.append(field)
+
+    if not transformation_required:
+        return input_df
+
+    transformed_df = input_df.select(new_column_mapping)
+    set_schema_getter(transformed_df, lambda: StructType(new_fields))
+    transformed_df._column_map = copy.deepcopy(input_df._column_map)
+
+    return transformed_df
snowflake/snowpark_connect/relation/map_sql.py

@@ -26,15 +26,16 @@ from snowflake.snowpark._internal.analyzer.analyzer_utils import (
     unquote_if_quoted,
 )
 from snowflake.snowpark._internal.type_utils import convert_sp_to_sf_type
-from snowflake.snowpark._internal.utils import is_sql_select_statement
+from snowflake.snowpark._internal.utils import is_sql_select_statement, quote_name
 from snowflake.snowpark.functions import when_matched, when_not_matched
 from snowflake.snowpark_connect.config import (
-
+    auto_uppercase_non_column_identifiers,
     get_boolean_session_config_param,
     global_config,
     set_config_param,
     unset_config_param,
 )
+from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.expression.map_expression import (
     ColumnNameMap,
     map_single_column_expression,
@@ -72,6 +73,7 @@ from ..expression.map_sql_expression import (
     map_logical_plan_expression,
     sql_parser,
 )
+from ..utils.identifiers import spark_to_sf_single_id
 
 _ctes = ContextVar[dict[str, relation_proto.Relation]]("_ctes", default={})
 
@@ -159,38 +161,32 @@ def parse_pos_args(
     return dict(zip(sorted(positions), pos_args))
 
 
-def execute_logical_plan(logical_plan) ->
+def execute_logical_plan(logical_plan) -> DataFrameContainer:
     proto = map_logical_plan_relation(logical_plan)
     with push_evaluating_sql_scope():
         return map_relation(proto)
 
 
-def _spark_to_snowflake_single_id(name: str) -> str:
-    name = quote_name_without_upper_casing(name)
-    return name.upper() if auto_uppercase_ddl() else name
-
-
 def _spark_to_snowflake(multipart_id: jpype.JObject) -> str:
     return ".".join(
-
+        spark_to_sf_single_id(str(part)) for part in as_java_list(multipart_id)
     )
 
 
 def _rename_columns(
-    df: snowpark.DataFrame, user_specified_columns
+    df: snowpark.DataFrame, user_specified_columns, column_map: ColumnNameMap
 ) -> snowpark.DataFrame:
     user_columns = [str(col._1()) for col in as_java_list(user_specified_columns)]
 
     if user_columns:
         columns = zip(df.columns, user_columns)
     else:
-        columns =
+        columns = column_map.snowpark_to_spark_map().items()
 
     for orig_column, user_column in columns:
         df = df.with_column_renamed(
-            orig_column,
+            orig_column, spark_to_sf_single_id(user_column, is_column=True)
         )
-
     return df
 
 
@@ -199,11 +195,12 @@ def _create_table_as_select(logical_plan, mode: str) -> None:
     name = get_relation_identifier_name(logical_plan.name())
     comment = logical_plan.tableSpec().comment()
 
-
-
+    container = execute_logical_plan(logical_plan.query())
+    df = container.dataframe
+    columns = container.column_map.snowpark_to_spark_map().items()
     for orig_column, user_column in columns:
         df = df.with_column_renamed(
-            orig_column,
+            orig_column, spark_to_sf_single_id(user_column, is_column=True)
        )
 
     # TODO escaping should be handled by snowpark. remove when SNOW-2210271 is done
@@ -218,11 +215,11 @@ def _create_table_as_select(logical_plan, mode: str) -> None:
 
 
 def _spark_field_to_sql(field: jpype.JObject, is_column: bool) -> str:
-    # Column names will be uppercased according to "snowpark.connect.auto-uppercase
+    # Column names will be uppercased according to "snowpark.connect.sql.identifiers.auto-uppercase",
     # and struct fields will be left as is. This should allow users to use the same names
     # in spark and Snowflake in most cases.
     if is_column:
-        name =
+        name = spark_to_sf_single_id(str(field.name()), is_column=True)
     else:
         name = quote_name_without_upper_casing(str(field.name()))
     data_type_str = _spark_datatype_to_sql(field.dataType())
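Note: the removed _spark_to_snowflake_single_id helper above is superseded by spark_to_sf_single_id from the renamed utils/identifiers.py module (formerly attribute_handling.py in the file list); its implementation is not part of these hunks. The sketch below only mirrors the quote-then-conditionally-uppercase behaviour of the removed helper, with the config lookup replaced by an explicit argument. The function name and signature are illustrative assumptions, not the package's code.

# Hypothetical stand-in for the helper in snowflake/snowpark_connect/utils/identifiers.py,
# which this diff does not show.
def normalize_single_identifier(name: str, auto_uppercase: bool) -> str:
    """Quote a Spark identifier for Snowflake and optionally uppercase it."""
    quoted = '"' + name.replace('"', '""') + '"'  # preserve case, escape embedded quotes
    return quoted.upper() if auto_uppercase else quoted


# With auto-uppercase enabled, a lowercase Spark name becomes an uppercase quoted
# Snowflake identifier; with it disabled, the original casing is preserved.
assert normalize_single_identifier("my_col", auto_uppercase=True) == '"MY_COL"'
assert normalize_single_identifier("MixedCase", auto_uppercase=False) == '"MixedCase"'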
@@ -257,10 +254,12 @@ def _normalize_identifiers(node):
     The identifiers need to be uppercased to match Snowflake's behaviour. Users can disable this by setting
     the `snowpark.connect.auto_uppercase_ddl` config to False.
     """
-    if isinstance(node, Identifier):
-
-
-
+    if not isinstance(node, Identifier):
+        return node
+    elif auto_uppercase_non_column_identifiers():
+        return Identifier(this=node.this.upper(), quoted=True)
+    else:
+        return Identifier(this=node.this, quoted=True)
 
 
 def _remove_file_format_property(node):
@@ -328,7 +327,10 @@ def map_sql_to_pandas_df(
             # Build Snowflake SQL from logical plan attributes
             for col in as_java_list(columns_to_add):
                 # Follow the same pattern as AlterColumn for column name extraction
-                col_name = ".".join(
+                col_name = ".".join(
+                    spark_to_sf_single_id(part, is_column=True)
+                    for part in as_java_list(col.name())
+                )
                 col_type = _spark_datatype_to_sql(col.dataType())
                 snowflake_sql = (
                     f"ALTER TABLE {table_name} ADD COLUMN {col_name} {col_type}"
@@ -341,9 +343,22 @@ def map_sql_to_pandas_df(
 
             # Extract actual column name
             column_name = ".".join(
-
+                spark_to_sf_single_id(part, is_column=True)
+                for part in as_java_list(column_obj.name())
             )
 
+            if not global_config.spark_sql_caseSensitive:
+                case_insensitive_name = next(
+                    (
+                        f.name
+                        for f in session.table(table_name).schema.fields
+                        if f.name.lower() == column_name.lower()
+                    ),
+                    None,
+                )
+                if case_insensitive_name:
+                    column_name = case_insensitive_name
+
             # Build ALTER COLUMN command from logical plan attributes
             alter_parts = []
 
@@ -381,9 +396,7 @@ def map_sql_to_pandas_df(
             if_not_exists = "IF NOT EXISTS " if logical_plan.ifNotExists() else ""
             session.sql(f"CREATE SCHEMA {if_not_exists}{name}").collect()
             if previous_name is not None:
-                session.sql(
-                    f"USE SCHEMA {_spark_to_snowflake_single_id(previous_name)}"
-                ).collect()
+                session.sql(f"USE SCHEMA {quote_name(previous_name)}").collect()
             else:
                 # TODO: Unset the schema
                 pass
@@ -443,7 +456,8 @@ def map_sql_to_pandas_df(
             else:
                 object_name: str = as_java_list(logical_plan.child().nameParts())[0]
             _accessing_temp_object.set(False)
-
+            df_container = execute_logical_plan(logical_plan.query())
+            df = df_container.dataframe
             if _accessing_temp_object.get():
                 raise AnalysisException(
                     f"[INVALID_TEMP_OBJ_REFERENCE] Cannot create the persistent object `{CURRENT_CATALOG_NAME}`.`{current_schema}`.`{object_name}` "
@@ -454,7 +468,9 @@ def map_sql_to_pandas_df(
             name = get_relation_identifier_name(logical_plan.child())
             comment = logical_plan.comment()
 
-            df = _rename_columns(
+            df = _rename_columns(
+                df, logical_plan.userSpecifiedColumns(), df_container.column_map
+            )
 
             # TODO: Support logical_plan.replace() == False
             df.create_or_replace_view(
@@ -464,7 +480,8 @@ def map_sql_to_pandas_df(
                 else None,
             )
         case "CreateViewCommand":
-
+            df_container = execute_logical_plan(logical_plan.plan())
+            df = df_container.dataframe
             tmp_views = _get_current_temp_objects()
             tmp_views.add(
                 (
@@ -475,7 +492,7 @@ def map_sql_to_pandas_df(
             )
 
             name = str(logical_plan.name().identifier())
-            name =
+            name = spark_to_sf_single_id(name)
             if isinstance(
                 logical_plan.viewType(),
                 jpype.JClass(
@@ -490,7 +507,9 @@ def map_sql_to_pandas_df(
                 else None
             )
 
-            df = _rename_columns(
+            df = _rename_columns(
+                df, logical_plan.userSpecifiedColumns(), df_container.column_map
+            )
 
             if logical_plan.replace():
                 df.create_or_replace_temp_view(
@@ -504,6 +523,7 @@ def map_sql_to_pandas_df(
             )
         case "DescribeColumn":
             name = get_relation_identifier_name(logical_plan.column())
+            # todo double check if this is correct
             rows = session.sql(f"DESCRIBE TABLE {name}").collect()
         case "DescribeNamespace":
             name = get_relation_identifier_name(logical_plan.namespace(), True)
@@ -521,11 +541,12 @@ def map_sql_to_pandas_df(
             # This gets the schema without executing the query (similar to Spark's DESCRIBE QUERY)
             # Get the inner query plan and convert it to SQL
             inner_query_plan = logical_plan.plan()
-
+            df_container = execute_logical_plan(inner_query_plan)
+            df = df_container.dataframe
             schema = df.schema
 
             # Get original Spark column names using the column map from the original DataFrame
-            spark_columns =
+            spark_columns = df_container.column_map.get_spark_columns()
             data = []
             for i, field in enumerate(schema.fields):
                 # Use original Spark column name from column map
@@ -615,9 +636,9 @@ def map_sql_to_pandas_df(
                 "UnresolvedHaving",
                 "Distinct",
             ):
-                expr = execute_logical_plan(
-
-                ][0]
+                expr = execute_logical_plan(
+                    logical_plan.logicalPlan()
+                ).dataframe.queries["queries"][0]
                 final_sql = f"EXPLAIN USING TEXT {expr}"
                 rows = session.sql(final_sql).collect()
             elif (
@@ -626,7 +647,7 @@ def map_sql_to_pandas_df(
             ):
                 expr = execute_logical_plan(
                     logical_plan.logicalPlan().query()
-                ).queries["queries"][0]
+                ).dataframe.queries["queries"][0]
                 final_sql = f"EXPLAIN USING TEXT {expr}"
                 rows = session.sql(final_sql).collect()
             else:
@@ -635,7 +656,8 @@ def map_sql_to_pandas_df(
                     f"{logical_plan_name} is not supported yet with EXPLAIN."
                 )
         case "InsertIntoStatement":
-
+            df_container = execute_logical_plan(logical_plan.query())
+            df = df_container.dataframe
             queries = df.queries["queries"]
             if len(queries) != 1:
                 raise SnowparkConnectNotImplementedError(
@@ -645,7 +667,7 @@ def map_sql_to_pandas_df(
             name = get_relation_identifier_name(logical_plan.table(), True)
 
             user_columns = [
-
+                spark_to_sf_single_id(str(col), is_column=True)
                 for col in as_java_list(logical_plan.userSpecifiedCols())
             ]
             overwrite_str = "OVERWRITE" if logical_plan.overwrite() else ""
@@ -751,32 +773,36 @@ def map_sql_to_pandas_df(
                 )
                 return assignments
 
-
+            source_df_container = map_relation(
                 map_logical_plan_relation(logical_plan.sourceTable())
             )
-
+            source_df = source_df_container.dataframe
             plan_id = gen_sql_plan_id()
-
-            target_df = map_relation(
+            target_df_container = map_relation(
                 map_logical_plan_relation(logical_plan.targetTable(), plan_id)
             )
-
+            target_df = target_df_container.dataframe
+
+            for col in target_df_container.column_map.columns:
                 target_df = target_df.with_column_renamed(
-                    col.snowpark_name,
+                    col.snowpark_name,
+                    spark_to_sf_single_id(col.spark_name, is_column=True),
                )
-
-                target_df,
+            target_df_container = DataFrameContainer.create_with_column_mapping(
+                dataframe=target_df,
+                spark_column_names=target_df.columns,
+                snowpark_column_names=target_df.columns,
            )
 
-            set_plan_id_map(plan_id,
+            set_plan_id_map(plan_id, target_df_container)
 
             joined_df_before_condition: snowpark.DataFrame = source_df.join(
                 target_df
             )
 
             column_mapping_for_conditions = column_name_handler.JoinColumnNameMap(
-
-
+                source_df_container.column_map,
+                target_df_container.column_map,
             )
             typer_for_expressions = ExpressionTyper(joined_df_before_condition)
 
@@ -803,8 +829,8 @@ def map_sql_to_pandas_df(
                 ):
                     assignments = _get_assignments_from_action(
                         matched_action,
-
-
+                        source_df_container.column_map,
+                        target_df_container.column_map,
                         ExpressionTyper(source_df),
                         ExpressionTyper(target_df),
                     )
@@ -825,8 +851,8 @@ def map_sql_to_pandas_df(
                 ):
                     assignments = _get_assignments_from_action(
                         not_matched_action,
-
-
+                        source_df_container.column_map,
+                        target_df_container.column_map,
                         ExpressionTyper(source_df),
                         ExpressionTyper(target_df),
                     )
@@ -852,20 +878,28 @@ def map_sql_to_pandas_df(
                 source_df, merge_condition_typed_col.col, clauses
             )
         case "DeleteFromTable":
-
-
+            df_container = map_relation(
+                map_logical_plan_relation(logical_plan.table())
+            )
+            df = df_container.dataframe
+            for col in df_container.column_map.columns:
                 df = df.with_column_renamed(
-                    col.snowpark_name,
+                    col.snowpark_name,
+                    spark_to_sf_single_id(col.spark_name, is_column=True),
                )
-
-
+            df_container = column_name_handler.create_with_column_mapping(
+                dataframe=df,
+                spark_column_names=df.columns,
+                snowpark_column_names=df.columns,
+            )
+            df = df_container.dataframe
             name = get_relation_identifier_name(logical_plan.table(), True)
             (
                 condition_column_name,
                 condition_typed_col,
             ) = map_single_column_expression(
                 map_logical_plan_expression(logical_plan.condition()),
-
+                df_container.column_map,
                 ExpressionTyper(df),
             )
             session.table(name).delete(condition_typed_col.col)
@@ -873,9 +907,23 @@ def map_sql_to_pandas_df(
             table_name = get_relation_identifier_name(logical_plan.table(), True)
             column_obj = logical_plan.column()
             old_column_name = ".".join(
-                str(part)
+                spark_to_sf_single_id(str(part), is_column=True)
+                for part in as_java_list(column_obj.name())
+            )
+            if not global_config.spark_sql_caseSensitive:
+                case_insensitive_name = next(
+                    (
+                        f.name
+                        for f in session.table(table_name).schema.fields
+                        if f.name.lower() == old_column_name.lower()
+                    ),
+                    None,
+                )
+                if case_insensitive_name:
+                    old_column_name = case_insensitive_name
+            new_column_name = spark_to_sf_single_id(
+                str(logical_plan.newName()), is_column=True
             )
-            new_column_name = str(logical_plan.newName())
 
             # Pass through to Snowflake
             snowflake_sql = f"ALTER TABLE {table_name} RENAME COLUMN {old_column_name} TO {new_column_name}"
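Note: the case-insensitive column resolution added in the RenameColumn hunk above repeats the next(...) idiom already used in the AlterColumn hunk earlier: when spark.sql.caseSensitive is off, the requested name is matched against session.table(table_name).schema.fields ignoring case and replaced by the stored spelling. A standalone sketch of that lookup follows; the helper function and its name are illustrative (the diff inlines the expression directly).

from typing import Iterable, Optional


def resolve_column_case_insensitive(requested: str, existing: Iterable[str]) -> Optional[str]:
    """Return the stored spelling of the first column matching ignoring case, else None."""
    return next(
        (name for name in existing if name.lower() == requested.lower()),
        None,
    )


# Example: a RENAME COLUMN request for "ID" resolves to the stored "id";
# unknown names return None and the original request is used unchanged.
assert resolve_column_case_insensitive("ID", ["id", "name"]) == "id"
assert resolve_column_case_insensitive("missing", ["id", "name"]) is None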
@@ -971,6 +1019,32 @@ def map_sql_to_pandas_df(
             else:
                 rows = session.sql("SHOW TABLES").collect()
 
+            # Return empty DataFrame with proper schema if no results
+            if not rows:
+                if class_name == "ShowTableExtended":
+                    return (
+                        pandas.DataFrame(
+                            {
+                                "namespace": [""],
+                                "tableName": [""],
+                                "isTemporary": [""],
+                                "information": [""],
+                            }
+                        ),
+                        "",
+                    )
+                else:
+                    return (
+                        pandas.DataFrame(
+                            {
+                                "namespace": [""],
+                                "tableName": [""],
+                                "isTemporary": [""],
+                            }
+                        ),
+                        "",
+                    )
+
             # Apply pattern filtering if pattern is provided
             # This is workaround to filter using Python regex.
             if pattern and rows:
@@ -1020,9 +1094,7 @@ def map_sql_to_pandas_df(
             if db_name and multi_part_len == 2:
                 # Check db_name is same as in the full table name
                 if (
-
-                    str(db_and_table_name[0])
-                    ).casefold()
+                    spark_to_sf_single_id(str(db_and_table_name[0])).casefold()
                     != db_name.casefold()
                 ):
                     raise AnalysisException(
@@ -1075,11 +1147,7 @@ def map_sql_to_pandas_df(
     if _is_sql_select_statement_helper(sql_string):
         return None, None
     session = snowpark.Session.get_active_session()
-
-    columns = sql_df.columns
-    column_name_handler.with_column_map(sql_df, columns, columns)
-    rows = sql_df.collect()
-
+    rows = session.sql(sql_string).collect()
    if rows:
         return pandas.DataFrame(rows), ""
     return pandas.DataFrame({"": [""]}), ""
@@ -1089,7 +1157,9 @@ def get_sql_passthrough() -> bool:
     return get_boolean_session_config_param("snowpark.connect.sql.passthrough")
 
 
-def map_sql(
+def map_sql(
+    rel: relation_proto.Relation,
+) -> DataFrameContainer:
     """
     Map a SQL string to a DataFrame.
 
@@ -1112,7 +1182,11 @@ def map_sql(rel: relation_proto.Relation) -> snowpark.DataFrame:
     session = snowpark.Session.get_active_session()
     sql_df = session.sql(rel.sql.query)
     columns = sql_df.columns
-    return
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=sql_df,
+        spark_column_names=columns,
+        snowpark_column_names=columns,
+    )
 
 
 def map_logical_plan_relation(
@@ -1453,8 +1527,9 @@ def map_logical_plan_relation(
 
             # Need to find ids which are not part of values and remaining cols of df
             input_rel = map_logical_plan_relation(rel.child())
-
-
+            result = map_relation(input_rel)
+            input_df: snowpark.DataFrame = result.dataframe
+            column_map = result.column_map
             typer = ExpressionTyper(input_df)
             unpivot_spark_names = []
             for v in values:
@@ -1744,8 +1819,8 @@ def map_logical_plan_relation(
             # )
 
             # This is a workaround to fix the bug in snowpark where if we select posexplode with *, it would return wrong columns
-
-            spark_columns =
+            input_container = map_relation(input_relation)
+            spark_columns = input_container.column_map.get_spark_columns()
             column_expressions = [
                 expressions_proto.Expression(
                     unresolved_attribute=expressions_proto.Expression.UnresolvedAttribute(
@@ -1796,7 +1871,9 @@ def get_relation_identifier_name(name_obj, is_multi_part: bool = False) -> str:
         expr = map_single_column_expression(
             expr_proto, m, ExpressionTyper.dummy_typer(session)
         )
-        name =
+        name = spark_to_sf_single_id(
+            session.range(1).select(expr[1].col).collect()[0][0]
+        )
     else:
         if is_multi_part:
             name = _spark_to_snowflake(name_obj.multipartIdentifier())