snowpark-connect 0.25.0__py3-none-any.whl → 0.27.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- snowflake/snowpark_connect/config.py +10 -3
- snowflake/snowpark_connect/dataframe_container.py +16 -0
- snowflake/snowpark_connect/expression/map_expression.py +15 -0
- snowflake/snowpark_connect/expression/map_udf.py +68 -27
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +18 -0
- snowflake/snowpark_connect/expression/map_unresolved_function.py +38 -28
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/relation/map_extension.py +9 -7
- snowflake/snowpark_connect/relation/map_map_partitions.py +36 -72
- snowflake/snowpark_connect/relation/map_relation.py +15 -2
- snowflake/snowpark_connect/relation/map_row_ops.py +8 -1
- snowflake/snowpark_connect/relation/map_show_string.py +2 -0
- snowflake/snowpark_connect/relation/map_sql.py +63 -2
- snowflake/snowpark_connect/relation/map_udtf.py +96 -44
- snowflake/snowpark_connect/relation/utils.py +44 -0
- snowflake/snowpark_connect/relation/write/map_write.py +135 -24
- snowflake/snowpark_connect/resources_initializer.py +18 -5
- snowflake/snowpark_connect/server.py +12 -2
- snowflake/snowpark_connect/utils/artifacts.py +4 -5
- snowflake/snowpark_connect/utils/concurrent.py +4 -0
- snowflake/snowpark_connect/utils/context.py +41 -1
- snowflake/snowpark_connect/utils/external_udxf_cache.py +36 -0
- snowflake/snowpark_connect/utils/pandas_udtf_utils.py +86 -2
- snowflake/snowpark_connect/utils/scala_udf_utils.py +250 -242
- snowflake/snowpark_connect/utils/session.py +4 -0
- snowflake/snowpark_connect/utils/udf_utils.py +71 -118
- snowflake/snowpark_connect/utils/udtf_helper.py +17 -7
- snowflake/snowpark_connect/utils/udtf_utils.py +3 -16
- snowflake/snowpark_connect/version.py +2 -3
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/METADATA +2 -2
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/RECORD +41 -37
- {snowpark_connect-0.25.0.data → snowpark_connect-0.27.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.25.0.data → snowpark_connect-0.27.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.25.0.data → snowpark_connect-0.27.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.25.0.dist-info → snowpark_connect-0.27.0.dist-info}/top_level.txt +0 -0
@@ -15,6 +15,7 @@ from snowflake.snowpark_connect.utils.cache import (
 from snowflake.snowpark_connect.utils.context import (
     get_plan_id_map,
     get_session_id,
+    push_map_partitions,
     push_operation_scope,
     set_is_aggregate_function,
     set_plan_id_map,
@@ -90,6 +91,7 @@ def map_relation(
             table_name=copy.deepcopy(cached_container.table_name),
             alias=cached_container.alias,
             cached_schema_getter=lambda: cached_df.schema,
+            partition_hint=cached_container.partition_hint,
         )
         # If we don't make a copy of the df._output, the expression IDs for attributes in Snowpark DataFrames will differ from those stored in the cache,
         # leading to errors during query execution.
@@ -179,7 +181,8 @@ def map_relation(
             )
             return cached_df
         case "map_partitions":
-            result = map_map_partitions.map_map_partitions(rel)
+            with push_map_partitions():
+                result = map_map_partitions.map_map_partitions(rel)
         case "offset":
             result = map_row_ops.map_offset(rel)
         case "project":
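
push_map_partitions is imported from utils/context.py, whose +41-line change is not expanded in this diff. As an assumption about its shape only, a contextvar-backed scope flag like the sketch below would fit the usage here (the is_in_map_partitions accessor is hypothetical):

    # Hypothetical sketch; the real helper lives in snowflake/snowpark_connect/utils/context.py.
    import contextlib
    import contextvars

    _in_map_partitions = contextvars.ContextVar("in_map_partitions", default=False)


    @contextlib.contextmanager
    def push_map_partitions():
        token = _in_map_partitions.set(True)  # mark that a mapPartitions relation is being mapped
        try:
            yield
        finally:
            _in_map_partitions.reset(token)   # restore the previous value on exit


    def is_in_map_partitions() -> bool:
        return _in_map_partitions.get()
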
@@ -189,13 +192,23 @@ def map_relation(
         case "read":
             result = read.map_read(rel)
         case "repartition":
-            #
+            # Preserve partition hint for file output control
+            # This handles both repartition(n) with shuffle=True and coalesce(n) with shuffle=False
             result = map_relation(rel.repartition.input)
+            if rel.repartition.num_partitions > 0:
+                result.partition_hint = rel.repartition.num_partitions
         case "repartition_by_expression":
             # This is a no-op operation in SAS as Snowpark doesn't have the concept of partitions.
             # All the data in the dataframe will be treated as a single partition, and this will not
             # have any side effects.
             result = map_relation(rel.repartition_by_expression.input)
+            # Only preserve partition hint if num_partitions is explicitly specified and > 0
+            # Column-based repartitioning without count should clear any existing partition hints
+            if rel.repartition_by_expression.num_partitions > 0:
+                result.partition_hint = rel.repartition_by_expression.num_partitions
+            else:
+                # Column-based repartitioning clears partition hint (resets to default behavior)
+                result.partition_hint = None
         case "replace":
             result = map_row_ops.map_replace(rel)
         case "sample":
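
To make the intent above concrete, here is a hedged client-side sketch of how the hint would be produced. It assumes a Spark Connect session pointed at a Snowpark Connect server (the URL is a placeholder) and that the hint is consumed later by the write path in write/map_write.py, which is not expanded in this excerpt:

    from pyspark.sql import SparkSession

    # Placeholder endpoint; replace with a real Snowpark Connect server address.
    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
    df = spark.range(100)

    df.coalesce(1).write.mode("overwrite").parquet("out/single")      # num_partitions=1 -> partition_hint=1
    df.repartition(4).write.mode("overwrite").parquet("out/four")     # num_partitions=4 -> partition_hint=4
    df.repartition("id").write.mode("overwrite").parquet("out/byid")  # no count -> partition_hint cleared
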
@@ -553,7 +553,14 @@ def map_filter(
         rel.filter.condition, input_container.column_map, typer
     )

-    result = input_df.filter(condition.col)
+    if rel.filter.input.WhichOneof("rel_type") == "subquery_alias":
+        # map_subquery_alias does not actually wrap the DataFrame in an alias or subquery.
+        # Apparently, there are cases (e.g., TpcdsQ53) where this is required, without it, we get
+        # SQL compilation error.
+        # To mitigate it, we are doing .select("*"), .alias() introduces additional describe queries
+        result = input_df.select("*").filter(condition.col)
+    else:
+        result = input_df.filter(condition.col)

     return DataFrameContainer(
         result,
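
A minimal Snowpark sketch of the mitigation described in the comments above, assuming `session` is an existing snowflake.snowpark.Session; .select("*") adds a projection boundary without the extra describe queries that .alias() would issue:

    from snowflake.snowpark.functions import col

    # Assumes `session` is an existing snowflake.snowpark.Session.
    df = session.create_dataframe([[1], [2], [3]], schema=["A"])

    result_plain = df.filter(col("A") > 1)                # previous behavior for all inputs
    result_wrapped = df.select("*").filter(col("A") > 1)  # new behavior when the input is a subquery_alias
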
@@ -12,6 +12,7 @@ from snowflake.snowpark._internal.analyzer import analyzer_utils
 from snowflake.snowpark.functions import col
 from snowflake.snowpark.types import DateType, StringType, StructField, StructType
 from snowflake.snowpark_connect.column_name_handler import set_schema_getter
+from snowflake.snowpark_connect.config import global_config
 from snowflake.snowpark_connect.dataframe_container import DataFrameContainer
 from snowflake.snowpark_connect.relation.map_relation import map_relation

@@ -33,6 +34,7 @@ def map_show_string(rel: relation_proto.Relation) -> pandas.DataFrame:
         truncate=rel.show_string.truncate,
         vertical=rel.show_string.vertical,
         _spark_column_names=input_df_container.column_map.get_spark_columns(),
+        _spark_session_tz=global_config.spark_sql_session_timeZone,
     )
     return pandas.DataFrame({"show_string": [show_string]})

@@ -56,6 +56,7 @@ from snowflake.snowpark_connect.utils.context import (
     _accessing_temp_object,
     gen_sql_plan_id,
     get_session_id,
+    get_sql_plan,
     push_evaluating_sql_scope,
     push_sql_scope,
     set_sql_args,
@@ -542,6 +543,7 @@ def map_sql_to_pandas_df(
             rows = session.sql(f"DESCRIBE TABLE {name}").collect()
         case "DescribeNamespace":
             name = get_relation_identifier_name(logical_plan.namespace(), True)
+            name = change_default_to_public(name)
             rows = session.sql(f"DESCRIBE SCHEMA {name}").collect()
             if not rows:
                 rows = None
@@ -793,6 +795,7 @@ def map_sql_to_pandas_df(
         case "SetCatalogAndNamespace":
             # TODO: add catalog setting here
             name = get_relation_identifier_name(logical_plan.child(), True)
+            name = change_default_to_public(name)
             session.sql(f"USE SCHEMA {name}").collect()
         case "SetCommand":
             kv_result_tuple = logical_plan.kv().get()
@@ -801,6 +804,7 @@ def map_sql_to_pandas_df(
             set_config_param(get_session_id(), key, val, session)
         case "SetNamespaceCommand":
             name = _spark_to_snowflake(logical_plan.namespace())
+            name = change_default_to_public(name)
             session.sql(f"USE SCHEMA {name}").collect()
         case "SetNamespaceLocation" | "SetNamespaceProperties":
             raise SnowparkConnectNotImplementedError(
@@ -997,6 +1001,20 @@ def get_sql_passthrough() -> bool:
     return get_boolean_session_config_param("snowpark.connect.sql.passthrough")


+def change_default_to_public(name: str) -> str:
+    """
+    Change the namespace to PUBLIC when given name is DEFAULT
+    :param name: Given namespace
+    :return: if name is DEFAULT return PUBLIC otherwise name
+    """
+    if name.startswith('"'):
+        if name.upper() == '"DEFAULT"':
+            return name.replace("DEFAULT", "PUBLIC")
+    elif name.upper() == "DEFAULT":
+        return "PUBLIC"
+    return name
+
+
 def map_sql(
     rel: relation_proto.Relation,
 ) -> DataFrameContainer:
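
Based directly on the function body above, the helper behaves as follows (illustrative calls, not part of the diff):

    change_default_to_public("DEFAULT")      # -> "PUBLIC"
    change_default_to_public("default")      # -> "PUBLIC"   (unquoted comparison is case-insensitive)
    change_default_to_public('"DEFAULT"')    # -> '"PUBLIC"'  (quoted identifier keeps its quotes)
    change_default_to_public("ANALYTICS")    # -> "ANALYTICS" (anything else is returned unchanged)
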
@@ -1008,7 +1026,6 @@ def map_sql(
     In passthough mode as True, SAS calls session.sql() and not calling Spark Parser.
     This is to mitigate any issue not covered by spark logical plan to protobuf conversion.
     """
-
     snowpark_connect_sql_passthrough = get_sql_passthrough()

     if not snowpark_connect_sql_passthrough:
@@ -1353,6 +1370,7 @@ def map_logical_plan_relation(
                 left_input=map_logical_plan_relation(children[0]),
                 right_input=map_logical_plan_relation(children[1]),
                 set_op_type=relation_proto.SetOperation.SET_OP_TYPE_UNION,
+                is_all=True,
                 by_name=rel.byName(),
                 allow_missing_columns=rel.allowMissingCol(),
             )
@@ -1701,7 +1719,50 @@ def map_logical_plan_relation(
             _window_specs.get()[key] = window_spec
             proto = map_logical_plan_relation(rel.child())
         case "Generate":
-            input_relation = map_logical_plan_relation(rel.child())
+            # Generate creates a nested Project relation (see lines 1785-1790) without
+            # setting its plan_id field. When this Project is later processed by map_project
+            # (map_column_ops.py), it uses rel.common.plan_id which defaults to 0 for unset
+            # protobuf fields. This means all columns from the Generate operation (both exploded
+            # columns and passthrough columns) will have plan_id=0 in their names.
+            #
+            # If Generate's child is a SubqueryAlias whose inner relation was processed
+            # with a non-zero plan_id, there will be a mismatch between:
+            # - The columns referenced in the Project (expecting plan_id from SubqueryAlias's child)
+            # - The actual column names created by Generate's Project (using plan_id=0)
+
+            # Therefore, when Generate has a SubqueryAlias child, we explicitly process the inner
+            # relation with plan_id=0 to match what Generate's Project will use. This only applies when
+            # the immediate child of Generate is a SubqueryAlias and preserves existing registrations (like CTEs),
+            # so it won't affect other patterns.
+
+            child_class = str(rel.child().getClass().getSimpleName())
+
+            if child_class == "SubqueryAlias":
+                alias = str(rel.child().alias())
+
+                # Check if this alias was already registered during initial SQL parsing
+                existing_plan_id = get_sql_plan(alias)
+
+                if existing_plan_id is not None:
+                    # Use the existing plan_id to maintain consistency with prior registration
+                    used_plan_id = existing_plan_id
+                else:
+                    # Use plan_id=0 to match what the nested Project will use (protobuf default)
+                    used_plan_id = 0
+                    set_sql_plan_name(alias, used_plan_id)
+
+                # Process the inner child with the determined plan_id
+                inner_child = map_logical_plan_relation(
+                    rel.child().child(), plan_id=used_plan_id
+                )
+                input_relation = relation_proto.Relation(
+                    subquery_alias=relation_proto.SubqueryAlias(
+                        input=inner_child,
+                        alias=alias,
+                    )
+                )
+            else:
+                input_relation = map_logical_plan_relation(rel.child())
             generator_output_list = as_java_list(rel.generatorOutput())
             generator_output_list_expressions = [
                 map_logical_plan_expression(e) for e in generator_output_list
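
The comment block above hinges on a proto3 detail: an unset scalar field reads back as its zero value, so a Project relation whose plan_id was never assigned reports plan_id == 0. A small illustration with the same protobuf module this file already imports:

    import pyspark.sql.connect.proto.relations_pb2 as relation_proto

    rel = relation_proto.Relation()  # plan_id is never assigned anywhere
    print(rel.common.plan_id)        # 0 -- proto3 default for an unset int64 field
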
@@ -31,6 +31,10 @@ from snowflake.snowpark_connect.type_mapping import (
     proto_to_snowpark_type,
 )
 from snowflake.snowpark_connect.utils.context import push_udtf_context
+from snowflake.snowpark_connect.utils.external_udxf_cache import (
+    cache_external_udtf,
+    get_external_udtf_from_cache,
+)
 from snowflake.snowpark_connect.utils.session import get_or_create_snowpark_session
 from snowflake.snowpark_connect.utils.udtf_helper import (
     SnowparkUDTF,
@@ -44,6 +48,34 @@ from snowflake.snowpark_connect.utils.udxf_import_utils import (
 )


+def cache_external_udtf_wrapper(from_register_udtf: bool):
+    def outer_wrapper(wrapper_func):
+        def wrapper(
+            udtf_proto: relation_proto.CommonInlineUserDefinedTableFunction,
+            spark_column_names,
+        ) -> SnowparkUDTF | None:
+            udf_hash = hash(str(udtf_proto))
+            cached_udtf = get_external_udtf_from_cache(udf_hash)
+
+            if cached_udtf:
+                if from_register_udtf:
+                    session = get_or_create_snowpark_session()
+                    session._udtfs[udtf_proto.function_name.lower()] = (
+                        cached_udtf,
+                        spark_column_names,
+                    )
+
+                return cached_udtf
+
+            snowpark_udf = wrapper_func(udtf_proto, spark_column_names)
+            cache_external_udtf(udf_hash, snowpark_udf)
+            return snowpark_udf
+
+        return wrapper
+
+    return outer_wrapper
+
+
 def build_expected_types_from_parsed(
     parsed_return: types_proto.DataType,
 ) -> List[Tuple[str, Any]]:
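
get_external_udtf_from_cache and cache_external_udtf come from the new utils/external_udxf_cache.py module (+36 lines, not expanded in this diff). A minimal sketch of a compatible cache, offered only as an assumption about its shape:

    # Hypothetical sketch of utils/external_udxf_cache.py; the real module is not shown in this diff.
    from typing import Any, Dict, Optional

    _EXTERNAL_UDTF_CACHE: Dict[int, Any] = {}


    def cache_external_udtf(udf_hash: int, udtf: Any) -> None:
        """Remember a created UDTF keyed by the hash of its serialized proto."""
        _EXTERNAL_UDTF_CACHE[udf_hash] = udtf


    def get_external_udtf_from_cache(udf_hash: int) -> Optional[Any]:
        """Return the cached UDTF for this proto hash, or None if none was registered yet."""
        return _EXTERNAL_UDTF_CACHE.get(udf_hash)
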
@@ -165,26 +197,37 @@ def register_udtf(
     ) = process_return_type(python_udft.return_type)
     function_name = udtf_proto.function_name

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @cache_external_udtf_wrapper(from_register_udtf=True)
+    def _register_udtf(
+        udtf_proto: relation_proto.CommonInlineUserDefinedTableFunction,
+        spark_column_names,
+    ):
+        kwargs = {
+            "session": session,
+            "udtf_proto": udtf_proto,
+            "expected_types": expected_types,
+            "output_schema": output_schema,
+            "packages": global_config.get("snowpark.connect.udf.packages", ""),
+            "imports": get_python_udxf_import_files(session),
+            "called_from": "register_udtf",
+            "is_arrow_enabled": is_arrow_enabled_in_udtf(),
+            "is_spark_compatible_udtf_mode_enabled": is_spark_compatible_udtf_mode_enabled(),
+        }
+
+        if require_creating_udtf_in_sproc(udtf_proto):
+            snowpark_udtf = create_udtf_in_sproc(**kwargs)
+        else:
+            udtf = create_udtf(**kwargs)
+            snowpark_udtf = SnowparkUDTF(
+                name=udtf.name,
+                input_types=udtf._input_types,
+                output_schema=output_schema,
+            )
+
+        return snowpark_udtf

+    snowpark_udtf = _register_udtf(udtf_proto, spark_column_names)
+    # We have to update cached _udtfs here, because function could have been cached in map_common_inline_user_defined_table_function
     session._udtfs[function_name.lower()] = (snowpark_udtf, spark_column_names)
     return snowpark_udtf

@@ -213,32 +256,41 @@ def map_common_inline_user_defined_table_function(
         spark_column_names,
     ) = process_return_type(python_udft.return_type)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @cache_external_udtf_wrapper(from_register_udtf=False)
+    def _get_udtf(
+        udtf_proto: relation_proto.CommonInlineUserDefinedTableFunction,
+        spark_column_names,
+    ):
+        kwargs = {
+            "session": session,
+            "udtf_proto": udtf_proto,
+            "expected_types": expected_types,
+            "output_schema": output_schema,
+            "packages": global_config.get("snowpark.connect.udf.packages", ""),
+            "imports": get_python_udxf_import_files(session),
+            "called_from": "map_common_inline_user_defined_table_function",
+            "is_arrow_enabled": is_arrow_enabled_in_udtf(),
+            "is_spark_compatible_udtf_mode_enabled": is_spark_compatible_udtf_mode_enabled(),
+        }
+
+        if require_creating_udtf_in_sproc(udtf_proto):
+            snowpark_udtf_or_error = create_udtf_in_sproc(**kwargs)
+            if isinstance(snowpark_udtf_or_error, str):
+                raise PythonException(snowpark_udtf_or_error)
+            snowpark_udtf = snowpark_udtf_or_error
+        else:
+            udtf_or_error = create_udtf(**kwargs)
+            if isinstance(udtf_or_error, str):
+                raise PythonException(udtf_or_error)
+            udtf = udtf_or_error
+            snowpark_udtf = SnowparkUDTF(
+                name=udtf.name,
+                input_types=udtf._input_types,
+                output_schema=output_schema,
+            )
+        return snowpark_udtf

+    snowpark_udtf = _get_udtf(rel, spark_column_names)
     column_map = ColumnNameMap([], [])
     snowpark_udtf_args = []

@@ -6,6 +6,7 @@ import random
 import re
 import string
 import time
+import uuid
 from typing import Sequence

 import pyspark.sql.connect.proto.relations_pb2 as relation_proto
@@ -153,6 +154,49 @@ def random_string(
     return "".join([prefix, random_part, suffix])


+def generate_spark_compatible_filename(
+    task_id: int = 0,
+    attempt_number: int = 0,
+    compression: str = None,
+    format_ext: str = "parquet",
+) -> str:
+    """Generate a Spark-compatible filename following the convention:
+    part-<task-id>-<uuid>-c<attempt-number>.<compression>.<format>
+
+    Args:
+        task_id: Task ID (usually 0 for single partition)
+        attempt_number: Attempt number (usually 0)
+        compression: Compression type (e.g., 'snappy', 'gzip', 'none')
+        format_ext: File format extension (e.g., 'parquet', 'csv', 'json')
+
+    Returns:
+        A filename string following Spark's naming convention
+    """
+    # Generate a UUID for uniqueness
+    file_uuid = str(uuid.uuid4())
+
+    # Format task ID with leading zeros (5 digits)
+    formatted_task_id = f"{task_id:05d}"
+
+    # Format attempt number with leading zeros (3 digits)
+    formatted_attempt = f"{attempt_number:03d}"
+
+    # Build the base filename
+    base_name = f"part-{formatted_task_id}-{file_uuid}-c{formatted_attempt}"
+
+    # Add compression if specified and not 'none'
+    if compression and compression.lower() not in ("none", "uncompressed"):
+        compression_part = f".{compression.lower()}"
+    else:
+        compression_part = ""
+
+    # Add format extension if specified
+    if format_ext:
+        return f"{base_name}{compression_part}.{format_ext}"
+    else:
+        return f"{base_name}{compression_part}"
+
+
 def _normalize_query_for_semantic_hash(query_str: str) -> str:
     """
     Normalize a query string for semantic comparison by extracting original names from
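
A quick usage sketch of the new helper (the UUID segment differs on every call):

    generate_spark_compatible_filename(compression="snappy", format_ext="parquet")
    # e.g. 'part-00000-1b9e6e0e-5a4d-4c33-9a6a-1f2d3c4b5a69-c000.snappy.parquet'

    generate_spark_compatible_filename(task_id=3, compression="none", format_ext="csv")
    # e.g. 'part-00003-<uuid>-c000.csv'  (compression "none" is dropped from the name)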
|