snowpark-connect 0.32.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of snowpark-connect might be problematic.
- snowflake/snowpark_connect/column_name_handler.py +91 -40
- snowflake/snowpark_connect/column_qualifier.py +0 -4
- snowflake/snowpark_connect/config.py +9 -0
- snowflake/snowpark_connect/expression/hybrid_column_map.py +5 -4
- snowflake/snowpark_connect/expression/literal.py +12 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +18 -4
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +150 -29
- snowflake/snowpark_connect/expression/map_unresolved_function.py +93 -55
- snowflake/snowpark_connect/relation/map_aggregate.py +156 -257
- snowflake/snowpark_connect/relation/map_column_ops.py +19 -0
- snowflake/snowpark_connect/relation/map_join.py +454 -252
- snowflake/snowpark_connect/relation/map_row_ops.py +136 -54
- snowflake/snowpark_connect/relation/map_sql.py +335 -90
- snowflake/snowpark_connect/relation/read/map_read.py +9 -1
- snowflake/snowpark_connect/relation/read/map_read_csv.py +19 -2
- snowflake/snowpark_connect/relation/read/map_read_json.py +90 -2
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +3 -0
- snowflake/snowpark_connect/relation/read/map_read_text.py +4 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
- snowflake/snowpark_connect/relation/read/utils.py +41 -0
- snowflake/snowpark_connect/relation/utils.py +50 -2
- snowflake/snowpark_connect/relation/write/map_write.py +251 -292
- snowflake/snowpark_connect/resources_initializer.py +25 -13
- snowflake/snowpark_connect/server.py +9 -24
- snowflake/snowpark_connect/type_mapping.py +2 -0
- snowflake/snowpark_connect/typed_column.py +2 -2
- snowflake/snowpark_connect/utils/context.py +0 -14
- snowflake/snowpark_connect/utils/expression_transformer.py +163 -0
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +4 -1
- snowflake/snowpark_connect/utils/udf_helper.py +1 -0
- snowflake/snowpark_connect/utils/udtf_helper.py +3 -0
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/METADATA +4 -2
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/RECORD +43 -104
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/top_level.txt +0 -0
@@ -35,6 +35,7 @@ from snowflake.snowpark_connect.relation.read.metadata_utils import (
     add_filename_metadata_to_reader,
 )
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     get_spark_column_names_from_snowpark_columns,
     rename_columns_as_snowflake_standard,
 )
@@ -80,6 +81,8 @@ def map_read_json(
     dropFieldIfAllNull = snowpark_options.pop("dropfieldifallnull", False)
     batch_size = snowpark_options.pop("batchsize", 1000)

+    apply_metadata_exclusion_pattern(snowpark_options)
+
     reader = add_filename_metadata_to_reader(
         session.read.options(snowpark_options), raw_options
     )
@@ -117,6 +120,10 @@ def map_read_json(
         if unquote_if_quoted(sf.name) in columns_with_valid_contents
     ]

+    new_schema, fields_changed = validate_and_update_schema(schema)
+    if fields_changed:
+        schema = new_schema
+
     df = construct_dataframe_by_schema(
         schema, df.to_local_iterator(), session, snowpark_options, batch_size
     )
@@ -134,6 +141,84 @@ def map_read_json(
     )


+def should_drop_field(field: StructField) -> bool:
+    if isinstance(field.datatype, StructType):
+        # "a" : {} => drop the field
+        if len(field.datatype.fields) == 0:
+            return True
+    elif (
+        isinstance(field.datatype, ArrayType)
+        and field.datatype.element_type is not None
+        and isinstance(field.datatype.element_type, StructType)
+    ):
+        if len(field.datatype.element_type.fields) == 0:
+            # "a" : [{}] => drop the field
+            return True
+    return False
+
+
+# Validate the schema to ensure it is valid for Snowflake
+# Handles these cases:
+# 1. Drops StructField([])
+# 2. Drops ArrayType(StructType([]))
+# 3. ArrayType() -> ArrayType(StringType())
+def validate_and_update_schema(schema: StructType | None) -> (StructType | None, bool):
+    if not isinstance(schema, StructType):
+        return schema, False
+    new_fields = []
+    fields_changed = False
+    for sf in schema.fields:
+        if should_drop_field(sf):
+            fields_changed = True
+            continue
+        if isinstance(sf.datatype, StructType):
+            # If the schema is a struct, validate the child schema
+            if len(sf.datatype.fields) == 0:
+                # No fields in the struct, drop the field
+                fields_changed = True
+                continue
+            child_field = StructField(sf.name, sf.datatype, sf.nullable)
+            # Recursively validate the child schema
+            child_field.datatype, child_field_changes = validate_and_update_schema(
+                sf.datatype
+            )
+            if should_drop_field(child_field):
+                fields_changed = True
+                continue
+            new_fields.append(child_field)
+            fields_changed = fields_changed or child_field_changes
+        elif isinstance(sf.datatype, ArrayType):
+            # If the schema is an array, validate the element schema
+            if sf.datatype.element_type is not None and isinstance(
+                sf.datatype.element_type, StructType
+            ):
+                # If the element schema is a struct, validate the element schema
+                if len(sf.datatype.element_type.fields) == 0:
+                    # No fields in the struct, drop the field
+                    fields_changed = True
+                    continue
+                else:
+                    # Recursively validate the element schema
+                    element_schema, element_field_changes = validate_and_update_schema(
+                        sf.datatype.element_type
+                    )
+                    if element_field_changes:
+                        sf.datatype.element_type = element_schema
+                        fields_changed = True
+                    if should_drop_field(sf):
+                        fields_changed = True
+                        continue
+            elif sf.datatype.element_type is None:
+                fields_changed = True
+                sf.datatype.element_type = StringType()
+            new_fields.append(sf)
+        else:
+            new_fields.append(sf)
+    if fields_changed:
+        schema.fields = new_fields
+    return schema, fields_changed
+
+
 def merge_json_schema(
     content: typing.Any,
     schema: StructType | None,
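
For reference, a minimal sketch of the schema cleanup above. It assumes validate_and_update_schema can be imported from snowflake.snowpark_connect.relation.read.map_read_json (the module changed in this hunk); the field names are made up for illustration.

    from snowflake.snowpark.types import ArrayType, StringType, StructField, StructType
    from snowflake.snowpark_connect.relation.read.map_read_json import (
        validate_and_update_schema,  # assumed import path, see the hunk above
    )

    schema = StructType(
        [
            StructField("id", StringType()),                      # kept as-is
            StructField("empty_obj", StructType([])),             # "a": {}   -> dropped
            StructField("empty_arr", ArrayType(StructType([]))),  # "a": [{}] -> dropped
            StructField("untyped_arr", ArrayType()),              # ArrayType() -> ArrayType(StringType())
        ]
    )

    cleaned, changed = validate_and_update_schema(schema)
    print(changed)              # True
    print(len(cleaned.fields))  # 2 -- only "id" and "untyped_arr" survive, the latter now ArrayType(StringType())
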
@@ -378,8 +463,11 @@ def construct_row_by_schema(
         inner_schema = schema.element_type
         if isinstance(content, str):
             content = json.loads(content)
-
-
+        if inner_schema is not None:
+            for ele in content:
+                result.append(
+                    construct_row_by_schema(ele, inner_schema, snowpark_options)
+                )
         return result
     elif isinstance(schema, DateType):
         return cast_to_match_snowpark_type(
@@ -29,6 +29,7 @@ from snowflake.snowpark_connect.relation.read.metadata_utils import (
 )
 from snowflake.snowpark_connect.relation.read.reader_config import ReaderWriterConfig
 from snowflake.snowpark_connect.relation.read.utils import (
+    apply_metadata_exclusion_pattern,
     rename_columns_as_snowflake_standard,
 )
 from snowflake.snowpark_connect.utils.telemetry import (
@@ -57,6 +58,8 @@ def map_read_parquet(
     assert schema is None, "Read PARQUET does not support user schema"
     assert len(paths) > 0, "Read PARQUET expects at least one path"

+    apply_metadata_exclusion_pattern(snowpark_options)
+
     reader = add_filename_metadata_to_reader(
         session.read.options(snowpark_options), raw_options
     )
@@ -26,6 +26,10 @@ def get_file_paths_from_stage(
 ) -> typing.List[str]:
     files_paths = []
     for listed_path_row in session.sql(f"LIST {path}").collect():
+        # Skip _SUCCESS marker files
+        if listed_path_row[0].endswith("_SUCCESS"):
+            continue
+
         listed_path = listed_path_row[0].split("/")
         if listed_path_row[0].startswith("s3://") or listed_path_row[0].startswith(
             "s3a://"
@@ -126,6 +126,7 @@ CSV_READ_SUPPORTED_OPTIONS = lowercase_set(
         "compression",
         # "escapeQuotes",
         # "quoteAll",
+        "rowsToInferSchema",  # Snowflake specific option, number of rows to infer schema
     }
 )

@@ -201,6 +202,15 @@ def csv_convert_to_snowpark_args(snowpark_config: dict[str, Any]) -> dict[str, A
     if snowpark_config["escape"] and snowpark_config["escape"] == "\\":
         snowpark_config["escape"] = "\\\\"

+    # Snowflake specific option, number of rows to infer schema for CSV files
+    if "rowstoinferschema" in snowpark_config:
+        rows_to_infer_schema = snowpark_config["rowstoinferschema"]
+        del snowpark_config["rowstoinferschema"]
+        snowpark_config["INFER_SCHEMA_OPTIONS"] = {
+            "MAX_RECORDS_PER_FILE": int(rows_to_infer_schema),
+            "USE_RELAXED_TYPES": True,
+        }
+
     # Rename the keys to match the Snowpark configuration.
     for spark_arg, snowpark_arg in renamed_args.items():
         if spark_arg not in snowpark_config:
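
A hypothetical usage sketch for the new rowsToInferSchema CSV option; the Spark Connect URL and stage path below are placeholders, and the option key is handled case-insensitively on the Snowpark Connect side (it is compared as rowstoinferschema above).

    from pyspark.sql import SparkSession

    # Placeholder remote URL; in practice this points at a running Snowpark Connect endpoint.
    spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

    df = (
        spark.read
        .option("header", True)
        # Translated to INFER_SCHEMA_OPTIONS = {MAX_RECORDS_PER_FILE: 1000, USE_RELAXED_TYPES: True}
        .option("rowsToInferSchema", 1000)
        .csv("@my_stage/csv_data/")  # placeholder stage path
    )
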
@@ -40,6 +40,47 @@ DATA_SOURCE_SQL_COMMENT = (
 INDEXED_COLUMN_NAME_PATTERN = re.compile(r"(^\"c)(\d+)(\"$)")


+def apply_metadata_exclusion_pattern(options: dict) -> None:
+    """
+    Exclude metadata and hidden files from reads, matching Spark's behavior.
+
+    Automatically filters out internal metadata files that should never be read as data:
+    - _SUCCESS, _metadata, _common_metadata (Spark/Parquet metadata)
+    - .crc (Hadoop checksum files)
+    - .DS_Store (macOS system files)
+    - Any file starting with _ or .
+
+    Pattern used: ".*/[^_.][^/]*$|^[^_.][^/]*$"
+    - Matches files whose filename does NOT start with _ or .
+    - Works at any directory depth (flat or partitioned data)
+    - Allows files with or without extensions
+
+    Examples of excluded files:
+    ❌ _SUCCESS, _metadata, _common_metadata (Spark/Parquet metadata)
+    ❌ .crc, .DS_Store, .hidden (system/hidden files)
+    ❌ year=2024/_SUCCESS (metadata in partitioned directories)
+
+    Examples of allowed files:
+    ✅ part-00000.parquet, data.csv, output.json (data files)
+    ✅ success, myfile (files without extensions that don't start with _ or .)
+    ✅ year=2024/month=01/part-00000.parquet (partitioned data)
+
+    User pattern handling:
+    - No pattern, "*", or ".*" → apply the metadata exclusion pattern
+    - Custom patterns → keep the user-provided pattern
+
+    Leak cases (the user explicitly requests metadata files; these are intentional):
+    ⚠️ "_*" → matches _SUCCESS, _metadata (explicit underscore prefix)
+    ⚠️ "*SUCCESS*" → matches _SUCCESS (broad wildcard side effect)
+    ⚠️ "[_.].*" → matches _SUCCESS, .crc (character class includes _)
+
+    Args:
+        options: Dictionary of Snowpark read options (modified in place)
+    """
+    if "PATTERN" not in options or options["PATTERN"] in ("*", ".*"):
+        options["PATTERN"] = ".*/[^_.][^/]*$|^[^_.][^/]*$"
+
+
 def subtract_one(match: re.Match[str]) -> str:
     """Spark column names are 0 indexed, Snowpark is 1 indexed."""
     return f"_c{str(int(match.group(2)) - 1)}"
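
A standalone sketch of how the default exclusion regex behaves, using Python's re locally as a stand-in for Snowflake's server-side PATTERN matching on staged file paths:

    import re

    # Default pattern set by apply_metadata_exclusion_pattern when the user supplies no PATTERN.
    METADATA_EXCLUSION = re.compile(r".*/[^_.][^/]*$|^[^_.][^/]*$")

    for path in [
        "part-00000.parquet",                     # kept
        "year=2024/month=01/part-00000.parquet",  # kept (partitioned data)
        "success",                                # kept (no extension, no leading _ or .)
        "_SUCCESS",                               # excluded (leading underscore)
        "year=2024/_SUCCESS",                     # excluded
        ".DS_Store",                              # excluded (leading dot)
    ]:
        print(path, "kept" if METADATA_EXCLUSION.fullmatch(path) else "excluded")
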
@@ -174,6 +174,7 @@ def generate_spark_compatible_filename(
     attempt_number: int = 0,
     compression: str = None,
     format_ext: str = "parquet",
+    shared_uuid: str = None,
 ) -> str:
     """Generate a Spark-compatible filename following the convention:
     part-<task-id>-<uuid>-c<attempt-number>.<compression>.<format>
@@ -183,12 +184,13 @@ def generate_spark_compatible_filename(
         attempt_number: Attempt number (usually 0)
         compression: Compression type (e.g., 'snappy', 'gzip', 'none')
         format_ext: File format extension (e.g., 'parquet', 'csv', 'json')
+        shared_uuid: Shared UUID for the file

     Returns:
         A filename string following Spark's naming convention
     """
-    #
-    file_uuid = str(uuid.uuid4())
+    # Use the shared UUID if provided, otherwise generate a new one for uniqueness
+    file_uuid = shared_uuid or str(uuid.uuid4())

     # Format task ID with leading zeros (5 digits)
     formatted_task_id = f"{task_id:05d}"
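
An illustrative sketch of the filename convention documented above and of why a shared UUID is useful: every part file of one write can carry the same UUID instead of an unrelated one per file. The helper below is rebuilt from the documented convention rather than taken from the internal implementation, and the three-digit attempt formatting is an assumption.

    import uuid

    def spark_style_name(task_id: int, file_uuid: str, attempt: int = 0,
                         compression: str = "snappy", fmt: str = "parquet") -> str:
        # part-<task-id>-<uuid>-c<attempt-number>.<compression>.<format>
        return f"part-{task_id:05d}-{file_uuid}-c{attempt:03d}.{compression}.{fmt}"

    shared = str(uuid.uuid4())  # one UUID shared across all parts of a single write
    for task_id in range(3):
        print(spark_style_name(task_id, shared))
    # part-00000-<shared-uuid>-c000.snappy.parquet
    # part-00001-<shared-uuid>-c000.snappy.parquet
    # part-00002-<shared-uuid>-c000.snappy.parquet
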
@@ -284,3 +286,49 @@ def snowpark_functions_col(name: str, column_map: ColumnNameMap) -> snowpark.Col
     """
     is_qualified_name = name not in column_map.get_snowpark_columns()
     return snowpark_fn.col(name, _is_qualified_name=is_qualified_name)
+
+
+def is_aggregate_function(func_name: str) -> bool:
+    """
+    Check if a function name is an aggregate function.
+
+    Uses a hybrid approach:
+    1. First checks PySpark's docstring convention (docstrings starting with "Aggregate function:")
+    2. Falls back to a hardcoded list for functions with missing/incorrect docstrings
+
+    This ensures comprehensive coverage while automatically supporting new PySpark aggregate functions.
+
+    Args:
+        func_name: The function name to check (case-insensitive)
+
+    Returns:
+        True if the function is an aggregate function, False otherwise
+    """
+    try:
+        import pyspark.sql.functions as pyspark_functions
+
+        # TODO:
+        """
+        Check whether we can leverage Scala classes to determine aggregate functions:
+        https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/interfaces.scala#L207
+        """
+
+        # Try the PySpark docstring approach first (covers most aggregate functions)
+        pyspark_func = getattr(pyspark_functions, func_name.lower(), None)
+        if pyspark_func and pyspark_func.__doc__:
+            if pyspark_func.__doc__.lstrip().startswith("Aggregate function:"):
+                return True
+
+        # Fallback list for aggregate functions with missing/incorrect docstrings
+        # These are known aggregate functions that don't have proper docstring markers
+        fallback_aggregates = {
+            "percentile_cont",
+            "percentile_disc",
+            "any_value",
+            "grouping",
+            "grouping_id",
+        }
+        return func_name.lower() in fallback_aggregates
+
+    except Exception:
+        return False
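
A standalone sketch of the docstring heuristic used by is_aggregate_function, reimplemented locally rather than imported from the package; the expected outputs assume recent PySpark versions, where aggregate functions document themselves with an "Aggregate function:" prefix.

    import pyspark.sql.functions as F

    def looks_like_aggregate(func_name: str) -> bool:
        # Mirrors the docstring check above; missing functions or docstrings yield False.
        fn = getattr(F, func_name.lower(), None)
        doc = getattr(fn, "__doc__", None) or ""
        return doc.lstrip().startswith("Aggregate function:")

    print(looks_like_aggregate("sum"))    # True: "Aggregate function: returns the sum ..."
    print(looks_like_aggregate("upper"))  # False: a string function
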