snowpark-connect 0.32.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of snowpark-connect has been flagged as potentially problematic.
- snowflake/snowpark_connect/column_name_handler.py +91 -40
- snowflake/snowpark_connect/column_qualifier.py +0 -4
- snowflake/snowpark_connect/config.py +9 -0
- snowflake/snowpark_connect/expression/hybrid_column_map.py +5 -4
- snowflake/snowpark_connect/expression/literal.py +12 -12
- snowflake/snowpark_connect/expression/map_sql_expression.py +18 -4
- snowflake/snowpark_connect/expression/map_unresolved_attribute.py +150 -29
- snowflake/snowpark_connect/expression/map_unresolved_function.py +93 -55
- snowflake/snowpark_connect/relation/map_aggregate.py +156 -257
- snowflake/snowpark_connect/relation/map_column_ops.py +19 -0
- snowflake/snowpark_connect/relation/map_join.py +454 -252
- snowflake/snowpark_connect/relation/map_row_ops.py +136 -54
- snowflake/snowpark_connect/relation/map_sql.py +335 -90
- snowflake/snowpark_connect/relation/read/map_read.py +9 -1
- snowflake/snowpark_connect/relation/read/map_read_csv.py +19 -2
- snowflake/snowpark_connect/relation/read/map_read_json.py +90 -2
- snowflake/snowpark_connect/relation/read/map_read_parquet.py +3 -0
- snowflake/snowpark_connect/relation/read/map_read_text.py +4 -0
- snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
- snowflake/snowpark_connect/relation/read/utils.py +41 -0
- snowflake/snowpark_connect/relation/utils.py +50 -2
- snowflake/snowpark_connect/relation/write/map_write.py +251 -292
- snowflake/snowpark_connect/resources_initializer.py +25 -13
- snowflake/snowpark_connect/server.py +9 -24
- snowflake/snowpark_connect/type_mapping.py +2 -0
- snowflake/snowpark_connect/typed_column.py +2 -2
- snowflake/snowpark_connect/utils/context.py +0 -14
- snowflake/snowpark_connect/utils/expression_transformer.py +163 -0
- snowflake/snowpark_connect/utils/sequence.py +21 -0
- snowflake/snowpark_connect/utils/session.py +4 -1
- snowflake/snowpark_connect/utils/udf_helper.py +1 -0
- snowflake/snowpark_connect/utils/udtf_helper.py +3 -0
- snowflake/snowpark_connect/version.py +1 -1
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/METADATA +4 -2
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/RECORD +43 -104
- snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
- snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
- snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
- snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-library-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
- snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
- snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
- snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-connect +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-session +0 -0
- {snowpark_connect-0.32.0.data → snowpark_connect-1.0.0.data}/scripts/snowpark-submit +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/WHEEL +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE-binary +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/LICENSE.txt +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/licenses/NOTICE-binary +0 -0
- {snowpark_connect-0.32.0.dist-info → snowpark_connect-1.0.0.dist-info}/top_level.txt +0 -0
--- a/snowflake/snowpark_connect/relation/map_row_ops.py
+++ b/snowflake/snowpark_connect/relation/map_row_ops.py
@@ -45,6 +45,61 @@ from snowflake.snowpark_connect.utils.telemetry import (
 )
 
 
+def cast_columns(
+    df_container: DataFrameContainer,
+    df_dtypes: list[snowpark.types.DataType],
+    target_dtypes: list[snowpark.types.DataType],
+    column_map: ColumnNameMap,
+):
+    df: snowpark.DataFrame = df_container.dataframe
+    if df_dtypes == target_dtypes:
+        return df_container
+    # Use cached schema if available to avoid triggering extra queries
+    if (
+        hasattr(df_container, "cached_schema_getter")
+        and df_container.cached_schema_getter is not None
+    ):
+        df_schema = df_container.cached_schema_getter()
+    else:
+        df_schema = df.schema  # Get current schema
+    new_columns = []
+
+    for i, field in enumerate(df_schema.fields):
+        col_name = field.name
+        current_type = field.datatype
+        target_type = target_dtypes[i]
+
+        if current_type != target_type:
+            new_columns.append(df[col_name].cast(target_type).alias(col_name))
+        else:
+            new_columns.append(df[col_name])
+
+    new_df = df.select(new_columns)
+    return DataFrameContainer.create_with_column_mapping(
+        dataframe=new_df,
+        spark_column_names=column_map.get_spark_columns(),
+        snowpark_column_names=column_map.get_snowpark_columns(),
+        snowpark_column_types=target_dtypes,
+        column_metadata=column_map.column_metadata,
+        parent_column_name_map=column_map,
+    )
+
+
+def get_schema_from_result(
+    result: DataFrameContainer,
+) -> StructType:
+    """
+    Get schema from a DataFrameContainer, using cached schema if available to avoid extra queries.
+    """
+    if (
+        hasattr(result, "cached_schema_getter")
+        and result.cached_schema_getter is not None
+    ):
+        return result.cached_schema_getter()
+    else:
+        return result.dataframe.schema
+
+
 def map_deduplicate(
     rel: relation_proto.Relation,
 ) -> DataFrameContainer:
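The two helpers above are hoisted to module level so that map_union and map_except can share them. The key idea in get_schema_from_result is to prefer a cached schema getter over dataframe.schema, which can cost an extra server round trip. A minimal standalone sketch of that fallback pattern, using a hypothetical FakeContainer in place of DataFrameContainer (strings stand in for real StructType schemas):

```python
# Sketch of the cached-schema fallback pattern centralized by
# get_schema_from_result(). FakeContainer is a hypothetical stand-in.
from dataclasses import dataclass
from typing import Callable, Optional

@dataclass
class FakeContainer:
    live_schema: str  # stands in for dataframe.schema (may hit the server)
    cached_schema_getter: Optional[Callable[[], str]] = None

def get_schema(result: FakeContainer) -> str:
    # Prefer the cached getter: no extra query is triggered.
    if getattr(result, "cached_schema_getter", None) is not None:
        return result.cached_schema_getter()
    return result.live_schema

print(get_schema(FakeContainer("from server")))                    # from server
print(get_schema(FakeContainer("from server", lambda: "cached")))  # cached
```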
@@ -205,21 +260,8 @@ def map_union(
 
     # workaround for unstructured type vs structured type
     # Use cached schema if available to avoid triggering extra queries
-    if (
-        hasattr(left_result, "cached_schema_getter")
-        and left_result.cached_schema_getter is not None
-    ):
-        left_schema = left_result.cached_schema_getter()
-    else:
-        left_schema = left_df.schema
-
-    if (
-        hasattr(right_result, "cached_schema_getter")
-        and right_result.cached_schema_getter is not None
-    ):
-        right_schema = right_result.cached_schema_getter()
-    else:
-        right_schema = right_df.schema
+    left_schema = get_schema_from_result(left_result)
+    right_schema = get_schema_from_result(right_result)
 
     left_dtypes = [field.datatype for field in left_schema.fields]
     right_dtypes = [field.datatype for field in right_schema.fields]
@@ -257,6 +299,29 @@ def map_union(
                 # Union of any type with null type is of the other type
                 target_left_dtypes.append(other_t)
                 target_right_dtypes.append(other_t)
+            case (snowpark.types.DecimalType(), snowpark.types.DecimalType()):
+                # Widen decimal types to accommodate both sides
+                # Calculate the maximum scale and maximum integer digits
+                left_integer_digits = left_type.precision - left_type.scale
+                right_integer_digits = right_type.precision - right_type.scale
+
+                # The common type needs to accommodate:
+                # - The maximum number of digits after the decimal point (scale)
+                # - The maximum number of digits before the decimal point (integer digits)
+                common_scale = max(left_type.scale, right_type.scale)
+                common_integer_digits = max(
+                    left_integer_digits, right_integer_digits
+                )
+                common_precision = min(38, common_scale + common_integer_digits)
+
+                # Ensure scale doesn't exceed precision
+                common_scale = min(common_scale, common_precision)
+
+                common_type = snowpark.types.DecimalType(
+                    common_precision, common_scale
+                )
+                target_left_dtypes.append(common_type)
+                target_right_dtypes.append(common_type)
             case (snowpark.types.BooleanType(), _) | (
                 _,
                 snowpark.types.BooleanType(),
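The decimal case above follows the usual widening rule for a common decimal type: keep the larger scale and the larger integer-digit count, capped at Snowflake's maximum precision of 38. A standalone check of the same arithmetic (the input precisions and scales are chosen here purely for illustration):

```python
# Standalone check of the decimal-widening rule added above.
def widen(lp, ls, rp, rs):
    common_scale = max(ls, rs)
    common_int_digits = max(lp - ls, rp - rs)
    common_precision = min(38, common_scale + common_int_digits)
    common_scale = min(common_scale, common_precision)
    return common_precision, common_scale

# DECIMAL(10, 2) union DECIMAL(8, 5): needs 8 integer digits and 5
# fractional digits -> DECIMAL(13, 5).
assert widen(10, 2, 8, 5) == (13, 5)
# Both sides near the cap: precision is clamped to Snowflake's max of 38.
assert widen(38, 0, 38, 10) == (38, 10)
```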
@@ -272,49 +337,24 @@ def map_union(
                     raise exception
                 target_left_dtypes.append(left_type)
                 target_right_dtypes.append(right_type)
+            case (
+                snowpark.types.TimestampType()
+                | snowpark.types.DateType()
+                | snowpark.types._NumericType(),
+                snowpark.types.StringType(),
+            ) | (
+                snowpark.types.StringType(),
+                snowpark.types.TimestampType()
+                | snowpark.types.DateType()
+                | snowpark.types._NumericType(),
+            ) if not spark_sql_ansi_enabled:
+                common_type = snowpark.types.StringType()
+                target_left_dtypes.append(common_type)
+                target_right_dtypes.append(common_type)
             case _:
                 target_left_dtypes.append(left_type)
                 target_right_dtypes.append(right_type)
 
-    def cast_columns(
-        df_container: DataFrameContainer,
-        df_dtypes: list[snowpark.types.DataType],
-        target_dtypes: list[snowpark.types.DataType],
-        column_map: ColumnNameMap,
-    ):
-        df: snowpark.DataFrame = df_container.dataframe
-        if df_dtypes == target_dtypes:
-            return df_container
-        # Use cached schema if available to avoid triggering extra queries
-        if (
-            hasattr(df_container, "cached_schema_getter")
-            and df_container.cached_schema_getter is not None
-        ):
-            df_schema = df_container.cached_schema_getter()
-        else:
-            df_schema = df.schema  # Get current schema
-        new_columns = []
-
-        for i, field in enumerate(df_schema.fields):
-            col_name = field.name
-            current_type = field.datatype
-            target_type = target_dtypes[i]
-
-            if current_type != target_type:
-                new_columns.append(df[col_name].cast(target_type).alias(col_name))
-            else:
-                new_columns.append(df[col_name])
-
-        new_df = df.select(new_columns)
-        return DataFrameContainer.create_with_column_mapping(
-            dataframe=new_df,
-            spark_column_names=column_map.get_spark_columns(),
-            snowpark_column_names=column_map.get_snowpark_columns(),
-            snowpark_column_types=target_dtypes,
-            column_metadata=column_map.column_metadata,
-            parent_column_name_map=column_map,
-        )
-
     left_result = cast_columns(
         left_result,
         left_dtypes,
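The new guarded case mirrors Spark's non-ANSI set-operation coercion, where a union between a string column and a timestamp, date, or numeric column resolves to string rather than failing. A sketch of the reference behavior in plain PySpark, assuming a local Spark session (spark.sql.ansi.enabled defaults to false in Spark 3.5):

```python
# Illustrative PySpark check of the behavior the new case reproduces.
from pyspark.sql import SparkSession

spark = SparkSession.builder.config("spark.sql.ansi.enabled", "false").getOrCreate()
ints = spark.createDataFrame([(1,)], "a int")
strs = spark.createDataFrame([("x",)], "a string")
# With ANSI mode off, the mismatched column resolves to StringType
# instead of raising an analysis error.
print(ints.union(strs).schema)
```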
@@ -527,6 +567,48 @@ def map_except(
     left_df = left_result.dataframe
     right_df = right_result.dataframe
 
+    # workaround for unstructured type vs structured type
+    # Use cached schema if available to avoid triggering extra queries
+    left_schema = get_schema_from_result(left_result)
+    right_schema = get_schema_from_result(right_result)
+
+    left_dtypes = [field.datatype for field in left_schema.fields]
+    right_dtypes = [field.datatype for field in right_schema.fields]
+
+    if left_dtypes != right_dtypes and not rel.set_op.by_name:
+        if len(left_dtypes) != len(right_dtypes):
+            exception = AnalysisException("UNION: the number of columns must match")
+            attach_custom_error_code(exception, ErrorCodes.INVALID_OPERATION)
+            raise exception
+        target_left_dtypes, target_right_dtypes = [], []
+        for left_type, right_type in zip(left_dtypes, right_dtypes):
+            match (left_type, right_type):
+                case (snowpark.types._NumericType(), snowpark.types.StringType()) | (
+                    snowpark.types.StringType(),
+                    snowpark.types._NumericType(),
+                ):
+                    common_type = snowpark.types.StringType()
+                    target_left_dtypes.append(common_type)
+                    target_right_dtypes.append(common_type)
+                case _:
+                    target_left_dtypes.append(left_type)
+                    target_right_dtypes.append(right_type)
+
+        left_result = cast_columns(
+            left_result,
+            left_dtypes,
+            target_left_dtypes,
+            left_result.column_map,
+        )
+        right_result = cast_columns(
+            right_result,
+            right_dtypes,
+            target_right_dtypes,
+            right_result.column_map,
+        )
+        left_df = left_result.dataframe
+        right_df = right_result.dataframe
+
     if rel.set_op.is_all:
         # Snowflake except removes all duplicated rows. In order to handle the case,
         # we add a partition row number column to the df to make duplicated rows unique to
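The trailing context lines refer to the existing EXCEPT ALL workaround: Snowflake's EXCEPT has set semantics, so duplicates are first made unique with a per-group row number, the EXCEPT runs on the widened rows, and the helper column is dropped afterwards. A hedged sketch of that trick expressed in plain PySpark (the helper name "_rn" and the function are hypothetical illustrations, not the query this package generates):

```python
# Sketch of the EXCEPT ALL emulation described in the comment: tag each
# duplicate with a per-group row number so a set-based EXCEPT (subtract)
# preserves multiplicity, then drop the helper column. Assumes both
# inputs share the same column names.
from pyspark.sql import Window, functions as F

def except_all_via_row_number(left, right):
    w = Window.partitionBy(*left.columns).orderBy(F.lit(1))
    l = left.withColumn("_rn", F.row_number().over(w))
    r = right.withColumn("_rn", F.row_number().over(w))
    # subtract() is EXCEPT DISTINCT, but rows are now unique within each input.
    return l.subtract(r).drop("_rn")
```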