snowpark-connect 0.21.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release: this version of snowpark-connect might be problematic.

Files changed (56)
  1. snowflake/snowpark_connect/config.py +19 -14
  2. snowflake/snowpark_connect/error/error_utils.py +32 -0
  3. snowflake/snowpark_connect/error/exceptions.py +4 -0
  4. snowflake/snowpark_connect/expression/hybrid_column_map.py +192 -0
  5. snowflake/snowpark_connect/expression/literal.py +9 -12
  6. snowflake/snowpark_connect/expression/map_cast.py +20 -4
  7. snowflake/snowpark_connect/expression/map_expression.py +8 -1
  8. snowflake/snowpark_connect/expression/map_udf.py +4 -4
  9. snowflake/snowpark_connect/expression/map_unresolved_extract_value.py +32 -5
  10. snowflake/snowpark_connect/expression/map_unresolved_function.py +269 -134
  11. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +8 -8
  12. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +4 -2
  13. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +127 -21
  14. snowflake/snowpark_connect/relation/map_aggregate.py +154 -18
  15. snowflake/snowpark_connect/relation/map_column_ops.py +59 -8
  16. snowflake/snowpark_connect/relation/map_extension.py +58 -24
  17. snowflake/snowpark_connect/relation/map_local_relation.py +8 -1
  18. snowflake/snowpark_connect/relation/map_map_partitions.py +3 -1
  19. snowflake/snowpark_connect/relation/map_row_ops.py +30 -1
  20. snowflake/snowpark_connect/relation/map_sql.py +40 -196
  21. snowflake/snowpark_connect/relation/map_udtf.py +4 -4
  22. snowflake/snowpark_connect/relation/read/map_read.py +2 -1
  23. snowflake/snowpark_connect/relation/read/map_read_json.py +12 -1
  24. snowflake/snowpark_connect/relation/read/map_read_parquet.py +8 -1
  25. snowflake/snowpark_connect/relation/read/reader_config.py +10 -0
  26. snowflake/snowpark_connect/relation/read/utils.py +7 -6
  27. snowflake/snowpark_connect/relation/utils.py +170 -1
  28. snowflake/snowpark_connect/relation/write/map_write.py +306 -87
  29. snowflake/snowpark_connect/server.py +34 -5
  30. snowflake/snowpark_connect/type_mapping.py +6 -2
  31. snowflake/snowpark_connect/utils/describe_query_cache.py +2 -9
  32. snowflake/snowpark_connect/utils/env_utils.py +55 -0
  33. snowflake/snowpark_connect/utils/session.py +21 -4
  34. snowflake/snowpark_connect/utils/telemetry.py +213 -61
  35. snowflake/snowpark_connect/utils/udxf_import_utils.py +14 -0
  36. snowflake/snowpark_connect/version.py +1 -1
  37. snowflake/snowpark_decoder/__init__.py +0 -0
  38. snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.py +36 -0
  39. snowflake/snowpark_decoder/_internal/proto/generated/DataframeProcessorMsg_pb2.pyi +156 -0
  40. snowflake/snowpark_decoder/dp_session.py +111 -0
  41. snowflake/snowpark_decoder/spark_decoder.py +76 -0
  42. {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/METADATA +2 -2
  43. {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/RECORD +55 -44
  44. {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/top_level.txt +1 -0
  45. spark/__init__.py +0 -0
  46. spark/connect/__init__.py +0 -0
  47. spark/connect/envelope_pb2.py +31 -0
  48. spark/connect/envelope_pb2.pyi +46 -0
  49. snowflake/snowpark_connect/includes/jars/jackson-mapper-asl-1.9.13.jar +0 -0
  50. {snowpark_connect-0.21.0.data → snowpark_connect-0.23.0.data}/scripts/snowpark-connect +0 -0
  51. {snowpark_connect-0.21.0.data → snowpark_connect-0.23.0.data}/scripts/snowpark-session +0 -0
  52. {snowpark_connect-0.21.0.data → snowpark_connect-0.23.0.data}/scripts/snowpark-submit +0 -0
  53. {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/WHEEL +0 -0
  54. {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/LICENSE-binary +0 -0
  55. {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/LICENSE.txt +0 -0
  56. {snowpark_connect-0.21.0.dist-info → snowpark_connect-0.23.0.dist-info}/licenses/NOTICE-binary +0 -0
@@ -83,6 +83,7 @@ from snowflake.snowpark_connect.utils.context import (
     set_session_id,
     set_spark_version,
 )
+from snowflake.snowpark_connect.utils.env_utils import get_int_from_env
 from snowflake.snowpark_connect.utils.interrupt import (
     interrupt_all_queries,
     interrupt_queries_with_tag,
@@ -700,11 +701,27 @@ def _serve(
         return
 
     server_options = [
-        ("grpc.max_receive_message_length", _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE),
-        ("grpc.max_metadata_size", _SPARK_CONNECT_GRPC_MAX_METADATA_SIZE),
+        (
+            "grpc.max_receive_message_length",
+            get_int_from_env(
+                "SNOWFLAKE_GRPC_MAX_MESSAGE_SIZE",
+                _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE,
+            ),
+        ),
+        (
+            "grpc.max_metadata_size",
+            get_int_from_env(
+                "SNOWFLAKE_GRPC_MAX_METADATA_SIZE",
+                _SPARK_CONNECT_GRPC_MAX_METADATA_SIZE,
+            ),
+        ),
         (
             "grpc.absolute_max_metadata_size",
-            _SPARK_CONNECT_GRPC_MAX_METADATA_SIZE * 2,
+            get_int_from_env(
+                "SNOWFLAKE_GRPC_MAX_METADATA_SIZE",
+                _SPARK_CONNECT_GRPC_MAX_METADATA_SIZE,
+            )
+            * 2,
         ),
     ]
     server = grpc.server(
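Both gRPC limits are now read through get_int_from_env, so they can be raised without code changes. A minimal sketch, assuming the variables are exported in the server process before _serve builds the options (the byte counts are illustrative):

import os

# Illustrative overrides; get_int_from_env falls back to the built-in defaults
# when these variables are unset or not valid integers.
os.environ["SNOWFLAKE_GRPC_MAX_MESSAGE_SIZE"] = str(256 * 1024 * 1024)  # 256 MiB
os.environ["SNOWFLAKE_GRPC_MAX_METADATA_SIZE"] = str(1024 * 1024)       # 1 MiB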
@@ -812,8 +829,11 @@ class UnixDomainSocketChannelBuilder(ChannelBuilder):
     Spark Connect gRPC channel builder for Unix domain sockets
     """
 
-    def __init__(self, channelOptions: Optional[List[Tuple[str, Any]]] = None) -> None:
-        url: str = get_client_url()
+    def __init__(
+        self, url: str = None, channelOptions: Optional[List[Tuple[str, Any]]] = None
+    ) -> None:
+        if url is None:
+            url = get_client_url()
         if url[:6] != "unix:/" or len(url) < 7:
             raise PySparkValueError(
                 error_class="INVALID_CONNECT_URL",
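The builder now accepts an explicit url instead of always using get_client_url(). A short sketch, assuming the class is importable from snowflake.snowpark_connect.server and that the socket path below exists (both are assumptions); a URL not starting with "unix:/" raises INVALID_CONNECT_URL:

from snowflake.snowpark_connect.server import UnixDomainSocketChannelBuilder  # module path assumed

# Point the channel at an explicit Unix domain socket (path is illustrative).
builder = UnixDomainSocketChannelBuilder(url="unix:/tmp/snowpark-connect.sock")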
@@ -981,6 +1001,7 @@ def start_session(
     stop_event: threading.Event = None,
     snowpark_session: Optional[snowpark.Session] = None,
     connection_parameters: Optional[Dict[str, str]] = None,
+    max_grpc_message_size: int = _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE,
 ) -> threading.Thread | None:
     """
     Starts Spark Connect server connected to Snowflake. No-op if the Server is already running.
@@ -1003,6 +1024,14 @@
         provided, the `snowpark_session` parameter must be None.
     """
     try:
+        # Changing the value of our global variable based on the grpc message size provided by the user.
+        global _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE
+        _SPARK_CONNECT_GRPC_MAX_MESSAGE_SIZE = max_grpc_message_size
+
+        from pyspark.sql.connect.client import ChannelBuilder
+
+        ChannelBuilder.MAX_MESSAGE_LENGTH = max_grpc_message_size
+
         if os.environ.get("SPARK_ENV_LOADED"):
             raise RuntimeError(
                 "Snowpark Connect cannot be run inside of a Spark environment"
@@ -59,7 +59,7 @@ def _get_struct_type_class():
 
 
 @cache
-def _get_python_sql_utils_class():
+def get_python_sql_utils_class():
     return jpype.JClass("org.apache.spark.sql.api.python.PythonSQLUtils")
 
 
@@ -70,7 +70,7 @@ def parse_ddl_with_spark_scala(ddl_string: str) -> pyspark.sql.types.DataType:
     This mimics pysparks.ddl parsing logic pyspark.sql.types._py_parse_datatype_string
     """
     struct_type_class = _get_struct_type_class()
-    python_sql_utils = _get_python_sql_utils_class()
+    python_sql_utils = get_python_sql_utils_class()
 
     try:
         # DDL format, "fieldname datatype, fieldname datatype".
@@ -324,6 +324,8 @@ def cast_to_match_snowpark_type(
             return str(content)
         case snowpark.types.VariantType:
             return str(content)
+        case snowpark.types.TimestampType:
+            return str(content)
         case _:
             raise SnowparkConnectNotImplementedError(
                 f"Unsupported snowpark data type in casting: {data_type}"
@@ -779,6 +781,8 @@ def map_simple_types(simple_type: str) -> snowpark.types.DataType:
             return snowpark.types.TimestampType()
         case "timestamp_ntz":
             return snowpark.types.TimestampType(snowpark.types.TimestampTimeZone.NTZ)
+        case "timestamp_ltz":
+            return snowpark.types.TimestampType(snowpark.types.TimestampTimeZone.LTZ)
         case "day_time_interval":
             # this is not a column type in snowflake so there won't be a dataframe column
             # with this, for now this type won't make any sense
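With the new branch, Spark's timestamp_ltz simple type maps to a Snowpark TimestampType carrying the LTZ time zone flag instead of falling through to the unsupported-type path. A small sketch of the expected mapping, assuming map_simple_types is importable from snowflake.snowpark_connect.type_mapping (the import path is an assumption):

import snowflake.snowpark.types as T
from snowflake.snowpark_connect.type_mapping import map_simple_types  # module path assumed

ltz_type = map_simple_types("timestamp_ltz")
assert isinstance(ltz_type, T.TimestampType)
print(ltz_type)  # expected to carry TimestampTimeZone.LTZ per the branch above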
@@ -131,21 +131,14 @@ def instrument_session_for_describe_cache(session: snowpark.Session):
             logger.debug(f"DDL detected, clearing describe query cache: '{query}'")
             cache.clear()
 
-    def report_query(qid: str, is_internal: bool) -> None:
-        if is_internal:
-            telemetry.report_internal_query()
-        elif qid:
-            telemetry.report_query_id(qid)
-
     def wrap_execute(wrapped_fn):
         def fn(query: str, **kwargs):
             update_cache_for_query(query)
-            is_internal = kwargs.get("_is_internal", False)
             try:
                 result = wrapped_fn(query, **kwargs)
-                report_query(result.sfqid, is_internal)
+                telemetry.report_query(result, **kwargs)
             except Exception as e:
-                report_query(e.sfqid, is_internal)
+                telemetry.report_query(e, **kwargs)
                 raise e
             return result
 
@@ -0,0 +1,55 @@
+#
+# Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
+#
+
+"""
+Environment variable utilities for Snowpark Connect.
+"""
+
+import os
+
+from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
+
+
+def get_int_from_env(env_var: str, default: int) -> int:
+    """
+    Safely get integer value from environment variable with fallback to default.
+
+    Args:
+        env_var: Environment variable name
+        default: Default integer value if env var is not set or invalid
+
+    Returns:
+        Integer value from environment variable or default
+
+    Raises:
+        TypeError: If default is not an integer
+
+    Examples:
+        >>> get_int_from_env("MAX_WORKERS", 10)
+        10
+        >>> os.environ["MAX_WORKERS"] = "20"
+        >>> get_int_from_env("MAX_WORKERS", 10)
+        20
+        >>> os.environ["MAX_WORKERS"] = "invalid"
+        >>> get_int_from_env("MAX_WORKERS", 10)  # logs warning, returns 10
+        10
+    """
+    # Validate that default is actually an integer
+    if not isinstance(default, int):
+        raise TypeError(
+            f"Default value must be an integer, got {type(default).__name__}: {default}"
+        )
+
+    value = os.getenv(env_var)
+    if value is None:
+        return default
+
+    try:
+        return int(value)
+    except ValueError:
+        logger.warning(
+            f"Invalid integer value for environment variable {env_var}: '{value}', "
+            f"using default: {default}"
+        )
+        return default
@@ -73,6 +73,27 @@ def configure_snowpark_session(session: snowpark.Session):
     session.connection.arrow_number_to_decimal_setter = True
     session.custom_package_usage_config["enabled"] = True
 
+    default_fallback_timezone = "UTC"
+    if global_config.spark_sql_session_timeZone is None:
+        try:
+            result = session.sql("SHOW PARAMETERS LIKE 'TIMEZONE'").collect()
+            if result and len(result) > 0:
+                value = result[0]["value"]
+                logger.warning(
+                    f"Using Snowflake session timezone parameter as fallback: {value}"
+                )
+            else:
+                value = default_fallback_timezone
+                logger.warning(
+                    f"Could not determine timezone from parameters, defaulting to {default_fallback_timezone}"
+                )
+        except Exception as e:
+            value = default_fallback_timezone
+            logger.warning(
+                f"Could not query Snowflake timezone parameter ({e}), defaulting to {default_fallback_timezone}"
+            )
+        global_config.spark_sql_session_timeZone = value
+
     session_params = {
         "TIMESTAMP_TYPE_MAPPING": "TIMESTAMP_LTZ",
         "TIMEZONE": f"'{global_config.spark_sql_session_timeZone}'",
@@ -181,7 +202,3 @@ def set_query_tags(spark_tags: Sequence[str]) -> None:
 
     if spark_tags_str != snowpark_session.query_tag:
         snowpark_session.query_tag = spark_tags_str
-
-
-def get_python_udxf_import_files(session: snowpark.Session) -> str:
-    return ",".join([file for file in [*session._python_files, *session._import_files]])