snowpark-connect 0.27.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192)
  1. snowflake/snowpark_connect/__init__.py +1 -0
  2. snowflake/snowpark_connect/analyze_plan/map_tree_string.py +8 -4
  3. snowflake/snowpark_connect/client/__init__.py +15 -0
  4. snowflake/snowpark_connect/client/error_utils.py +30 -0
  5. snowflake/snowpark_connect/client/exceptions.py +36 -0
  6. snowflake/snowpark_connect/client/query_results.py +90 -0
  7. snowflake/snowpark_connect/client/server.py +680 -0
  8. snowflake/snowpark_connect/client/utils/__init__.py +10 -0
  9. snowflake/snowpark_connect/client/utils/session.py +85 -0
  10. snowflake/snowpark_connect/column_name_handler.py +404 -243
  11. snowflake/snowpark_connect/column_qualifier.py +43 -0
  12. snowflake/snowpark_connect/config.py +237 -23
  13. snowflake/snowpark_connect/constants.py +2 -0
  14. snowflake/snowpark_connect/dataframe_container.py +102 -8
  15. snowflake/snowpark_connect/date_time_format_mapping.py +71 -13
  16. snowflake/snowpark_connect/error/error_codes.py +50 -0
  17. snowflake/snowpark_connect/error/error_utils.py +172 -23
  18. snowflake/snowpark_connect/error/exceptions.py +13 -4
  19. snowflake/snowpark_connect/execute_plan/map_execution_command.py +15 -160
  20. snowflake/snowpark_connect/execute_plan/map_execution_root.py +26 -20
  21. snowflake/snowpark_connect/execute_plan/utils.py +5 -1
  22. snowflake/snowpark_connect/expression/function_defaults.py +9 -2
  23. snowflake/snowpark_connect/expression/hybrid_column_map.py +53 -5
  24. snowflake/snowpark_connect/expression/literal.py +37 -13
  25. snowflake/snowpark_connect/expression/map_cast.py +123 -5
  26. snowflake/snowpark_connect/expression/map_expression.py +80 -27
  27. snowflake/snowpark_connect/expression/map_extension.py +322 -12
  28. snowflake/snowpark_connect/expression/map_sql_expression.py +316 -81
  29. snowflake/snowpark_connect/expression/map_udf.py +85 -20
  30. snowflake/snowpark_connect/expression/map_unresolved_attribute.py +451 -173
  31. snowflake/snowpark_connect/expression/map_unresolved_function.py +2748 -746
  32. snowflake/snowpark_connect/expression/map_unresolved_star.py +87 -23
  33. snowflake/snowpark_connect/expression/map_update_fields.py +70 -18
  34. snowflake/snowpark_connect/expression/map_window_function.py +18 -3
  35. snowflake/snowpark_connect/includes/jars/{scala-library-2.12.18.jar → sas-scala-udf_2.12-0.2.0.jar} +0 -0
  36. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py +1 -1
  37. snowflake/snowpark_connect/includes/python/pyspark/sql/connect/streaming/worker/listener_worker.py +1 -1
  38. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.py +12 -10
  39. snowflake/snowpark_connect/proto/snowflake_expression_ext_pb2.pyi +14 -2
  40. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.py +10 -8
  41. snowflake/snowpark_connect/proto/snowflake_relation_ext_pb2.pyi +13 -6
  42. snowflake/snowpark_connect/relation/catalogs/abstract_spark_catalog.py +65 -17
  43. snowflake/snowpark_connect/relation/catalogs/snowflake_catalog.py +297 -49
  44. snowflake/snowpark_connect/relation/catalogs/utils.py +12 -4
  45. snowflake/snowpark_connect/relation/io_utils.py +110 -10
  46. snowflake/snowpark_connect/relation/map_aggregate.py +196 -255
  47. snowflake/snowpark_connect/relation/map_catalog.py +5 -1
  48. snowflake/snowpark_connect/relation/map_column_ops.py +264 -96
  49. snowflake/snowpark_connect/relation/map_extension.py +263 -29
  50. snowflake/snowpark_connect/relation/map_join.py +683 -442
  51. snowflake/snowpark_connect/relation/map_local_relation.py +28 -1
  52. snowflake/snowpark_connect/relation/map_map_partitions.py +83 -8
  53. snowflake/snowpark_connect/relation/map_relation.py +48 -19
  54. snowflake/snowpark_connect/relation/map_row_ops.py +310 -91
  55. snowflake/snowpark_connect/relation/map_show_string.py +13 -6
  56. snowflake/snowpark_connect/relation/map_sql.py +1233 -222
  57. snowflake/snowpark_connect/relation/map_stats.py +48 -9
  58. snowflake/snowpark_connect/relation/map_subquery_alias.py +11 -2
  59. snowflake/snowpark_connect/relation/map_udtf.py +14 -4
  60. snowflake/snowpark_connect/relation/read/jdbc_read_dbapi.py +53 -14
  61. snowflake/snowpark_connect/relation/read/map_read.py +134 -43
  62. snowflake/snowpark_connect/relation/read/map_read_csv.py +255 -45
  63. snowflake/snowpark_connect/relation/read/map_read_jdbc.py +17 -5
  64. snowflake/snowpark_connect/relation/read/map_read_json.py +320 -85
  65. snowflake/snowpark_connect/relation/read/map_read_parquet.py +142 -27
  66. snowflake/snowpark_connect/relation/read/map_read_partitioned_parquet.py +142 -0
  67. snowflake/snowpark_connect/relation/read/map_read_socket.py +11 -3
  68. snowflake/snowpark_connect/relation/read/map_read_table.py +82 -5
  69. snowflake/snowpark_connect/relation/read/map_read_text.py +18 -3
  70. snowflake/snowpark_connect/relation/read/metadata_utils.py +170 -0
  71. snowflake/snowpark_connect/relation/read/reader_config.py +36 -3
  72. snowflake/snowpark_connect/relation/read/utils.py +50 -5
  73. snowflake/snowpark_connect/relation/stage_locator.py +91 -55
  74. snowflake/snowpark_connect/relation/utils.py +128 -5
  75. snowflake/snowpark_connect/relation/write/jdbc_write_dbapi.py +19 -3
  76. snowflake/snowpark_connect/relation/write/map_write.py +929 -319
  77. snowflake/snowpark_connect/relation/write/map_write_jdbc.py +8 -2
  78. snowflake/snowpark_connect/resources/java_udfs-1.0-SNAPSHOT.jar +0 -0
  79. snowflake/snowpark_connect/resources_initializer.py +110 -48
  80. snowflake/snowpark_connect/server.py +546 -456
  81. snowflake/snowpark_connect/server_common/__init__.py +500 -0
  82. snowflake/snowpark_connect/snowflake_session.py +65 -0
  83. snowflake/snowpark_connect/start_server.py +53 -5
  84. snowflake/snowpark_connect/type_mapping.py +349 -27
  85. snowflake/snowpark_connect/typed_column.py +9 -7
  86. snowflake/snowpark_connect/utils/artifacts.py +9 -8
  87. snowflake/snowpark_connect/utils/cache.py +49 -27
  88. snowflake/snowpark_connect/utils/concurrent.py +36 -1
  89. snowflake/snowpark_connect/utils/context.py +187 -37
  90. snowflake/snowpark_connect/utils/describe_query_cache.py +68 -53
  91. snowflake/snowpark_connect/utils/env_utils.py +5 -1
  92. snowflake/snowpark_connect/utils/expression_transformer.py +172 -0
  93. snowflake/snowpark_connect/utils/identifiers.py +137 -3
  94. snowflake/snowpark_connect/utils/io_utils.py +57 -1
  95. snowflake/snowpark_connect/utils/java_stored_procedure.py +125 -0
  96. snowflake/snowpark_connect/utils/java_udaf_utils.py +303 -0
  97. snowflake/snowpark_connect/utils/java_udtf_utils.py +239 -0
  98. snowflake/snowpark_connect/utils/jvm_udf_utils.py +248 -0
  99. snowflake/snowpark_connect/utils/open_telemetry.py +516 -0
  100. snowflake/snowpark_connect/utils/pandas_udtf_utils.py +8 -4
  101. snowflake/snowpark_connect/utils/patch_spark_line_number.py +181 -0
  102. snowflake/snowpark_connect/utils/profiling.py +25 -8
  103. snowflake/snowpark_connect/utils/scala_udf_utils.py +101 -332
  104. snowflake/snowpark_connect/utils/sequence.py +21 -0
  105. snowflake/snowpark_connect/utils/session.py +64 -28
  106. snowflake/snowpark_connect/utils/snowpark_connect_logging.py +51 -9
  107. snowflake/snowpark_connect/utils/spcs_logger.py +290 -0
  108. snowflake/snowpark_connect/utils/telemetry.py +163 -22
  109. snowflake/snowpark_connect/utils/temporary_view_cache.py +67 -0
  110. snowflake/snowpark_connect/utils/temporary_view_helper.py +334 -0
  111. snowflake/snowpark_connect/utils/udf_cache.py +117 -41
  112. snowflake/snowpark_connect/utils/udf_helper.py +39 -37
  113. snowflake/snowpark_connect/utils/udf_utils.py +133 -14
  114. snowflake/snowpark_connect/utils/udtf_helper.py +8 -1
  115. snowflake/snowpark_connect/utils/udtf_utils.py +46 -31
  116. snowflake/snowpark_connect/utils/upload_java_jar.py +57 -0
  117. snowflake/snowpark_connect/version.py +1 -1
  118. snowflake/snowpark_decoder/dp_session.py +6 -2
  119. snowflake/snowpark_decoder/spark_decoder.py +12 -0
  120. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-submit +2 -2
  121. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/METADATA +14 -7
  122. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/RECORD +129 -167
  123. snowflake/snowpark_connect/hidden_column.py +0 -39
  124. snowflake/snowpark_connect/includes/jars/antlr4-runtime-4.9.3.jar +0 -0
  125. snowflake/snowpark_connect/includes/jars/commons-cli-1.5.0.jar +0 -0
  126. snowflake/snowpark_connect/includes/jars/commons-codec-1.16.1.jar +0 -0
  127. snowflake/snowpark_connect/includes/jars/commons-collections-3.2.2.jar +0 -0
  128. snowflake/snowpark_connect/includes/jars/commons-collections4-4.4.jar +0 -0
  129. snowflake/snowpark_connect/includes/jars/commons-compiler-3.1.9.jar +0 -0
  130. snowflake/snowpark_connect/includes/jars/commons-compress-1.26.0.jar +0 -0
  131. snowflake/snowpark_connect/includes/jars/commons-crypto-1.1.0.jar +0 -0
  132. snowflake/snowpark_connect/includes/jars/commons-dbcp-1.4.jar +0 -0
  133. snowflake/snowpark_connect/includes/jars/commons-io-2.16.1.jar +0 -0
  134. snowflake/snowpark_connect/includes/jars/commons-lang-2.6.jar +0 -0
  135. snowflake/snowpark_connect/includes/jars/commons-lang3-3.12.0.jar +0 -0
  136. snowflake/snowpark_connect/includes/jars/commons-logging-1.1.3.jar +0 -0
  137. snowflake/snowpark_connect/includes/jars/commons-math3-3.6.1.jar +0 -0
  138. snowflake/snowpark_connect/includes/jars/commons-pool-1.5.4.jar +0 -0
  139. snowflake/snowpark_connect/includes/jars/commons-text-1.10.0.jar +0 -0
  140. snowflake/snowpark_connect/includes/jars/hadoop-client-api-trimmed-3.3.4.jar +0 -0
  141. snowflake/snowpark_connect/includes/jars/jackson-annotations-2.15.2.jar +0 -0
  142. snowflake/snowpark_connect/includes/jars/jackson-core-2.15.2.jar +0 -0
  143. snowflake/snowpark_connect/includes/jars/jackson-core-asl-1.9.13.jar +0 -0
  144. snowflake/snowpark_connect/includes/jars/jackson-databind-2.15.2.jar +0 -0
  145. snowflake/snowpark_connect/includes/jars/jackson-dataformat-yaml-2.15.2.jar +0 -0
  146. snowflake/snowpark_connect/includes/jars/jackson-datatype-jsr310-2.15.2.jar +0 -0
  147. snowflake/snowpark_connect/includes/jars/jackson-module-scala_2.12-2.15.2.jar +0 -0
  148. snowflake/snowpark_connect/includes/jars/json4s-ast_2.12-3.7.0-M11.jar +0 -0
  149. snowflake/snowpark_connect/includes/jars/json4s-core_2.12-3.7.0-M11.jar +0 -0
  150. snowflake/snowpark_connect/includes/jars/json4s-jackson_2.12-3.7.0-M11.jar +0 -0
  151. snowflake/snowpark_connect/includes/jars/json4s-native_2.12-3.7.0-M11.jar +0 -0
  152. snowflake/snowpark_connect/includes/jars/json4s-scalap_2.12-3.7.0-M11.jar +0 -0
  153. snowflake/snowpark_connect/includes/jars/kryo-shaded-4.0.2.jar +0 -0
  154. snowflake/snowpark_connect/includes/jars/log4j-1.2-api-2.20.0.jar +0 -0
  155. snowflake/snowpark_connect/includes/jars/log4j-api-2.20.0.jar +0 -0
  156. snowflake/snowpark_connect/includes/jars/log4j-core-2.20.0.jar +0 -0
  157. snowflake/snowpark_connect/includes/jars/log4j-slf4j2-impl-2.20.0.jar +0 -0
  158. snowflake/snowpark_connect/includes/jars/paranamer-2.8.3.jar +0 -0
  159. snowflake/snowpark_connect/includes/jars/paranamer-2.8.jar +0 -0
  160. snowflake/snowpark_connect/includes/jars/sas-scala-udf_2.12-0.1.0.jar +0 -0
  161. snowflake/snowpark_connect/includes/jars/scala-collection-compat_2.12-2.7.0.jar +0 -0
  162. snowflake/snowpark_connect/includes/jars/scala-parser-combinators_2.12-2.3.0.jar +0 -0
  163. snowflake/snowpark_connect/includes/jars/scala-reflect-2.12.18.jar +0 -0
  164. snowflake/snowpark_connect/includes/jars/scala-xml_2.12-2.1.0.jar +0 -0
  165. snowflake/snowpark_connect/includes/jars/slf4j-api-2.0.7.jar +0 -0
  166. snowflake/snowpark_connect/includes/jars/spark-catalyst_2.12-3.5.6.jar +0 -0
  167. snowflake/snowpark_connect/includes/jars/spark-common-utils_2.12-3.5.6.jar +0 -0
  168. snowflake/snowpark_connect/includes/jars/spark-connect-client-jvm_2.12-3.5.6.jar +0 -0
  169. snowflake/snowpark_connect/includes/jars/spark-core_2.12-3.5.6.jar +0 -0
  170. snowflake/snowpark_connect/includes/jars/spark-graphx_2.12-3.5.6.jar +0 -0
  171. snowflake/snowpark_connect/includes/jars/spark-hive-thriftserver_2.12-3.5.6.jar +0 -0
  172. snowflake/snowpark_connect/includes/jars/spark-hive_2.12-3.5.6.jar +0 -0
  173. snowflake/snowpark_connect/includes/jars/spark-kvstore_2.12-3.5.6.jar +0 -0
  174. snowflake/snowpark_connect/includes/jars/spark-launcher_2.12-3.5.6.jar +0 -0
  175. snowflake/snowpark_connect/includes/jars/spark-mesos_2.12-3.5.6.jar +0 -0
  176. snowflake/snowpark_connect/includes/jars/spark-mllib-local_2.12-3.5.6.jar +0 -0
  177. snowflake/snowpark_connect/includes/jars/spark-network-common_2.12-3.5.6.jar +0 -0
  178. snowflake/snowpark_connect/includes/jars/spark-network-shuffle_2.12-3.5.6.jar +0 -0
  179. snowflake/snowpark_connect/includes/jars/spark-repl_2.12-3.5.6.jar +0 -0
  180. snowflake/snowpark_connect/includes/jars/spark-sketch_2.12-3.5.6.jar +0 -0
  181. snowflake/snowpark_connect/includes/jars/spark-sql-api_2.12-3.5.6.jar +0 -0
  182. snowflake/snowpark_connect/includes/jars/spark-sql_2.12-3.5.6.jar +0 -0
  183. snowflake/snowpark_connect/includes/jars/spark-tags_2.12-3.5.6.jar +0 -0
  184. snowflake/snowpark_connect/includes/jars/spark-unsafe_2.12-3.5.6.jar +0 -0
  185. snowflake/snowpark_connect/includes/jars/spark-yarn_2.12-3.5.6.jar +0 -0
  186. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-connect +0 -0
  187. {snowpark_connect-0.27.0.data → snowpark_connect-1.6.0.data}/scripts/snowpark-session +0 -0
  188. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/WHEEL +0 -0
  189. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE-binary +0 -0
  190. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/LICENSE.txt +0 -0
  191. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/licenses/NOTICE-binary +0 -0
  192. {snowpark_connect-0.27.0.dist-info → snowpark_connect-1.6.0.dist-info}/top_level.txt +0 -0
@@ -26,7 +26,6 @@ def df_cache_map_get(key: Tuple[str, any]) -> DataFrameContainer | None:
  def df_cache_map_put_if_absent(
  key: Tuple[str, any],
  compute_fn: Callable[[], DataFrameContainer | pandas.DataFrame],
- materialize: bool,
  ) -> DataFrameContainer | pandas.DataFrame:
  """
  Put a DataFrame container into the cache map if the key is absent. Optionally, as side effect, materialize
@@ -35,7 +34,6 @@ def df_cache_map_put_if_absent(
  Args:
  key (Tuple[str, int]): The key to insert into the cache map (session_id, plan_id).
  compute_fn (Callable[[], DataFrameContainer | pandas.DataFrame]): A function to compute the DataFrame container if the key is absent.
- materialize (bool): Whether to materialize the DataFrame.

  Returns:
  DataFrameContainer | pandas.DataFrame: The cached or newly computed DataFrame container.
@@ -45,7 +43,7 @@ def df_cache_map_put_if_absent(
  container: DataFrameContainer,
  ) -> DataFrameContainer:

- if materialize:
+ if container.can_be_materialized:
  df = container.dataframe
  cached_result = df.cache_result()
  return DataFrameContainer(
@@ -58,30 +56,54 @@ def df_cache_map_put_if_absent(
  return container

  with _cache_map_lock:
- if key not in df_cache_map:
- result = compute_fn()
-
- # check cache again, since recursive call in compute_fn could've already cached the result.
- # we want return it, instead of saving it again. This is important if materialize = True
- # because materialization is expensive operation that we don't want to do twice.
- if key in df_cache_map:
- return df_cache_map[key]
-
- # only cache DataFrameContainer, but not pandas result.
- # Pandas result is only returned when df.show() is called, where we convert
- # a dataframe to a string representation.
- # We don't expect map_relation would return pandas df here because that would
- # be equivalent to calling df.show().cache(), which is not allowed.
- if isinstance(result, DataFrameContainer):
- df_cache_map[key] = _object_to_cache(result)
- else:
- # This is not expected, but we will just log a warning
- logger.warning(
- "Unexpected pandas dataframe returned for caching. Ignoring the cache call."
- )
- return result
-
- return df_cache_map[key]
+ if key in df_cache_map:
+ return df_cache_map[key]
+
+ # the compute_fn is not guaranteed to be called only once, but it's acceptable based upon the following analysis:
+ # there are in total 5 occurrences of passing compute_fn callback falling into two categories:
+ # 2 occurrences as lambda that needs to be computed:
+ # 1) server::AnalyzePlan case "persist"
+ # 2) server::AddArtifacts case "read"
+ # 3 occurrences as lambda that simply returns pre-computed dataframe without any computation:
+ # 1) map_relation case "local_relation"
+ # 2) map_relation case "sample"
+ # 3) map_read case "data_source"
+ # based upon the analysis of the code, the chance of concurrently calling compute_fn for the same key is very low and if it happens
+ # repeating the computation will not affect the result.
+ # This is a trade-off between implementation simplicity and fine-grained locking.
+ result = compute_fn()
+
+ if isinstance(result, DataFrameContainer) and not result.can_be_cached:
+ return result
+
+ # check cache again, since recursive call in compute_fn could've already cached the result.
+ # we want return it, instead of saving it again. This is important if materialize = True
+ # because materialization is expensive operation that we don't want to do twice.
+ with _cache_map_lock:
+ if key in df_cache_map:
+ return df_cache_map[key]
+
+ # only cache DataFrameContainer, but not pandas result.
+ # Pandas result is only returned when df.show() is called, where we convert
+ # a dataframe to a string representation.
+ # We don't expect map_relation would return pandas df here because that would
+ # be equivalent to calling df.show().cache(), which is not allowed.
+ if isinstance(result, DataFrameContainer):
+ # The _object_to_cache function is not guaranteed to be called only once.
+ # In rare multithreading cases, this may result in duplicate temporary table
+ # creation because df.cache_result() materializes the DataFrame into a temp table each time.
+ # This is acceptable because correctness is not affected, the likelihood is very low, and
+ # it simplifies the implementation by avoiding fine-grained locking.
+ cached_result = _object_to_cache(result)
+ with _cache_map_lock:
+ df_cache_map[key] = cached_result
+ return df_cache_map[key]
+ else:
+ # This is not expected, but we will just log a warning
+ logger.warning(
+ "Unexpected pandas dataframe returned for caching. Ignoring the cache call."
+ )
+ return result


  def df_cache_map_pop(key: Tuple[str, any]) -> None:
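
The rewritten put-if-absent path above moves the potentially expensive compute_fn() call out of the cache lock and re-checks the key before storing. A minimal standalone sketch of that double-checked pattern, using hypothetical names rather than the package's own helpers:

import threading
from typing import Callable, Dict, Tuple

_cache: Dict[Tuple[str, int], object] = {}   # hypothetical stand-in for df_cache_map
_lock = threading.Lock()

def put_if_absent(key: Tuple[str, int], compute_fn: Callable[[], object]) -> object:
    # fast path: return an existing entry without computing anything
    with _lock:
        if key in _cache:
            return _cache[key]
    # compute outside the lock; under contention this may run more than once,
    # which is acceptable only if compute_fn is idempotent (as argued in the diff comments)
    result = compute_fn()
    # re-check before storing: a recursive or concurrent call may have filled the key already
    with _lock:
        return _cache.setdefault(key, result)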
@@ -5,10 +5,11 @@
  import threading
  from collections.abc import Mapping
  from copy import copy
- from typing import TypeVar
+ from typing import Callable, TypeVar

  K = TypeVar("K")
  V = TypeVar("V")
+ T = TypeVar("T")


  class SynchronizedDict(Mapping[K, V]):
@@ -52,6 +53,10 @@ class SynchronizedDict(Mapping[K, V]):
  with self._lock.writer():
  self._dict[key] = value

+ def __delitem__(self, key: K) -> None:
+ with self._lock.writer():
+ del self._dict[key]
+
  def __contains__(self, key: K) -> bool:
  with self._lock.reader():
  return key in self._dict
@@ -69,6 +74,36 @@ class SynchronizedDict(Mapping[K, V]):
  self._dict.clear()


+ class SynchronizedList:
+ def __init__(self, in_list: list[T] | None = None) -> None:
+ self._lock = ReadWriteLock()
+ self._list = in_list if in_list is not None else []
+
+ def append(self, item: T) -> None:
+ with self._lock.writer():
+ self._list.append(item)
+
+ def clear(self) -> None:
+ with self._lock.writer():
+ self._list.clear()
+
+ def copy(self) -> list[T]:
+ with self._lock.reader():
+ return self._list.copy()
+
+ def filter(self, predicate: Callable[[T], bool]) -> None:
+ with self._lock.writer():
+ self._list = [item for item in self._list if predicate(item)]
+
+ def __len__(self) -> int:
+ with self._lock.reader():
+ return len(self._list)
+
+ def __iter__(self):
+ with self._lock.reader():
+ return iter(self._list.copy())
+
+
  class ReadWriteLock:
  class _Reader:
  def __init__(self, lock) -> None:
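
The new SynchronizedList above wraps a plain list behind the existing ReadWriteLock so appends, filters, and iteration are safe across threads, and iteration works on a copy. A small usage sketch based on the signatures in this hunk (the import path is taken from the later describe_query_cache hunk; the values are made up):

from snowflake.snowpark_connect.utils.concurrent import SynchronizedList

calls = SynchronizedList()
calls.append({"query": "DESCRIBE TABLE T1", "ms": 12})
calls.append({"query": "SELECT 1", "ms": 3})

# filter() keeps only items matching the predicate, rewriting the list under the writer lock
calls.filter(lambda c: c["ms"] >= 10)

# __iter__ returns an iterator over a copy, so concurrent appends cannot invalidate the loop
for call in calls:
    print(call["query"], call["ms"])

assert len(calls) == 1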
@@ -2,10 +2,12 @@
  # Copyright (c) 2012-2025 Snowflake Computing Inc. All rights reserved.
  #

+ import os
  import re
+ import threading
  from contextlib import contextmanager
  from contextvars import ContextVar
- from typing import Mapping, Optional
+ from typing import Iterator, Mapping, Optional

  import pyspark.sql.connect.proto.expressions_pb2 as expressions_proto

@@ -14,25 +16,29 @@ from snowflake.snowpark_connect.typed_column import TypedColumn

  # TODO: remove session id from context when we host SAS in Snowflake server

- _session_id = ContextVar[str]("_session_id")
+ _spark_session_id = ContextVar[str]("_spark_session_id")
  _plan_id_map = ContextVar[Mapping[int, DataFrameContainer]]("_plan_id_map")
  _alias_map = ContextVar[Mapping[str, DataFrameContainer | None]]("_alias_map")
  _spark_version = ContextVar[str]("_spark_version")
  _is_aggregate_function = ContextVar(
  "_is_aggregate_function", default=("default", False)
  )
+ _grouping_by_scala_udf_key = ContextVar[bool](
+ "_grouping_by_scala_udf_key", default=False
+ )
  _is_evaluating_sql = ContextVar[bool]("_is_evaluating_sql", default=False)
  _is_evaluating_join_condition = ContextVar(
  "_is_evaluating_join_condition", default=("default", False, [], [])
  )
+ _is_processing_order_by = ContextVar[bool]("_is_processing_order_by", default=False)
+ _is_processing_aliased_relation = ContextVar[bool](
+ "_is_processing_aliased_relation", default=False
+ )

  _sql_aggregate_function_count = ContextVar[int](
  "_contains_aggregate_function", default=0
  )

- # Context for parsing map_partitions
- _map_partitions_stack = ContextVar[int]("_map_partitions_stack", default=0)
-
  # We have to generate our own plan IDs that are different from Spark's.
  # Spark plan IDs start at 0, so pick a "big enough" number to avoid overlaps.
  _STARTING_SQL_PLAN_ID = 0x80000000
@@ -55,10 +61,26 @@ _resolving_lambda_fun = ContextVar[bool]("_resolving_lambdas", default=False)
  _current_lambda_params = ContextVar[list[str]]("_current_lambda_params", default=[])

  _is_window_enabled = ContextVar[bool]("_is_window_enabled", default=False)
- _is_in_pivot = ContextVar[bool]("_is_in_pivot", default=False)
  _is_in_udtf_context = ContextVar[bool]("_is_in_udtf_context", default=False)
  _accessing_temp_object = ContextVar[bool]("_accessing_temp_object", default=False)

+ # Thread-safe lock for JPype JClass creation to prevent access violations
+ _jpype_jclass_lock = threading.Lock()
+
+
+ @contextmanager
+ def get_jpype_jclass_lock() -> Iterator[None]:
+ """
+ Context manager that acquires the JPype JClass lock on Windows platforms.
+ On non-Windows (os.name != 'nt'), it yields without acquiring the lock.
+ """
+ if os.name == "nt":
+ with _jpype_jclass_lock:
+ yield
+ else:
+ yield
+
+
  # Lateral Column Alias helpers
  # We keep a thread-local mapping from alias name -> TypedColumn that is
  # populated incrementally while the projection list is being processed.
@@ -70,12 +92,60 @@ _lca_alias_map: ContextVar[dict[str, TypedColumn]] = ContextVar(
  default={},
  )

+ _view_process_context = ContextVar("_view_process_context", default=[])
+
+
+ @contextmanager
+ def push_processed_view(name: str):
+ _view_process_context.set(_view_process_context.get() + [name])
+ yield
+ _view_process_context.set(_view_process_context.get()[:-1])
+
+
+ def get_processed_views() -> list[str]:
+ return _view_process_context.get()
+
+
+ def register_processed_view(name: str) -> None:
+ context = _view_process_context.get()
+ context.append(name)
+ _view_process_context.set(context)
+
+
+ _request_external_tables = ContextVar[list[str]]("_used_external_tables", default=[])
+
+
+ def register_request_external_table(name: str) -> None:
+ _request_external_tables.set(_request_external_tables.get() + [name])
+
+
+ def get_request_external_tables() -> list[str]:
+ return _request_external_tables.get()
+
+
+ def clean_request_external_tables() -> None:
+ _request_external_tables.set([])
+
+
  # Context variable to track current grouping columns for grouping_id() function
  _current_grouping_columns: ContextVar[list[str]] = ContextVar(
  "_current_grouping_columns",
  default=[],
  )

+ # Context variable to capture all original_attr_name values during subquery resolution
+ # This is a stack of lists to handle nested subqueries correctly
+ _captured_attribute_names: ContextVar[list[list[str]]] = ContextVar(
+ "_captured_attribute_names",
+ default=[],
+ )
+
+ # Context variable to track if we're resolving a subquery expression
+ _is_resolving_subquery_exp: ContextVar[bool] = ContextVar(
+ "_is_resolving_subquery_exp",
+ default=False,
+ )
+

  def clear_lca_alias_map() -> None:
  _lca_alias_map.set({})
@@ -112,14 +182,56 @@ def get_current_grouping_columns() -> list[str]:
  return _current_grouping_columns.get()


- def set_session_id(value: str) -> None:
- """Set the session ID for the current context"""
- _session_id.set(value)
+ def capture_attribute_name(attr_name: str) -> None:
+ """Capture an original_attr_name during expression resolution."""
+ stack = _captured_attribute_names.get()
+ if stack:
+ stack[-1].append(attr_name)
+ _captured_attribute_names.set(stack)
+
+
+ def get_captured_attribute_names() -> list[str]:
+ """Get the list of captured attribute names from the current top of the stack."""
+ stack = _captured_attribute_names.get()
+ return stack[-1] if stack else []
+
+
+ def is_resolving_subquery_exp() -> bool:
+ """
+ Returns True if currently resolving a subquery expression.
+ """
+ return _is_resolving_subquery_exp.get()
+
+
+ @contextmanager
+ def resolving_subquery_exp():
+ """
+ Context manager that captures all original_attr_name values during subquery expression resolution.
+ Sets a flag to indicate we're in a subquery context and pushes a new list onto the stack.
+ When the context exits, pops the list from the stack.
+ """
+ stack = _captured_attribute_names.get()
+ stack.append([])
+ _captured_attribute_names.set(stack)
+ token = _is_resolving_subquery_exp.set(True)
+ try:
+ yield
+ finally:
+ stack = _captured_attribute_names.get()
+ if stack:
+ stack.pop()
+ _captured_attribute_names.set(stack)
+ _is_resolving_subquery_exp.reset(token)
+
+
+ def set_spark_session_id(value: str) -> None:
+ """Set the Spark session ID for the current context"""
+ _spark_session_id.set(value)


- def get_session_id() -> str:
- """Get the session ID for the current context."""
- return _session_id.get(None)
+ def get_spark_session_id() -> str:
+ """Get the Spark session ID for the current context."""
+ return _spark_session_id.get(None)


  def set_plan_id_map(plan_id: int, container: DataFrameContainer) -> None:
@@ -190,6 +302,66 @@ def push_evaluating_sql_scope():
  _is_evaluating_sql.set(prev)


+ def get_grouping_by_scala_udf_key() -> bool:
+ """
+ Gets the value of _grouping_by_scala_udf_key for the current context, defaults to False.
+ """
+ return _grouping_by_scala_udf_key.get()
+
+
+ @contextmanager
+ def grouping_by_scala_udf_key(value: bool):
+ """
+ Context manager that conditionally sets a flag indicating grouping by scala_udf key.
+ Only activates the flag when value=True, otherwise leaves the current context unchanged
+ """
+ prev = _grouping_by_scala_udf_key.get()
+ try:
+ if value:
+ _grouping_by_scala_udf_key.set(True)
+ yield
+ finally:
+ _grouping_by_scala_udf_key.set(prev)
+
+
+ def get_is_processing_order_by() -> bool:
+ """
+ Gets the value of _is_processing_order_by for the current context, defaults to False.
+ """
+ return _is_processing_order_by.get()
+
+
+ @contextmanager
+ def push_processing_order_by_scope():
+ """
+ Context manager that sets a flag indicating if ORDER BY expressions are being evaluated.
+ This enables optimizations like reusing already-computed UDF columns.
+ """
+ prev = _is_processing_order_by.get()
+ try:
+ _is_processing_order_by.set(True)
+ yield
+ finally:
+ _is_processing_order_by.set(prev)
+
+
+ def get_is_processing_aliased_relation() -> bool:
+ return _is_processing_aliased_relation.get()
+
+
+ @contextmanager
+ def push_processing_aliased_relation_scope(process_aliased_relation: bool):
+ """
+ Context manager that sets a flag indicating if an aliased relation is being resolved.
+ """
+ prev = _is_processing_aliased_relation.get()
+ try:
+ _is_processing_aliased_relation.set(process_aliased_relation)
+ yield
+ finally:
+ _is_processing_aliased_relation.set(prev)
+
+
  def get_is_evaluating_join_condition() -> tuple[str, bool, list, list]:
  """
  Gets the value of _is_evaluating_join_condition for the current context, defaults to False.
@@ -210,16 +382,6 @@ def push_evaluating_join_condition(join_type, left_keys, right_keys):
  _is_evaluating_join_condition.set(prev)


- @contextmanager
- def push_map_partitions():
- _map_partitions_stack.set(_map_partitions_stack.get() + 1)
- yield
-
-
- def map_partitions_depth() -> int:
- return _map_partitions_stack.get()
-
-
  @contextmanager
  def push_sql_scope():
  """
@@ -383,13 +545,14 @@ def get_outer_dataframes() -> list[DataFrameContainer]:


  def clear_context_data() -> None:
- _session_id.set(None)
+ _spark_session_id.set(None)
  _plan_id_map.set({})
  _alias_map.set({})

+ _request_external_tables.set([])
+ _view_process_context.set([])
  _next_sql_plan_id.set(_STARTING_SQL_PLAN_ID)
  _sql_plan_name_map.set({})
- _map_partitions_stack.set(0)
  _sql_aggregate_function_count.set(0)
  _sql_named_args.set({})
  _sql_pos_args.set({})
@@ -419,19 +582,6 @@ def is_window_enabled():
  return _is_window_enabled.get()


- @contextmanager
- def temporary_pivot_expression(value: bool):
- token = _is_in_pivot.set(value)
- try:
- yield
- finally:
- _is_in_pivot.reset(token)
-
-
- def is_in_pivot() -> bool:
- return _is_in_pivot.get()
-
-
  def get_is_in_udtf_context() -> bool:
  """
  Gets the value of _is_in_udtf_context for the current context, defaults to False.
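
The new helpers above (grouping_by_scala_udf_key, push_processing_order_by_scope, push_processing_aliased_relation_scope, resolving_subquery_exp) all follow the same ContextVar scope pattern: remember the previous value, set the flag for the duration of the block, and restore it in a finally clause. A self-contained sketch of that pattern, reproduced outside the package for illustration:

from contextlib import contextmanager
from contextvars import ContextVar

_is_processing_order_by: ContextVar[bool] = ContextVar(
    "_is_processing_order_by", default=False
)

@contextmanager
def push_processing_order_by_scope():
    prev = _is_processing_order_by.get()
    try:
        _is_processing_order_by.set(True)
        yield
    finally:
        # restore the previous value even if the body raises
        _is_processing_order_by.set(prev)

# usage: the flag is only True inside the scope
with push_processing_order_by_scope():
    assert _is_processing_order_by.get() is True
assert _is_processing_order_by.get() is False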
@@ -6,20 +6,28 @@ import hashlib
  import inspect
  import random
  import re
- import threading
  import time
  from typing import Any

  from snowflake import snowpark
  from snowflake.connector.cursor import ResultMetadataV2
  from snowflake.snowpark._internal.server_connection import ServerConnection
+ from snowflake.snowpark_connect.error.error_codes import ErrorCodes
+ from snowflake.snowpark_connect.error.error_utils import attach_custom_error_code
+ from snowflake.snowpark_connect.utils.concurrent import (
+ SynchronizedDict,
+ SynchronizedList,
+ )
  from snowflake.snowpark_connect.utils.snowpark_connect_logging import logger
  from snowflake.snowpark_connect.utils.telemetry import telemetry

- DESCRIBE_CACHE_TTL_SECONDS = 15
  USE_DESCRIBE_QUERY_CACHE = True

- DDL_DETECTION_PATTERN = re.compile(r"^\s*(CREATE|ALTER|DROP|RENAME)\b", re.IGNORECASE)
+ DDL_DETECTION_PATTERN = re.compile(r"\s*(CREATE|ALTER|DROP)\b", re.IGNORECASE)
+ PLAIN_CREATE_PATTERN = re.compile(
+ r"\s*CREATE\s+((LOCAL|GLOBAL)\s+)?(TRANSIENT\s+)?TABLE\b", re.IGNORECASE
+ )
+
  # Pattern for simple constant queries like: SELECT 3 :: INT AS "3-80000030-0" FROM ( SELECT $1 AS "__DUMMY" FROM VALUES (NULL :: STRING))
  # Using exact spacing pattern from generated SQL for deterministic matching
  # Column ID format: {original_name}-{8_digit_hex_plan_id}-{column_index}
@@ -32,8 +40,7 @@ SIMPLE_CONSTANT_PATTERN = re.compile(

  class DescribeQueryCache:
  def __init__(self) -> None:
- self._cache = {}
- self._lock = threading.Lock()
+ self._cache = SynchronizedDict()

  @staticmethod
  def _hash_query(sql_query: str) -> str:
@@ -48,49 +55,53 @@ class DescribeQueryCache:
  return sql_query

  def get(self, sql_query: str) -> list[ResultMetadataV2] | None:
+ from snowflake.snowpark_connect.config import get_describe_cache_ttl_seconds
+
+ telemetry.report_describe_query_cache_lookup()
+
  cache_key = self._get_cache_key(sql_query)
  key = self._hash_query(cache_key)
  current_time = time.monotonic()

- # TODO: maybe too much locking, we could use read-write lock also. Or a thread safe dictionary.
- with self._lock:
- if key in self._cache:
- result, timestamp = self._cache[key]
- if current_time < timestamp + DESCRIBE_CACHE_TTL_SECONDS:
- logger.debug(
- f"Returning query result from cache for query: {sql_query[:20]}"
- )
-
- # If this is a constant query, we need to transform the result metadata
- # to match the actual query's column name
- if (
- cache_key != sql_query
- ): # Only transform if we normalized the key
- match = SIMPLE_CONSTANT_PATTERN.match(sql_query)
- if match:
- number, column_id = match.groups()
- expected_column_name = column_id
-
- # Transform the cached result to match this query's column name
- # There should only be one column in these constant queries
- metadata = result[0]
- new_metadata = ResultMetadataV2(
- name=expected_column_name,
- type_code=metadata.type_code,
- display_size=metadata.display_size,
- internal_size=metadata.internal_size,
- precision=metadata.precision,
- scale=metadata.scale,
- is_nullable=metadata.is_nullable,
- )
- return [new_metadata]
-
- return result
- else:
- logger.debug(
- f"Had a cached entry, but it expired for query: {sql_query[:20]}"
- )
- del self._cache[key]
+ if key in self._cache:
+ result, timestamp = self._cache[key]
+
+ expired_by = current_time - (timestamp + get_describe_cache_ttl_seconds())
+ if expired_by < 0:
+ logger.debug(
+ f"Returning query result from cache for query: {sql_query[:20]}"
+ )
+ self._cache[key] = (result, current_time)
+
+ # If this is a constant query, we need to transform the result metadata
+ # to match the actual query's column name
+ if cache_key != sql_query: # Only transform if we normalized the key
+ match = SIMPLE_CONSTANT_PATTERN.match(sql_query)
+ if match:
+ number, column_id = match.groups()
+ expected_column_name = column_id
+
+ # Transform the cached result to match this query's column name
+ # There should only be one column in these constant queries
+ metadata = result[0]
+ new_metadata = ResultMetadataV2(
+ name=expected_column_name,
+ type_code=metadata.type_code,
+ display_size=metadata.display_size,
+ internal_size=metadata.internal_size,
+ precision=metadata.precision,
+ scale=metadata.scale,
+ is_nullable=metadata.is_nullable,
+ )
+
+ telemetry.report_describe_query_cache_hit()
+ return [new_metadata]
+
+ telemetry.report_describe_query_cache_hit()
+ return result
+ else:
+ telemetry.report_describe_query_cache_expired(expired_by)
+ del self._cache[key]
  return None

  def put(self, sql_query: str, result: list[ResultMetadataV2] | None) -> None:
@@ -102,12 +113,18 @@

  logger.debug(f"Putting query into cache: {sql_query[:50]}...")

- with self._lock:
- self._cache[key] = (result, time.monotonic())
+ self._cache[key] = (result, time.monotonic())

  def clear(self) -> None:
- with self._lock:
- self._cache.clear()
+ self._cache.clear()
+
+ def update_cache_for_query(self, query: str) -> None:
+ # Clear cache for DDL operations that modify existing objects (exclude CREATE TABLE)
+ if DDL_DETECTION_PATTERN.search(query) and not PLAIN_CREATE_PATTERN.search(
+ query
+ ):
+ self.clear()
+ telemetry.report_describe_query_cache_clear()


  def instrument_session_for_describe_cache(session: snowpark.Session):
@@ -118,7 +135,7 @@ def instrument_session_for_describe_cache(session: snowpark.Session):
  return

  session._describe_query_cache = DescribeQueryCache()
- session._snowpark_api_describe_calls = []
+ session._snowpark_api_describe_calls = SynchronizedList()

  def update_cache_for_query(query: str):
  cache = None
@@ -126,10 +143,7 @@ def instrument_session_for_describe_cache(session: snowpark.Session):
  if isinstance(cache_instance, DescribeQueryCache):
  cache = cache_instance

- # TODO: This is very broad right now. We should be able to reduce the scope of clearing.
- if DDL_DETECTION_PATTERN.search(query):
- logger.debug(f"DDL detected, clearing describe query cache: '{query}'")
- cache.clear()
+ cache.update_cache_for_query(query)

  def wrap_execute(wrapped_fn):
  def fn(query: str, **kwargs):
@@ -139,6 +153,7 @@ def instrument_session_for_describe_cache(session: snowpark.Session):
  telemetry.report_query(result, **kwargs)
  except Exception as e:
  telemetry.report_query(e, **kwargs)
+ attach_custom_error_code(e, ErrorCodes.INTERNAL_ERROR)
  raise e
  return result
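
Taken together, the describe-cache changes above replace the fixed 15-second TTL with a configurable one, refresh an entry's timestamp on every hit, and clear the cache only for DDL that can alter existing objects (plain CREATE TABLE is excluded). A simplified standalone sketch of that behavior; TtlCache and its fields are illustrative, not the package's class, while the two regexes are copied from the diff:

import re
import time

DDL_DETECTION_PATTERN = re.compile(r"\s*(CREATE|ALTER|DROP)\b", re.IGNORECASE)
PLAIN_CREATE_PATTERN = re.compile(
    r"\s*CREATE\s+((LOCAL|GLOBAL)\s+)?(TRANSIENT\s+)?TABLE\b", re.IGNORECASE
)

class TtlCache:
    def __init__(self, ttl_seconds: float) -> None:
        self._ttl = ttl_seconds
        self._cache: dict[str, tuple[object, float]] = {}

    def get(self, key: str):
        entry = self._cache.get(key)
        if entry is None:
            return None
        result, timestamp = entry
        expired_by = time.monotonic() - (timestamp + self._ttl)
        if expired_by < 0:
            # refresh the timestamp on every hit (sliding expiration)
            self._cache[key] = (result, time.monotonic())
            return result
        # stale entry: drop it and report a miss
        del self._cache[key]
        return None

    def put(self, key: str, result) -> None:
        self._cache[key] = (result, time.monotonic())

    def update_cache_for_query(self, query: str) -> None:
        # invalidate on DDL that can change existing objects, but not on plain CREATE TABLE
        if DDL_DETECTION_PATTERN.search(query) and not PLAIN_CREATE_PATTERN.search(query):
            self._cache.clear()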