dataproc-spark-connect: 1.0.0rc1-py2.py3-none-any.whl → 1.0.0rc3-py2.py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
--- dataproc_spark_connect-1.0.0rc1.dist-info/METADATA
+++ dataproc_spark_connect-1.0.0rc3.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 1.0.0rc1
+ Version: 1.0.0rc3
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
--- dataproc_spark_connect-1.0.0rc1.dist-info/RECORD
+++ dataproc_spark_connect-1.0.0rc3.dist-info/RECORD
@@ -1,13 +1,13 @@
- dataproc_spark_connect-1.0.0rc1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ dataproc_spark_connect-1.0.0rc3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
  google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
- google/cloud/dataproc_spark_connect/environment.py,sha256=UICy9XyqAxL-cryVWx7GZPRAxoir5LKk0dtqqY_l--c,2307
+ google/cloud/dataproc_spark_connect/environment.py,sha256=l1wWiHMHtBQ9YonE-kHTpaZlN9vLE4fyJSTn7RZP6kA,2503
  google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
  google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
- google/cloud/dataproc_spark_connect/session.py,sha256=vD9kJXJbkzfKlYt5TFp0umHi6A7ZdheSJNjrqVyL0oo,37432
+ google/cloud/dataproc_spark_connect/session.py,sha256=u2QxRLn84EMdpFnv9pI6Y7F8OFQ2mqCllb6AsLd73eo,42402
  google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
  google/cloud/dataproc_spark_connect/client/core.py,sha256=GRc4OCTBvIvdagjxOPoDO22vLtt8xDSerdREMRDeUBY,4659
  google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
- dataproc_spark_connect-1.0.0rc1.dist-info/METADATA,sha256=D1e6sjZ8-hVccXMjHwkoX0OPOOW876hzsQi25WxMbMI,3468
- dataproc_spark_connect-1.0.0rc1.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
- dataproc_spark_connect-1.0.0rc1.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
- dataproc_spark_connect-1.0.0rc1.dist-info/RECORD,,
+ dataproc_spark_connect-1.0.0rc3.dist-info/METADATA,sha256=yUMy-S__bGisjsxduDg_VmJzF3ZKqCybaNUwn5DMymo,3468
+ dataproc_spark_connect-1.0.0rc3.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+ dataproc_spark_connect-1.0.0rc3.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-1.0.0rc3.dist-info/RECORD,,
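Each RECORD row above is path,algorithm=digest,size, where the digest is the unpadded urlsafe-base64 sha256 of the file (PEP 376/427); the changed hashes and sizes for environment.py and session.py are what mark those modules as modified. A minimal sketch, not part of the package, for recomputing those digests from a downloaded wheel (the file name below is a placeholder):

    import base64, csv, hashlib, zipfile

    def verify_record(wheel_path: str) -> None:
        # Recompute each file's digest and compare it to its RECORD entry.
        with zipfile.ZipFile(wheel_path) as whl:
            record = next(n for n in whl.namelist() if n.endswith(".dist-info/RECORD"))
            for path, digest, _size in csv.reader(whl.read(record).decode().splitlines()):
                if not digest:  # RECORD lists itself with an empty hash field
                    continue
                algo, _, expected = digest.partition("=")
                actual = base64.urlsafe_b64encode(
                    hashlib.new(algo, whl.read(path)).digest()
                ).rstrip(b"=").decode()
                assert actual == expected, f"hash mismatch for {path}"

    verify_record("dataproc_spark_connect-1.0.0rc3-py2.py3-none-any.whl")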
--- google/cloud/dataproc_spark_connect/environment.py (1.0.0rc1)
+++ google/cloud/dataproc_spark_connect/environment.py (1.0.0rc3)
@@ -13,6 +13,7 @@
  # limitations under the License.

  import os
+ import sys
  from typing import Callable, Tuple, List

@@ -46,6 +47,18 @@ def is_jetbrains_ide() -> bool:
      return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()


+ def is_interactive():
+     return hasattr(sys, "ps1")
+
+
+ def is_terminal():
+     return sys.stdin.isatty()
+
+
+ def is_interactive_terminal():
+     return is_interactive() and is_terminal()
+
+
  def get_client_environment_label() -> str:
      """
      Map current environment to a standardized client label.
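For context: the interpreter defines sys.ps1 only when running interactively (the python or IPython REPL, and IPython-backed notebook kernels), while sys.stdin.isatty() is true only when stdin is attached to a terminal, so the two new helpers together separate terminal REPLs from notebooks and from batch jobs. A rough illustration of the resulting matrix, using a hypothetical describe_environment helper that is not part of the package:

    import sys

    def describe_environment() -> str:
        # python/ipython REPL in a terminal: sys.ps1 is set and stdin is a TTY
        if hasattr(sys, "ps1") and sys.stdin.isatty():
            return "interactive terminal"
        # Jupyter kernel: sys.ps1 is set, but stdin is not a TTY
        if hasattr(sys, "ps1"):
            return "interactive non-terminal (e.g. notebook)"
        # python job.py, spark-submit, cron, etc.
        return "non-interactive"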
--- google/cloud/dataproc_spark_connect/session.py (1.0.0rc1)
+++ google/cloud/dataproc_spark_connect/session.py (1.0.0rc3)
@@ -24,8 +24,11 @@ import threading
  import time
  import uuid
  import tqdm
+ from packaging import version
+ from tqdm import tqdm as cli_tqdm
+ from tqdm.notebook import tqdm as notebook_tqdm
  from types import MethodType
- from typing import Any, cast, ClassVar, Dict, Optional, Union
+ from typing import Any, cast, ClassVar, Dict, Iterable, Optional, Union

  from google.api_core import retry
  from google.api_core.client_options import ClientOptions
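Both tqdm flavors are imported so the session can pick a renderer at runtime: tqdm.notebook.tqdm draws a widget-based bar in Jupyter, while the plain tqdm.tqdm writes an ANSI bar to the terminal. For comparison only (the package does not use it), tqdm also ships an auto module that makes this choice once at import time:

    # tqdm.auto resolves to the notebook implementation when IPython/ipywidgets
    # are available and falls back to the CLI implementation otherwise.
    from tqdm.auto import tqdm

    for _ in tqdm(range(100), desc="Tasks"):
        pass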
@@ -104,12 +107,14 @@ class DataprocSparkSession(SparkSession):
      """

      _DEFAULT_RUNTIME_VERSION = "3.0"
+     _MIN_RUNTIME_VERSION = "3.0"

      _active_s8s_session_uuid: ClassVar[Optional[str]] = None
      _project_id = None
      _region = None
      _client_options = None
      _active_s8s_session_id: ClassVar[Optional[str]] = None
+     _execution_progress_bar = dict()

      class Builder(SparkSession.Builder):

@@ -247,7 +252,11 @@ class DataprocSparkSession(SparkSession):
              assert self._channel_builder is not None
              session = DataprocSparkSession(connection=self._channel_builder)

+             # Register handler for Cell Execution Progress bar
+             session._register_progress_execution_handler()
+
              DataprocSparkSession._set_default_and_active_session(session)
+
              return session

          def __create(self) -> "DataprocSparkSession":
@@ -262,6 +271,9 @@ class DataprocSparkSession(SparkSession):

              dataproc_config: Session = self._get_dataproc_config()

+             # Check runtime version compatibility before creating session
+             self._check_runtime_compatibility(dataproc_config)
+
              session_id = self.generate_dataproc_session_id()
              dataproc_config.name = f"projects/{self._project_id}/locations/{self._region}/sessions/{session_id}"
              logger.debug(
@@ -593,6 +605,43 @@ class DataprocSparkSession(SparkSession):
                  stacklevel=3,
              )

+         def _check_runtime_compatibility(self, dataproc_config):
+             """Check that this client supports the configured runtime version.
+
+             Runtime version 3.0 clients do not support older (pre-3.0) runtime
+             versions; there is no backward or forward compatibility between
+             different runtime versions.
+
+             Args:
+                 dataproc_config: The Session configuration containing the runtime version
+
+             Raises:
+                 DataprocSparkConnectException: If the server uses a pre-3.0 runtime version
+             """
+             runtime_version = dataproc_config.runtime_config.version
+
+             if not runtime_version:
+                 return
+
+             logger.debug(f"Detected server runtime version: {runtime_version}")
+
+             # Parse the runtime version to check whether it is below the
+             # minimum supported version
+             try:
+                 server_version = version.parse(runtime_version)
+                 min_version = version.parse(
+                     DataprocSparkSession._MIN_RUNTIME_VERSION
+                 )
+
+                 if server_version < min_version:
+                     raise DataprocSparkConnectException(
+                         f"Dataproc Runtime version {runtime_version} is not supported; "
+                         f"use version {DataprocSparkSession._MIN_RUNTIME_VERSION} or higher."
+                     )
+             except version.InvalidVersion:
+                 # If we can't parse the version, log a warning but continue
+                 logger.warning(
+                     f"Could not parse runtime version: {runtime_version}"
+                 )
+
          def _display_view_session_details_button(self, session_id):
              try:
                  session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
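The check leans on packaging.version.parse, which implements PEP 440 ordering, so multi-component and pre-release versions compare correctly rather than lexicographically. A quick illustration of the orderings involved (assuming a recent packaging release, where unparseable strings raise InvalidVersion instead of becoming LegacyVersion):

    from packaging import version

    assert version.parse("2.3") < version.parse("3.0")    # pre-3.0 runtimes are rejected
    assert version.parse("3.0") >= version.parse("3.0")   # the minimum version passes
    assert version.parse("3.10") > version.parse("3.9")   # not a string comparison
    try:
        version.parse("latest")
    except version.InvalidVersion:
        print("unparseable version: logged as a warning, session creation continues")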
@@ -682,6 +731,78 @@ class DataprocSparkSession(SparkSession):
              execute_and_fetch_as_iterator_wrapped_method, self.client
          )

+         # Patch clearProgressHandlers so it does not remove the Dataproc
+         # progress handler
+         clearProgressHandlers_base_method = self.clearProgressHandlers
+
+         def clearProgressHandlers_wrapper_method(_, *args, **kwargs):
+             clearProgressHandlers_base_method(*args, **kwargs)
+
+             self._register_progress_execution_handler()
+
+         self.clearProgressHandlers = MethodType(
+             clearProgressHandlers_wrapper_method, self
+         )
+
+     def _register_progress_execution_handler(self):
+         from pyspark.sql.connect.shell.progress import StageInfo
+
+         def handler(
+             stages: Optional[Iterable[StageInfo]],
+             inflight_tasks: int,
+             operation_id: Optional[str],
+             done: bool,
+         ):
+             if operation_id is None:
+                 return
+
+             # Don't build or render the progress bar in non-interactive runs
+             # (whether IPython-based or not)
+             if not environment.is_interactive():
+                 return
+
+             total_tasks = 0
+             completed_tasks = 0
+
+             for stage in stages or []:
+                 total_tasks += stage.num_tasks
+                 completed_tasks += stage.num_completed_tasks
+
+             tqdm_pbar = notebook_tqdm
+             if environment.is_interactive_terminal():
+                 tqdm_pbar = cli_tqdm
+
+             # Use a lock to ensure only one thread can access and modify
+             # the shared dictionaries at a time.
+             with self._lock:
+                 if operation_id in self._execution_progress_bar:
+                     pbar = self._execution_progress_bar[operation_id]
+                     if pbar.total != total_tasks:
+                         # Forces the bar percentage to reset on next refresh
+                         pbar.reset(total=total_tasks)
+                 else:
+                     pbar = tqdm_pbar(
+                         total=total_tasks,
+                         leave=True,
+                         dynamic_ncols=True,
+                         bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} Tasks",
+                     )
+                     self._execution_progress_bar[operation_id] = pbar
+
+                 # Snap to the total on completion: the StageInfo proto does
+                 # not report skipped or failed task counts.
+                 if done and completed_tasks < total_tasks:
+                     completed_tasks = total_tasks
+
+                 pbar.n = completed_tasks
+                 pbar.refresh()
+
+                 if done:
+                     pbar.close()
+                     self._execution_progress_bar.pop(operation_id, None)
+
+         self.registerProgressHandler(handler)
+
      @staticmethod
      def _sql_lazy_transformation(req):
          # Select SQL command
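registerProgressHandler is the Spark Connect hook used above: the session invokes the callback with (stages, inflight_tasks, operation_id, done) as progress messages stream in, and the handler folds them into one bar per operation_id. A stripped-down sketch of the same pattern, assuming an existing Spark Connect session named spark (locking and environment checks omitted):

    from tqdm import tqdm

    bars = {}  # one bar per operation_id, as in the patched session

    def handler(stages, inflight_tasks, operation_id, done):
        total = sum(s.num_tasks for s in stages or [])
        completed = sum(s.num_completed_tasks for s in stages or [])
        if operation_id not in bars:
            bars[operation_id] = tqdm(
                total=total, bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} Tasks"
            )
        bar = bars[operation_id]
        bar.n = total if done else completed  # snap to 100% even if tasks were skipped
        bar.refresh()
        if done:
            bar.close()
            bars.pop(operation_id, None)

    spark.registerProgressHandler(handler)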
@@ -711,6 +832,11 @@ class DataprocSparkSession(SparkSession):
          """

      def _display_operation_link(self, operation_id: str):
+         # Don't print the per-operation Spark UI link in non-interactive
+         # runs (whether IPython-based or not)
+         if not environment.is_interactive():
+             return
+
          assert all(
              [
                  operation_id is not None,
@@ -726,12 +852,13 @@ class DataprocSparkSession(SparkSession):
              f"associatedSqlOperationId={operation_id}?project={self._project_id}"
          )

+         if environment.is_interactive_terminal():
+             print(f"Spark Query: {url}")
+             return
+
          try:
              from IPython.display import display, HTML
-             from IPython.core.interactiveshell import InteractiveShell

-             if not InteractiveShell.initialized():
-                 return

              html_element = f"""
                  <div>
                      <p><a href="{url}">Spark Query</a> (Operation: {operation_id})</p>
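Taken together, the method now dispatches on the environment helpers: nothing in batch runs, a plain printed URL in terminal REPLs, and an IPython HTML link otherwise; the old InteractiveShell.initialized() guard is subsumed by the is_interactive() check at the top of the method. A condensed sketch of that dispatch (standalone, with the environment checks inlined and the URL a placeholder):

    import sys

    def display_link(url: str) -> None:
        if not hasattr(sys, "ps1"):    # non-interactive: emit nothing
            return
        if sys.stdin.isatty():         # interactive terminal: plain text
            print(f"Spark Query: {url}")
            return
        # interactive but not a terminal (e.g. a notebook): clickable HTML
        from IPython.display import HTML, display
        display(HTML(f'<a href="{url}">Spark Query</a>'))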