dataproc-spark-connect 1.0.0rc2__py2.py3-none-any.whl → 1.0.0rc4__py2.py3-none-any.whl

This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
--- dataproc_spark_connect-1.0.0rc2.dist-info/METADATA
+++ dataproc_spark_connect-1.0.0rc4.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 1.0.0rc2
+Version: 1.0.0rc4
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
--- dataproc_spark_connect-1.0.0rc2.dist-info/RECORD
+++ dataproc_spark_connect-1.0.0rc4.dist-info/RECORD
@@ -1,13 +1,13 @@
-dataproc_spark_connect-1.0.0rc2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+dataproc_spark_connect-1.0.0rc4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
-google/cloud/dataproc_spark_connect/environment.py,sha256=l1wWiHMHtBQ9YonE-kHTpaZlN9vLE4fyJSTn7RZP6kA,2503
+google/cloud/dataproc_spark_connect/environment.py,sha256=H4KcT-_X64oKlQ9vFhfoRSh5JrmyHgFGCeo8UOAztiM,2678
 google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
 google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
-google/cloud/dataproc_spark_connect/session.py,sha256=FdJI_F9k6EfIvlgC1-f-Qb_Uwg9SmkIyWhpNZlqGQhw,40405
+google/cloud/dataproc_spark_connect/session.py,sha256=RDOnjzhyI1bI_Hf00bddGcBXOnuIHzv7AhlK8fFvYIY,50873
 google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
 google/cloud/dataproc_spark_connect/client/core.py,sha256=GRc4OCTBvIvdagjxOPoDO22vLtt8xDSerdREMRDeUBY,4659
 google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
-dataproc_spark_connect-1.0.0rc2.dist-info/METADATA,sha256=o2vfu5NRn2Pb0N7cavrBm2OLwP_LXQBVrclNjEtb9Do,3468
-dataproc_spark_connect-1.0.0rc2.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
-dataproc_spark_connect-1.0.0rc2.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
-dataproc_spark_connect-1.0.0rc2.dist-info/RECORD,,
+dataproc_spark_connect-1.0.0rc4.dist-info/METADATA,sha256=bXOK3plGsUla_ugMQjJ9GDiQz1qey0GcApVg4yjls4k,3468
+dataproc_spark_connect-1.0.0rc4.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+dataproc_spark_connect-1.0.0rc4.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+dataproc_spark_connect-1.0.0rc4.dist-info/RECORD,,
--- google/cloud/dataproc_spark_connect/environment.py
+++ google/cloud/dataproc_spark_connect/environment.py
@@ -48,7 +48,15 @@ def is_jetbrains_ide() -> bool:
 
 
 def is_interactive():
-    return hasattr(sys, "ps1")
+    try:
+        from IPython import get_ipython
+
+        if get_ipython() is not None:
+            return True
+    except ImportError:
+        pass
+
+    return hasattr(sys, "ps1") or sys.flags.interactive
 
 
 def is_terminal():
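The broadened check now recognizes three interactivity signals in order: a live IPython shell, a standard REPL prompt (`sys.ps1`), and the `-i` interpreter flag. A minimal standalone sketch of the same detection order (the `probe` helper is hypothetical, for illustration only):

```python
import sys

def probe() -> str:
    # Hypothetical helper mirroring the detection order in is_interactive()
    try:
        from IPython import get_ipython  # only importable if IPython is installed
        if get_ipython() is not None:
            return "ipython"             # notebook or IPython kernel
    except ImportError:
        pass
    if hasattr(sys, "ps1"):
        return "repl"                    # plain interactive interpreter
    if sys.flags.interactive:
        return "python -i"               # script executed with the -i flag
    return "non-interactive"             # e.g. batch job or spark-submit
```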
--- google/cloud/dataproc_spark_connect/session.py
+++ google/cloud/dataproc_spark_connect/session.py
@@ -24,6 +24,7 @@ import threading
 import time
 import uuid
 import tqdm
+from packaging import version
 from tqdm import tqdm as cli_tqdm
 from tqdm.notebook import tqdm as notebook_tqdm
 from types import MethodType
@@ -45,6 +46,7 @@ from google.cloud.dataproc_spark_connect.pypi_artifacts import PyPiArtifacts
 from google.cloud.dataproc_v1 import (
     AuthenticationConfig,
     CreateSessionRequest,
+    DeleteSessionRequest,
     GetSessionRequest,
     Session,
     SessionControllerClient,
@@ -86,6 +88,22 @@ def _is_valid_label_value(value: str) -> bool:
     return bool(re.match(pattern, value))
 
 
+def _is_valid_session_id(session_id: str) -> bool:
+    """
+    Validates if a string complies with Google Cloud session ID format.
+    - Must be 4-63 characters
+    - Only lowercase letters, numbers, and dashes are allowed
+    - Must start with a lowercase letter
+    - Cannot end with a dash
+    """
+    if not session_id:
+        return False
+
+    # The pattern is sufficient for validation and already enforces length constraints.
+    pattern = r"^[a-z][a-z0-9-]{2,61}[a-z0-9]$"
+    return bool(re.match(pattern, session_id))
+
+
 class DataprocSparkSession(SparkSession):
     """The entry point to programming Spark with the Dataset and DataFrame API.
 
@@ -106,12 +124,14 @@ class DataprocSparkSession(SparkSession):
     """
 
     _DEFAULT_RUNTIME_VERSION = "3.0"
+    _MIN_RUNTIME_VERSION = "3.0"
 
     _active_s8s_session_uuid: ClassVar[Optional[str]] = None
     _project_id = None
     _region = None
     _client_options = None
     _active_s8s_session_id: ClassVar[Optional[str]] = None
+    _active_session_uses_custom_id: ClassVar[bool] = False
    _execution_progress_bar = dict()
 
     class Builder(SparkSession.Builder):
@@ -120,6 +140,7 @@ class DataprocSparkSession(SparkSession):
             self._options: Dict[str, Any] = {}
             self._channel_builder: Optional[DataprocChannelBuilder] = None
             self._dataproc_config: Optional[Session] = None
+            self._custom_session_id: Optional[str] = None
             self._project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
             self._region = os.getenv("GOOGLE_CLOUD_REGION")
             self._client_options = ClientOptions(
@@ -128,6 +149,18 @@
                     f"{self._region}-dataproc.googleapis.com",
                 )
             )
+            self._session_controller_client: Optional[
+                SessionControllerClient
+            ] = None
+
+        @property
+        def session_controller_client(self) -> SessionControllerClient:
+            """Get or create a SessionControllerClient instance."""
+            if self._session_controller_client is None:
+                self._session_controller_client = SessionControllerClient(
+                    client_options=self._client_options
+                )
+            return self._session_controller_client
 
         def projectId(self, project_id):
             self._project_id = project_id
@@ -141,6 +174,35 @@
             )
             return self
 
+        def dataprocSessionId(self, session_id: str):
+            """
+            Set a custom session ID for creating or reusing sessions.
+
+            The session ID must:
+            - Be 4-63 characters long
+            - Start with a lowercase letter
+            - Contain only lowercase letters, numbers, and hyphens
+            - Not end with a hyphen
+
+            Args:
+                session_id: The custom session ID to use
+
+            Returns:
+                This Builder instance for method chaining
+
+            Raises:
+                ValueError: If the session ID format is invalid
+            """
+            if not _is_valid_session_id(session_id):
+                raise ValueError(
+                    f"Invalid session ID: '{session_id}'. "
+                    "Session ID must be 4-63 characters, start with a lowercase letter, "
+                    "contain only lowercase letters, numbers, and hyphens, "
+                    "and not end with a hyphen."
+                )
+            self._custom_session_id = session_id
+            return self
+
         def dataprocSessionConfig(self, dataproc_config: Session):
             self._dataproc_config = dataproc_config
             for k, v in dataproc_config.runtime_config.properties.items():
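Combined with the `getOrCreate()` changes below, this enables named sessions that several clients can share. A hedged usage sketch, assuming the usual `builder` entry point inherited from SparkSession; the project, region, and session names are placeholders, and the Builder reads project and region from the environment variables shown in `__init__` above:

```python
import os
from google.cloud.dataproc_spark_connect import DataprocSparkSession

os.environ["GOOGLE_CLOUD_PROJECT"] = "my-project"   # placeholder
os.environ["GOOGLE_CLOUD_REGION"] = "us-central1"   # placeholder

# The first caller creates "team-etl-dev"; later callers attach to it
# while it is still ACTIVE or CREATING.
spark = (
    DataprocSparkSession.builder
    .dataprocSessionId("team-etl-dev")  # validated against the pattern above
    .getOrCreate()
)

# Invalid IDs fail fast, before any API call is made:
# DataprocSparkSession.builder.dataprocSessionId("Bad_ID")  # raises ValueError
```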
@@ -254,6 +316,7 @@
             session._register_progress_execution_handler()
 
             DataprocSparkSession._set_default_and_active_session(session)
+
             return session
 
         def __create(self) -> "DataprocSparkSession":
@@ -268,7 +331,16 @@
 
             dataproc_config: Session = self._get_dataproc_config()
 
-            session_id = self.generate_dataproc_session_id()
+            # Check runtime version compatibility before creating session
+            self._check_runtime_compatibility(dataproc_config)
+
+            # Use custom session ID if provided, otherwise generate one
+            session_id = (
+                self._custom_session_id
+                if self._custom_session_id
+                else self.generate_dataproc_session_id()
+            )
+
             dataproc_config.name = f"projects/{self._project_id}/locations/{self._region}/sessions/{session_id}"
             logger.debug(
                 f"Dataproc Session configuration:\n{dataproc_config}"
@@ -283,6 +355,10 @@
 
             logger.debug("Creating Dataproc Session")
             DataprocSparkSession._active_s8s_session_id = session_id
+            # Track whether this session uses a custom ID (unmanaged) or auto-generated ID (managed)
+            DataprocSparkSession._active_session_uses_custom_id = (
+                self._custom_session_id is not None
+            )
             s8s_creation_start_time = time.time()
 
             stop_create_session_pbar_event = threading.Event()
@@ -373,6 +449,7 @@
                 if create_session_pbar_thread.is_alive():
                     create_session_pbar_thread.join()
                 DataprocSparkSession._active_s8s_session_id = None
+                DataprocSparkSession._active_session_uses_custom_id = False
                 raise DataprocSparkConnectException(
                     f"Error while creating Dataproc Session: {e.message}"
                 )
@@ -381,6 +458,7 @@
                 if create_session_pbar_thread.is_alive():
                     create_session_pbar_thread.join()
                 DataprocSparkSession._active_s8s_session_id = None
+                DataprocSparkSession._active_session_uses_custom_id = False
                 raise RuntimeError(
                     f"Error while creating Dataproc Session"
                 ) from e
@@ -422,16 +500,19 @@
             :param html_element: HTML element to display for interactive IPython
                 environment
             """
+            # Don't print any output (rich or plain) in non-interactive environments
+            if not environment.is_interactive():
+                return
+
+            if environment.is_interactive_terminal():
+                print(plain_message)
+                return
+
             try:
                 from IPython.display import display, HTML
-                from IPython.core.interactiveshell import InteractiveShell
 
-                if not InteractiveShell.initialized():
-                    raise DataprocSparkConnectException(
-                        "Not in an Interactive IPython Environment"
-                    )
                 display(HTML(html_element))
-            except (ImportError, DataprocSparkConnectException):
+            except ImportError:
                 print(plain_message)
 
         def _get_exiting_active_session(
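The display path now degrades through three tiers instead of raising internally. An illustrative standalone sketch of the tiers (`show` is a hypothetical helper, not part of the package):

```python
def show(html: str, plain: str) -> None:
    # Hypothetical mirror of the tiered output logic above
    if not environment.is_interactive():
        return                           # batch jobs: print nothing at all
    if environment.is_interactive_terminal():
        print(plain)                     # terminal REPLs: plain text only
        return
    try:
        from IPython.display import display, HTML
        display(HTML(html))              # notebooks: rich HTML output
    except ImportError:
        print(plain)                     # interactive, but IPython not installed
```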
@@ -471,11 +552,30 @@
 
         def getOrCreate(self) -> "DataprocSparkSession":
             with DataprocSparkSession._lock:
+                # Handle a custom session ID by setting it early and letting the existing logic take over
+                if self._custom_session_id:
+                    self._handle_custom_session_id()
+
                 session = self._get_exiting_active_session()
                 if session is None:
                     session = self.__create()
                 return session
 
+        def _handle_custom_session_id(self):
+            """Handle a custom session ID by checking whether it exists and setting _active_s8s_session_id."""
+            session_response = self._get_session_by_id(self._custom_session_id)
+            if session_response is not None:
+                # Found an active session with the custom ID; set it as the active session
+                DataprocSparkSession._active_s8s_session_id = (
+                    self._custom_session_id
+                )
+                # Mark that this session uses a custom ID
+                DataprocSparkSession._active_session_uses_custom_id = True
+            else:
+                # No existing session found; clear any stale active session ID
+                # so a new session is created with the custom ID
+                DataprocSparkSession._active_s8s_session_id = None
+
         def _get_dataproc_config(self):
             # Use the property to ensure we always have a config
             dataproc_config = self.dataproc_config
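The resulting `getOrCreate()` decision flow for a custom ID, restated as simplified, hypothetical Python (names are illustrative, not the package's code):

```python
# Simplified restatement of getOrCreate() when a custom session ID is set
def get_or_create(builder):
    if builder.custom_session_id:
        existing = builder.get_session_by_id(builder.custom_session_id)
        if existing is not None:
            # ACTIVE or CREATING: attach to the running session
            builder.active_session_id = builder.custom_session_id
        else:
            # Missing or terminated: clear state so a fresh session
            # is created under the same custom ID
            builder.active_session_id = None
    session = builder.get_existing_active_session()
    return session if session is not None else builder.create()
```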
@@ -599,7 +699,52 @@
                     stacklevel=3,
                 )
 
+        def _check_runtime_compatibility(self, dataproc_config):
+            """Check if a runtime version 3.0 client is compatible with the configured runtime version.
+
+            Runtime version 3.0 clients do not support older (pre-3.0) runtime versions.
+            There is no backward or forward compatibility between different runtime versions.
+
+            Args:
+                dataproc_config: The Session configuration containing the runtime version
+
+            Raises:
+                DataprocSparkConnectException: If the server is using a pre-3.0 runtime version
+            """
+            runtime_version = dataproc_config.runtime_config.version
+
+            if not runtime_version:
+                return
+
+            logger.debug(f"Detected server runtime version: {runtime_version}")
+
+            # Parse the runtime version to check whether it's below the minimum supported version
+            try:
+                server_version = version.parse(runtime_version)
+                min_version = version.parse(
+                    DataprocSparkSession._MIN_RUNTIME_VERSION
+                )
+
+                if server_version < min_version:
+                    raise DataprocSparkConnectException(
+                        f"Specified {runtime_version} Dataproc Runtime version is not supported, "
+                        f"use {DataprocSparkSession._MIN_RUNTIME_VERSION} version or higher."
+                    )
+            except version.InvalidVersion:
+                # If the version can't be parsed, log a warning but continue
+                logger.warning(
+                    f"Could not parse runtime version: {runtime_version}"
+                )
+
         def _display_view_session_details_button(self, session_id):
+            # The details button is only supported in Colab Enterprise
+            if not environment.is_colab_enterprise():
+                return
+
+            # Skip the button for Colab Enterprise IPython terminals
+            if environment.is_interactive_terminal():
+                return
+
             try:
                 session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
                 from IPython.core.interactiveshell import InteractiveShell
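The comparison uses `packaging.version` ordering, which handles numeric and pre-release components correctly where plain string comparison would not:

```python
from packaging import version

assert version.parse("2.2") < version.parse("3.0")    # pre-3.0: rejected
assert version.parse("3.0") >= version.parse("3.0")   # minimum version: accepted
assert version.parse("3.10") > version.parse("3.9")   # numeric, not lexicographic

# Strings that aren't valid PEP 440 versions raise InvalidVersion,
# which _check_runtime_compatibility logs and tolerates
try:
    version.parse("not-a-version")
except version.InvalidVersion:
    pass
```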
@@ -615,6 +760,90 @@
             except ImportError as e:
                 logger.debug(f"Import error: {e}")
 
+        def _get_session_by_id(self, session_id: str) -> Optional[Session]:
+            """
+            Get existing session by ID.
+
+            Returns:
+                Session if ACTIVE/CREATING, None if not found or not usable
+            """
+            session_name = f"projects/{self._project_id}/locations/{self._region}/sessions/{session_id}"
+
+            try:
+                get_request = GetSessionRequest(name=session_name)
+                session = self.session_controller_client.get_session(
+                    get_request
+                )
+
+                logger.debug(
+                    f"Found existing session {session_id} in state: {session.state}"
+                )
+
+                if session.state in [
+                    Session.State.ACTIVE,
+                    Session.State.CREATING,
+                ]:
+                    # Reuse the active session
+                    logger.info(f"Reusing existing session: {session_id}")
+                    return session
+                else:
+                    # Session exists but is not usable (terminated/failed/terminating)
+                    logger.info(
+                        f"Session {session_id} in {session.state.name} state, cannot reuse"
+                    )
+                    return None
+
+            except NotFound:
+                # Session doesn't exist, can create new one
+                logger.debug(
+                    f"Session {session_id} not found, can create new one"
+                )
+                return None
+            except Exception as e:
+                logger.error(f"Error checking session {session_id}: {e}")
+                return None
+
+        def _delete_session(self, session_name: str):
+            """Delete a session to free up the session ID for reuse."""
+            try:
+                delete_request = DeleteSessionRequest(name=session_name)
+                self.session_controller_client.delete_session(delete_request)
+                logger.debug(f"Deleted session: {session_name}")
+            except NotFound:
+                logger.debug(f"Session already deleted: {session_name}")
+
+        def _wait_for_termination(self, session_name: str, timeout: int = 180):
+            """Wait for a session to finish terminating."""
+            start_time = time.time()
+
+            while time.time() - start_time < timeout:
+                try:
+                    get_request = GetSessionRequest(name=session_name)
+                    session = self.session_controller_client.get_session(
+                        get_request
+                    )
+
+                    if session.state in [
+                        Session.State.TERMINATED,
+                        Session.State.FAILED,
+                    ]:
+                        return
+                    elif session.state != Session.State.TERMINATING:
+                        # Session is in an unexpected state
+                        logger.warning(
+                            f"Session {session_name} in unexpected state while waiting for termination: {session.state}"
+                        )
+                        return
+
+                    time.sleep(2)
+                except NotFound:
+                    # Session was deleted
+                    return
+
+            logger.warning(
+                f"Timeout waiting for session {session_name} to terminate"
+            )
+
         @staticmethod
         def generate_dataproc_session_id():
             timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
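`_wait_for_termination` is a standard poll-with-deadline loop. A generic sketch of the same pattern under the same timing assumptions (2-second poll interval, 180-second default deadline; the names are hypothetical):

```python
import time

def wait_until(predicate, timeout: float = 180.0, interval: float = 2.0) -> bool:
    """Poll `predicate` every `interval` seconds until it returns True or `timeout` elapses."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return False  # timed out; the caller decides whether that is an error

# e.g. wait_until(lambda: get_state(name) in ("TERMINATED", "FAILED"))
```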
@@ -789,6 +1018,11 @@
         """
 
     def _display_operation_link(self, operation_id: str):
+        # Don't print the per-operation Spark UI link in non-interactive
+        # environments (IPython or otherwise)
+        if not environment.is_interactive():
+            return
+
         assert all(
             [
                 operation_id is not None,
@@ -804,12 +1038,13 @@
             f"associatedSqlOperationId={operation_id}?project={self._project_id}"
         )
 
+        if environment.is_interactive_terminal():
+            print(f"Spark Query: {url}")
+            return
+
         try:
             from IPython.display import display, HTML
-            from IPython.core.interactiveshell import InteractiveShell
 
-            if not InteractiveShell.initialized():
-                return
             html_element = f"""
                 <div>
                     <p><a href="{url}">Spark Query</a> (Operation: {operation_id})</p>
@@ -893,16 +1128,29 @@
     def stop(self) -> None:
         with DataprocSparkSession._lock:
             if DataprocSparkSession._active_s8s_session_id is not None:
-                terminate_s8s_session(
-                    DataprocSparkSession._project_id,
-                    DataprocSparkSession._region,
-                    DataprocSparkSession._active_s8s_session_id,
-                    self._client_options,
-                )
+                # Check whether this is a managed session (auto-generated ID) or an unmanaged one (custom ID)
+                if DataprocSparkSession._active_session_uses_custom_id:
+                    # Unmanaged session (custom ID): only clean up client-side state.
+                    # Don't terminate it, as it might be in use by other notebooks or clients.
+                    logger.debug(
+                        f"Stopping unmanaged session {DataprocSparkSession._active_s8s_session_id} without termination"
+                    )
+                else:
+                    # Managed session (auto-generated ID): keep the original behavior and terminate
+                    logger.debug(
+                        f"Terminating managed session {DataprocSparkSession._active_s8s_session_id}"
+                    )
+                    terminate_s8s_session(
+                        DataprocSparkSession._project_id,
+                        DataprocSparkSession._region,
+                        DataprocSparkSession._active_s8s_session_id,
+                        self._client_options,
+                    )
 
             self._remove_stopped_session_from_file()
             DataprocSparkSession._active_s8s_session_uuid = None
             DataprocSparkSession._active_s8s_session_id = None
+            DataprocSparkSession._active_session_uses_custom_id = False
             DataprocSparkSession._project_id = None
             DataprocSparkSession._region = None
             DataprocSparkSession._client_options = None
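The observable difference between the two cases, sketched as hedged usage (the custom session ID is a placeholder):

```python
# Managed session: auto-generated ID, so stop() terminates it server-side
spark = DataprocSparkSession.builder.getOrCreate()
spark.stop()  # terminate_s8s_session(...) is called

# Unmanaged session: custom ID, so stop() only clears local client state;
# other notebooks attached to "team-etl-dev" keep working
spark = (
    DataprocSparkSession.builder
    .dataprocSessionId("team-etl-dev")  # placeholder ID
    .getOrCreate()
)
spark.stop()  # no terminate call; the session stays ACTIVE server-side
```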