dataproc-spark-connect 1.0.0rc3__tar.gz → 1.0.0rc5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/PKG-INFO +1 -1
  2. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/dataproc_spark_connect.egg-info/PKG-INFO +1 -1
  3. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/google/cloud/dataproc_spark_connect/environment.py +9 -1
  4. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/google/cloud/dataproc_spark_connect/session.py +235 -22
  5. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/setup.py +1 -1
  6. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/LICENSE +0 -0
  7. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/README.md +0 -0
  8. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/dataproc_spark_connect.egg-info/SOURCES.txt +0 -0
  9. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
  10. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/dataproc_spark_connect.egg-info/requires.txt +0 -0
  11. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
  12. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
  13. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
  14. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/google/cloud/dataproc_spark_connect/client/core.py +0 -0
  15. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
  16. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/google/cloud/dataproc_spark_connect/exceptions.py +0 -0
  17. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
  18. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/pyproject.toml +0 -0
  19. {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc5}/setup.cfg +0 -0
--- dataproc_spark_connect-1.0.0rc3/PKG-INFO
+++ dataproc_spark_connect-1.0.0rc5/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 1.0.0rc3
+Version: 1.0.0rc5
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
--- dataproc_spark_connect-1.0.0rc3/dataproc_spark_connect.egg-info/PKG-INFO
+++ dataproc_spark_connect-1.0.0rc5/dataproc_spark_connect.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 1.0.0rc3
+Version: 1.0.0rc5
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
--- dataproc_spark_connect-1.0.0rc3/google/cloud/dataproc_spark_connect/environment.py
+++ dataproc_spark_connect-1.0.0rc5/google/cloud/dataproc_spark_connect/environment.py
@@ -48,7 +48,15 @@ def is_jetbrains_ide() -> bool:
 
 
 def is_interactive():
-    return hasattr(sys, "ps1")
+    try:
+        from IPython import get_ipython
+
+        if get_ipython() is not None:
+            return True
+    except ImportError:
+        pass
+
+    return hasattr(sys, "ps1") or sys.flags.interactive
 
 
 def is_terminal():
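
For context, here is a small standalone sketch (not part of the package; the helper name is illustrative) of what the broadened check now treats as interactive: an IPython/Jupyter kernel, the plain REPL (which sets sys.ps1), or a python -i run (which sets the interactive flag).

    import sys

    def looks_interactive() -> bool:
        # Mirrors the new environment.is_interactive(): check for IPython first,
        # then fall back to the classic REPL indicators.
        try:
            from IPython import get_ipython
            if get_ipython() is not None:  # running inside an IPython/Jupyter shell
                return True
        except ImportError:
            pass  # IPython not installed; rely on the stdlib signals below
        return hasattr(sys, "ps1") or bool(sys.flags.interactive)

    print(looks_interactive())  # False when run as a plain script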
--- dataproc_spark_connect-1.0.0rc3/google/cloud/dataproc_spark_connect/session.py
+++ dataproc_spark_connect-1.0.0rc5/google/cloud/dataproc_spark_connect/session.py
@@ -46,6 +46,7 @@ from google.cloud.dataproc_spark_connect.pypi_artifacts import PyPiArtifacts
 from google.cloud.dataproc_v1 import (
     AuthenticationConfig,
     CreateSessionRequest,
+    DeleteSessionRequest,
     GetSessionRequest,
     Session,
     SessionControllerClient,
@@ -87,6 +88,22 @@ def _is_valid_label_value(value: str) -> bool:
     return bool(re.match(pattern, value))
 
 
+def _is_valid_session_id(session_id: str) -> bool:
+    """
+    Validates if a string complies with Google Cloud session ID format.
+    - Must be 4-63 characters
+    - Only lowercase letters, numbers, and dashes are allowed
+    - Must start with a lowercase letter
+    - Cannot end with a dash
+    """
+    if not session_id:
+        return False
+
+    # The pattern is sufficient for validation and already enforces length constraints.
+    pattern = r"^[a-z][a-z0-9-]{2,61}[a-z0-9]$"
+    return bool(re.match(pattern, session_id))
+
+
 class DataprocSparkSession(SparkSession):
     """The entry point to programming Spark with the Dataset and DataFrame API.
 
@@ -114,6 +131,7 @@ class DataprocSparkSession(SparkSession):
     _region = None
     _client_options = None
     _active_s8s_session_id: ClassVar[Optional[str]] = None
+    _active_session_uses_custom_id: ClassVar[bool] = False
     _execution_progress_bar = dict()
 
     class Builder(SparkSession.Builder):
@@ -122,6 +140,7 @@ class DataprocSparkSession(SparkSession):
             self._options: Dict[str, Any] = {}
             self._channel_builder: Optional[DataprocChannelBuilder] = None
             self._dataproc_config: Optional[Session] = None
+            self._custom_session_id: Optional[str] = None
             self._project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
             self._region = os.getenv("GOOGLE_CLOUD_REGION")
             self._client_options = ClientOptions(
@@ -130,6 +149,18 @@
                     f"{self._region}-dataproc.googleapis.com",
                 )
             )
+            self._session_controller_client: Optional[
+                SessionControllerClient
+            ] = None
+
+        @property
+        def session_controller_client(self) -> SessionControllerClient:
+            """Get or create a SessionControllerClient instance."""
+            if self._session_controller_client is None:
+                self._session_controller_client = SessionControllerClient(
+                    client_options=self._client_options
+                )
+            return self._session_controller_client
 
         def projectId(self, project_id):
             self._project_id = project_id
@@ -143,6 +174,35 @@
             )
             return self
 
+        def dataprocSessionId(self, session_id: str):
+            """
+            Set a custom session ID for creating or reusing sessions.
+
+            The session ID must:
+            - Be 4-63 characters long
+            - Start with a lowercase letter
+            - Contain only lowercase letters, numbers, and hyphens
+            - Not end with a hyphen
+
+            Args:
+                session_id: The custom session ID to use
+
+            Returns:
+                This Builder instance for method chaining
+
+            Raises:
+                ValueError: If the session ID format is invalid
+            """
+            if not _is_valid_session_id(session_id):
+                raise ValueError(
+                    f"Invalid session ID: '{session_id}'. "
+                    "Session ID must be 4-63 characters, start with a lowercase letter, "
+                    "contain only lowercase letters, numbers, and hyphens, "
+                    "and not end with a hyphen."
+                )
+            self._custom_session_id = session_id
+            return self
+
         def dataprocSessionConfig(self, dataproc_config: Session):
             self._dataproc_config = dataproc_config
             for k, v in dataproc_config.runtime_config.properties.items():
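
To make the session ID format above concrete, here is a quick check (candidate IDs are made up) against the same pattern used by _is_valid_session_id:

    import re

    pattern = r"^[a-z][a-z0-9-]{2,61}[a-z0-9]$"  # pattern from _is_valid_session_id

    for candidate in ["team-shared-dev", "abc1", "1bad-start", "ends-with-dash-", "abc"]:
        print(f"{candidate!r}: {bool(re.match(pattern, candidate))}")
    # 'team-shared-dev': True   (lowercase letters, digits, hyphens; 4-63 chars)
    # 'abc1':            True   (minimum length is 4)
    # '1bad-start':      False  (must start with a lowercase letter)
    # 'ends-with-dash-': False  (must end with a letter or digit)
    # 'abc':             False  (shorter than 4 characters)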
@@ -274,7 +334,13 @@
             # Check runtime version compatibility before creating session
             self._check_runtime_compatibility(dataproc_config)
 
-            session_id = self.generate_dataproc_session_id()
+            # Use custom session ID if provided, otherwise generate one
+            session_id = (
+                self._custom_session_id
+                if self._custom_session_id
+                else self.generate_dataproc_session_id()
+            )
+
             dataproc_config.name = f"projects/{self._project_id}/locations/{self._region}/sessions/{session_id}"
             logger.debug(
                 f"Dataproc Session configuration:\n{dataproc_config}"
@@ -289,6 +355,10 @@
 
             logger.debug("Creating Dataproc Session")
             DataprocSparkSession._active_s8s_session_id = session_id
+            # Track whether this session uses a custom ID (unmanaged) or auto-generated ID (managed)
+            DataprocSparkSession._active_session_uses_custom_id = (
+                self._custom_session_id is not None
+            )
             s8s_creation_start_time = time.time()
 
             stop_create_session_pbar_event = threading.Event()
@@ -379,6 +449,7 @@
                 if create_session_pbar_thread.is_alive():
                     create_session_pbar_thread.join()
                 DataprocSparkSession._active_s8s_session_id = None
+                DataprocSparkSession._active_session_uses_custom_id = False
                 raise DataprocSparkConnectException(
                     f"Error while creating Dataproc Session: {e.message}"
                 )
@@ -387,6 +458,7 @@
                 if create_session_pbar_thread.is_alive():
                     create_session_pbar_thread.join()
                 DataprocSparkSession._active_s8s_session_id = None
+                DataprocSparkSession._active_session_uses_custom_id = False
                 raise RuntimeError(
                     f"Error while creating Dataproc Session"
                 ) from e
@@ -428,16 +500,19 @@
             :param html_element: HTML element to display for interactive IPython
             environment
             """
+            # Don't print any output (Rich or Plain) for non-interactive
+            if not environment.is_interactive():
+                return
+
+            if environment.is_interactive_terminal():
+                print(plain_message)
+                return
+
             try:
                 from IPython.display import display, HTML
-                from IPython.core.interactiveshell import InteractiveShell
 
-                if not InteractiveShell.initialized():
-                    raise DataprocSparkConnectException(
-                        "Not in an Interactive IPython Environment"
-                    )
                 display(HTML(html_element))
-            except (ImportError, DataprocSparkConnectException):
+            except ImportError:
                 print(plain_message)
 
         def _get_exiting_active_session(
@@ -477,11 +552,30 @@
 
         def getOrCreate(self) -> "DataprocSparkSession":
             with DataprocSparkSession._lock:
+                # Handle custom session ID by setting it early and letting existing logic handle it
+                if self._custom_session_id:
+                    self._handle_custom_session_id()
+
                 session = self._get_exiting_active_session()
                 if session is None:
                     session = self.__create()
                 return session
 
+        def _handle_custom_session_id(self):
+            """Handle custom session ID by checking if it exists and setting _active_s8s_session_id."""
+            session_response = self._get_session_by_id(self._custom_session_id)
+            if session_response is not None:
+                # Found an active session with the custom ID, set it as the active session
+                DataprocSparkSession._active_s8s_session_id = (
+                    self._custom_session_id
+                )
+                # Mark that this session uses a custom ID
+                DataprocSparkSession._active_session_uses_custom_id = True
+            else:
+                # No existing session found, clear any existing active session ID
+                # so we'll create a new one with the custom ID
+                DataprocSparkSession._active_s8s_session_id = None
+
         def _get_dataproc_config(self):
             # Use the property to ensure we always have a config
             dataproc_config = self.dataproc_config
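
A hedged usage sketch of the new flow (the project and session IDs below are illustrative; the region can also come from the GOOGLE_CLOUD_REGION environment variable): if a session with the given ID is already ACTIVE or CREATING, getOrCreate() reuses it, otherwise a new session is created under that ID.

    from google.cloud.dataproc_spark_connect import DataprocSparkSession

    spark = (
        DataprocSparkSession.builder
        .projectId("my-project")               # illustrative project ID
        .dataprocSessionId("team-shared-dev")  # custom, reusable session ID
        .getOrCreate()                         # reuses the session if it already exists
    )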
@@ -499,20 +593,33 @@
             self._check_python_version_compatibility(
                 dataproc_config.runtime_config.version
             )
+
+            # Use local variable to improve readability of deeply nested attribute access
+            exec_config = dataproc_config.environment_config.execution_config
+
+            # Set service account from environment if not already set
             if (
-                not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
-                and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
-            ):
-                dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = AuthenticationConfig.AuthenticationType[
-                    os.getenv("DATAPROC_SPARK_CONNECT_AUTH_TYPE")
-                ]
-            if (
-                not dataproc_config.environment_config.execution_config.service_account
+                not exec_config.service_account
                 and "DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT" in os.environ
             ):
-                dataproc_config.environment_config.execution_config.service_account = os.getenv(
+                exec_config.service_account = os.getenv(
                     "DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT"
                 )
+
+            # Auto-set authentication type to SERVICE_ACCOUNT when service account is provided
+            if exec_config.service_account:
+                # When service account is provided, explicitly set auth type to SERVICE_ACCOUNT
+                exec_config.authentication_config.user_workload_authentication_type = (
+                    AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
+                )
+            elif (
+                not exec_config.authentication_config.user_workload_authentication_type
+                and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
+            ):
+                # Only set auth type from environment if no service account is present
+                exec_config.authentication_config.user_workload_authentication_type = AuthenticationConfig.AuthenticationType[
+                    os.getenv("DATAPROC_SPARK_CONNECT_AUTH_TYPE")
+                ]
             if (
                 not dataproc_config.environment_config.execution_config.subnetwork_uri
                 and "DATAPROC_SPARK_CONNECT_SUBNET" in os.environ
@@ -579,6 +686,7 @@
                     f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
                     f" {default_datasource}. Supported value is 'bigquery'."
                 )
+
             return dataproc_config
 
         def _check_python_version_compatibility(self, runtime_version):
@@ -643,6 +751,14 @@
             )
 
         def _display_view_session_details_button(self, session_id):
+            # Display button is only supported in colab enterprise
+            if not environment.is_colab_enterprise():
+                return
+
+            # Skip button display for colab enterprise IPython terminals
+            if environment.is_interactive_terminal():
+                return
+
             try:
                 session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
                 from IPython.core.interactiveshell import InteractiveShell
@@ -658,6 +774,90 @@
             except ImportError as e:
                 logger.debug(f"Import error: {e}")
 
+        def _get_session_by_id(self, session_id: str) -> Optional[Session]:
+            """
+            Get existing session by ID.
+
+            Returns:
+                Session if ACTIVE/CREATING, None if not found or not usable
+            """
+            session_name = f"projects/{self._project_id}/locations/{self._region}/sessions/{session_id}"
+
+            try:
+                get_request = GetSessionRequest(name=session_name)
+                session = self.session_controller_client.get_session(
+                    get_request
+                )
+
+                logger.debug(
+                    f"Found existing session {session_id} in state: {session.state}"
+                )
+
+                if session.state in [
+                    Session.State.ACTIVE,
+                    Session.State.CREATING,
+                ]:
+                    # Reuse the active session
+                    logger.info(f"Reusing existing session: {session_id}")
+                    return session
+                else:
+                    # Session exists but is not usable (terminated/failed/terminating)
+                    logger.info(
+                        f"Session {session_id} in {session.state.name} state, cannot reuse"
+                    )
+                    return None
+
+            except NotFound:
+                # Session doesn't exist, can create new one
+                logger.debug(
+                    f"Session {session_id} not found, can create new one"
+                )
+                return None
+            except Exception as e:
+                logger.error(f"Error checking session {session_id}: {e}")
+                return None
+
+        def _delete_session(self, session_name: str):
+            """Delete a session to free up the session ID for reuse."""
+            try:
+                delete_request = DeleteSessionRequest(name=session_name)
+                self.session_controller_client.delete_session(delete_request)
+                logger.debug(f"Deleted session: {session_name}")
+            except NotFound:
+                logger.debug(f"Session already deleted: {session_name}")
+
+        def _wait_for_termination(self, session_name: str, timeout: int = 180):
+            """Wait for a session to finish terminating."""
+            start_time = time.time()
+
+            while time.time() - start_time < timeout:
+                try:
+                    get_request = GetSessionRequest(name=session_name)
+                    session = self.session_controller_client.get_session(
+                        get_request
+                    )
+
+                    if session.state in [
+                        Session.State.TERMINATED,
+                        Session.State.FAILED,
+                    ]:
+                        return
+                    elif session.state != Session.State.TERMINATING:
+                        # Session is in unexpected state
+                        logger.warning(
+                            f"Session {session_name} in unexpected state while waiting for termination: {session.state}"
+                        )
+                        return
+
+                    time.sleep(2)
+                except NotFound:
+                    # Session was deleted
+                    return
+
+            logger.warning(
+                f"Timeout waiting for session {session_name} to terminate"
+            )
+
         @staticmethod
         def generate_dataproc_session_id():
             timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -942,16 +1142,29 @@
     def stop(self) -> None:
         with DataprocSparkSession._lock:
             if DataprocSparkSession._active_s8s_session_id is not None:
-                terminate_s8s_session(
-                    DataprocSparkSession._project_id,
-                    DataprocSparkSession._region,
-                    DataprocSparkSession._active_s8s_session_id,
-                    self._client_options,
-                )
+                # Check if this is a managed session (auto-generated ID) or unmanaged session (custom ID)
+                if DataprocSparkSession._active_session_uses_custom_id:
+                    # Unmanaged session (custom ID): Only clean up client-side state
+                    # Don't terminate as it might be in use by other notebooks or clients
+                    logger.debug(
+                        f"Stopping unmanaged session {DataprocSparkSession._active_s8s_session_id} without termination"
+                    )
+                else:
+                    # Managed session (auto-generated ID): Use original behavior and terminate
+                    logger.debug(
+                        f"Terminating managed session {DataprocSparkSession._active_s8s_session_id}"
+                    )
+                    terminate_s8s_session(
+                        DataprocSparkSession._project_id,
+                        DataprocSparkSession._region,
+                        DataprocSparkSession._active_s8s_session_id,
+                        self._client_options,
+                    )
 
                 self._remove_stopped_session_from_file()
                 DataprocSparkSession._active_s8s_session_uuid = None
                 DataprocSparkSession._active_s8s_session_id = None
+                DataprocSparkSession._active_session_uses_custom_id = False
             DataprocSparkSession._project_id = None
             DataprocSparkSession._region = None
             DataprocSparkSession._client_options = None
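
A short usage sketch of the resulting stop() semantics (the custom session ID is illustrative): a session obtained through dataprocSessionId() is treated as unmanaged, so stop() only clears local state and leaves the server-side session running for other clients, while an auto-generated session is terminated as before.

    from google.cloud.dataproc_spark_connect import DataprocSparkSession

    # Managed: auto-generated ID, so stop() terminates the server-side session.
    managed = DataprocSparkSession.builder.getOrCreate()
    managed.stop()

    # Unmanaged: custom ID, so stop() leaves the session running for other notebooks.
    shared = (
        DataprocSparkSession.builder
        .dataprocSessionId("team-shared-dev")
        .getOrCreate()
    )
    shared.stop()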
--- dataproc_spark_connect-1.0.0rc3/setup.py
+++ dataproc_spark_connect-1.0.0rc5/setup.py
@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()
 
 setup(
     name="dataproc-spark-connect",
-    version="1.0.0rc3",
+    version="1.0.0rc5",
     description="Dataproc client library for Spark Connect",
     long_description=long_description,
     author="Google LLC",