dataproc-spark-connect 1.0.0rc3__tar.gz → 1.0.0rc4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/PKG-INFO +1 -1
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/dataproc_spark_connect.egg-info/PKG-INFO +1 -1
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/google/cloud/dataproc_spark_connect/environment.py +9 -1
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/google/cloud/dataproc_spark_connect/session.py +212 -13
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/setup.py +1 -1
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/LICENSE +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/README.md +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/dataproc_spark_connect.egg-info/SOURCES.txt +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/dataproc_spark_connect.egg-info/requires.txt +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/google/cloud/dataproc_spark_connect/client/core.py +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/google/cloud/dataproc_spark_connect/exceptions.py +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/pyproject.toml +0 -0
- {dataproc_spark_connect-1.0.0rc3 → dataproc_spark_connect-1.0.0rc4}/setup.cfg +0 -0
google/cloud/dataproc_spark_connect/environment.py

```diff
@@ -48,7 +48,15 @@ def is_jetbrains_ide() -> bool:
 
 
 def is_interactive():
-    return hasattr(sys, "ps1") or sys.flags.interactive
+    try:
+        from IPython import get_ipython
+
+        if get_ipython() is not None:
+            return True
+    except ImportError:
+        pass
+
+    return hasattr(sys, "ps1") or sys.flags.interactive
 
 
 def is_terminal():
```
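In practice, the new check means notebook kernels are detected even when `sys.ps1` is unset. A minimal sketch of the decision order (assuming the module is imported as packaged):

```python
from google.cloud.dataproc_spark_connect import environment

# Plain `python script.py`: no IPython, no sys.ps1, interactive flag unset -> False
# `python -i` or the standard REPL: sys.ps1 / sys.flags.interactive -> True
# Jupyter/IPython kernel: get_ipython() is not None -> True (the rc4 addition)
print(environment.is_interactive())
```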
google/cloud/dataproc_spark_connect/session.py

```diff
@@ -46,6 +46,7 @@ from google.cloud.dataproc_spark_connect.pypi_artifacts import PyPiArtifacts
 from google.cloud.dataproc_v1 import (
     AuthenticationConfig,
     CreateSessionRequest,
+    DeleteSessionRequest,
     GetSessionRequest,
     Session,
     SessionControllerClient,
```
```diff
@@ -87,6 +88,22 @@ def _is_valid_label_value(value: str) -> bool:
     return bool(re.match(pattern, value))
 
 
+def _is_valid_session_id(session_id: str) -> bool:
+    """
+    Validates if a string complies with Google Cloud session ID format.
+    - Must be 4-63 characters
+    - Only lowercase letters, numbers, and dashes are allowed
+    - Must start with a lowercase letter
+    - Cannot end with a dash
+    """
+    if not session_id:
+        return False
+
+    # The pattern is sufficient for validation and already enforces length constraints.
+    pattern = r"^[a-z][a-z0-9-]{2,61}[a-z0-9]$"
+    return bool(re.match(pattern, session_id))
+
+
 class DataprocSparkSession(SparkSession):
     """The entry point to programming Spark with the Dataset and DataFrame API.
 
```
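The validator is a single anchored regex: the `{2,61}` middle section plus the fixed first and last characters enforce the 4-63 length bounds. A quick sketch exercising the pattern (candidate IDs are illustrative):

```python
import re

pattern = r"^[a-z][a-z0-9-]{2,61}[a-z0-9]$"  # same pattern as _is_valid_session_id

checks = {
    "sess": True,          # exactly the 4-character minimum
    "my-session-1": True,
    "abc": False,          # too short: the pattern requires at least 4 characters
    "1session": False,     # must start with a lowercase letter
    "session-": False,     # cannot end with a dash
}
for candidate, expected in checks.items():
    assert bool(re.match(pattern, candidate)) is expected
```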
```diff
@@ -114,6 +131,7 @@ class DataprocSparkSession(SparkSession):
     _region = None
     _client_options = None
     _active_s8s_session_id: ClassVar[Optional[str]] = None
+    _active_session_uses_custom_id: ClassVar[bool] = False
     _execution_progress_bar = dict()
 
     class Builder(SparkSession.Builder):
```
```diff
@@ -122,6 +140,7 @@ class DataprocSparkSession(SparkSession):
             self._options: Dict[str, Any] = {}
             self._channel_builder: Optional[DataprocChannelBuilder] = None
             self._dataproc_config: Optional[Session] = None
+            self._custom_session_id: Optional[str] = None
             self._project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
             self._region = os.getenv("GOOGLE_CLOUD_REGION")
             self._client_options = ClientOptions(
```
```diff
@@ -130,6 +149,18 @@
                     f"{self._region}-dataproc.googleapis.com",
                 )
             )
+            self._session_controller_client: Optional[
+                SessionControllerClient
+            ] = None
+
+        @property
+        def session_controller_client(self) -> SessionControllerClient:
+            """Get or create a SessionControllerClient instance."""
+            if self._session_controller_client is None:
+                self._session_controller_client = SessionControllerClient(
+                    client_options=self._client_options
+                )
+            return self._session_controller_client
 
         def projectId(self, project_id):
             self._project_id = project_id
```
```diff
@@ -143,6 +174,35 @@
             )
             return self
 
+        def dataprocSessionId(self, session_id: str):
+            """
+            Set a custom session ID for creating or reusing sessions.
+
+            The session ID must:
+            - Be 4-63 characters long
+            - Start with a lowercase letter
+            - Contain only lowercase letters, numbers, and hyphens
+            - Not end with a hyphen
+
+            Args:
+                session_id: The custom session ID to use
+
+            Returns:
+                This Builder instance for method chaining
+
+            Raises:
+                ValueError: If the session ID format is invalid
+            """
+            if not _is_valid_session_id(session_id):
+                raise ValueError(
+                    f"Invalid session ID: '{session_id}'. "
+                    "Session ID must be 4-63 characters, start with a lowercase letter, "
+                    "contain only lowercase letters, numbers, and hyphens, "
+                    "and not end with a hyphen."
+                )
+            self._custom_session_id = session_id
+            return self
+
         def dataprocSessionConfig(self, dataproc_config: Session):
             self._dataproc_config = dataproc_config
             for k, v in dataproc_config.runtime_config.properties.items():
```
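Taken together with the `getOrCreate()` changes further down, the new builder option enables sharing one named session across clients. A usage sketch (project, region, and session name are placeholders; `projectId()` appears in the diff above, while the region is read from `GOOGLE_CLOUD_REGION` in `Builder.__init__`):

```python
import os

# Must be set before the builder is constructed, since Builder.__init__ reads it.
os.environ["GOOGLE_CLOUD_REGION"] = "us-central1"  # placeholder

from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = (
    DataprocSparkSession.builder
    .projectId("my-project")                    # placeholder
    .dataprocSessionId("team-shared-session")   # validated by _is_valid_session_id
    .getOrCreate()  # reuses the session if it is ACTIVE/CREATING, else creates it
)
```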
```diff
@@ -274,7 +334,13 @@
                 # Check runtime version compatibility before creating session
                 self._check_runtime_compatibility(dataproc_config)
 
-                session_id = self.generate_dataproc_session_id()
+                # Use custom session ID if provided, otherwise generate one
+                session_id = (
+                    self._custom_session_id
+                    if self._custom_session_id
+                    else self.generate_dataproc_session_id()
+                )
+
                 dataproc_config.name = f"projects/{self._project_id}/locations/{self._region}/sessions/{session_id}"
                 logger.debug(
                     f"Dataproc Session configuration:\n{dataproc_config}"
```
```diff
@@ -289,6 +355,10 @@
 
                 logger.debug("Creating Dataproc Session")
                 DataprocSparkSession._active_s8s_session_id = session_id
+                # Track whether this session uses a custom ID (unmanaged) or auto-generated ID (managed)
+                DataprocSparkSession._active_session_uses_custom_id = (
+                    self._custom_session_id is not None
+                )
                 s8s_creation_start_time = time.time()
 
                 stop_create_session_pbar_event = threading.Event()
```
```diff
@@ -379,6 +449,7 @@
                     if create_session_pbar_thread.is_alive():
                         create_session_pbar_thread.join()
                     DataprocSparkSession._active_s8s_session_id = None
+                    DataprocSparkSession._active_session_uses_custom_id = False
                     raise DataprocSparkConnectException(
                         f"Error while creating Dataproc Session: {e.message}"
                     )
```
```diff
@@ -387,6 +458,7 @@
                     if create_session_pbar_thread.is_alive():
                         create_session_pbar_thread.join()
                     DataprocSparkSession._active_s8s_session_id = None
+                    DataprocSparkSession._active_session_uses_custom_id = False
                     raise RuntimeError(
                         f"Error while creating Dataproc Session"
                     ) from e
```
```diff
@@ -428,16 +500,19 @@
             :param html_element: HTML element to display for interactive IPython
                 environment
             """
+            # Don't print any output (Rich or Plain) for non-interactive
+            if not environment.is_interactive():
+                return
+
+            if environment.is_interactive_terminal():
+                print(plain_message)
+                return
+
             try:
                 from IPython.display import display, HTML
-                from IPython.core.interactiveshell import InteractiveShell
 
-                if not InteractiveShell.initialized():
-                    raise DataprocSparkConnectException(
-                        "Not in an Interactive IPython Environment"
-                    )
                 display(HTML(html_element))
-            except
+            except ImportError:
                 print(plain_message)
 
         def _get_exiting_active_session(
```
```diff
@@ -477,11 +552,30 @@
 
         def getOrCreate(self) -> "DataprocSparkSession":
             with DataprocSparkSession._lock:
+                # Handle custom session ID by setting it early and letting existing logic handle it
+                if self._custom_session_id:
+                    self._handle_custom_session_id()
+
                 session = self._get_exiting_active_session()
                 if session is None:
                     session = self.__create()
                 return session
 
+        def _handle_custom_session_id(self):
+            """Handle custom session ID by checking if it exists and setting _active_s8s_session_id."""
+            session_response = self._get_session_by_id(self._custom_session_id)
+            if session_response is not None:
+                # Found an active session with the custom ID, set it as the active session
+                DataprocSparkSession._active_s8s_session_id = (
+                    self._custom_session_id
+                )
+                # Mark that this session uses a custom ID
+                DataprocSparkSession._active_session_uses_custom_id = True
+            else:
+                # No existing session found, clear any existing active session ID
+                # so we'll create a new one with the custom ID
+                DataprocSparkSession._active_s8s_session_id = None
+
         def _get_dataproc_config(self):
             # Use the property to ensure we always have a config
             dataproc_config = self.dataproc_config
```
```diff
@@ -643,6 +737,14 @@
             )
 
         def _display_view_session_details_button(self, session_id):
+            # Display button is only supported in colab enterprise
+            if not environment.is_colab_enterprise():
+                return
+
+            # Skip button display for colab enterprise IPython terminals
+            if environment.is_interactive_terminal():
+                return
+
             try:
                 session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
                 from IPython.core.interactiveshell import InteractiveShell
```
```diff
@@ -658,6 +760,90 @@
             except ImportError as e:
                 logger.debug(f"Import error: {e}")
 
+        def _get_session_by_id(self, session_id: str) -> Optional[Session]:
+            """
+            Get existing session by ID.
+
+            Returns:
+                Session if ACTIVE/CREATING, None if not found or not usable
+            """
+            session_name = f"projects/{self._project_id}/locations/{self._region}/sessions/{session_id}"
+
+            try:
+                get_request = GetSessionRequest(name=session_name)
+                session = self.session_controller_client.get_session(
+                    get_request
+                )
+
+                logger.debug(
+                    f"Found existing session {session_id} in state: {session.state}"
+                )
+
+                if session.state in [
+                    Session.State.ACTIVE,
+                    Session.State.CREATING,
+                ]:
+                    # Reuse the active session
+                    logger.info(f"Reusing existing session: {session_id}")
+                    return session
+                else:
+                    # Session exists but is not usable (terminated/failed/terminating)
+                    logger.info(
+                        f"Session {session_id} in {session.state.name} state, cannot reuse"
+                    )
+                    return None
+
+            except NotFound:
+                # Session doesn't exist, can create new one
+                logger.debug(
+                    f"Session {session_id} not found, can create new one"
+                )
+                return None
+            except Exception as e:
+                logger.error(f"Error checking session {session_id}: {e}")
+                return None
+
+        def _delete_session(self, session_name: str):
+            """Delete a session to free up the session ID for reuse."""
+            try:
+                delete_request = DeleteSessionRequest(name=session_name)
+                self.session_controller_client.delete_session(delete_request)
+                logger.debug(f"Deleted session: {session_name}")
+            except NotFound:
+                logger.debug(f"Session already deleted: {session_name}")
+
+        def _wait_for_termination(self, session_name: str, timeout: int = 180):
+            """Wait for a session to finish terminating."""
+            start_time = time.time()
+
+            while time.time() - start_time < timeout:
+                try:
+                    get_request = GetSessionRequest(name=session_name)
+                    session = self.session_controller_client.get_session(
+                        get_request
+                    )
+
+                    if session.state in [
+                        Session.State.TERMINATED,
+                        Session.State.FAILED,
+                    ]:
+                        return
+                    elif session.state != Session.State.TERMINATING:
+                        # Session is in unexpected state
+                        logger.warning(
+                            f"Session {session_name} in unexpected state while waiting for termination: {session.state}"
+                        )
+                        return
+
+                    time.sleep(2)
+                except NotFound:
+                    # Session was deleted
+                    return
+
+            logger.warning(
+                f"Timeout waiting for session {session_name} to terminate"
+            )
+
         @staticmethod
         def generate_dataproc_session_id():
             timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
```
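None of the hunks shown here call `_delete_session` or `_wait_for_termination`, so the following is a purely hypothetical composition of the new helpers (names and flow are illustrative, not the package's code):

```python
# Hypothetical sketch: freeing a named session ID so it can be recreated.
builder = DataprocSparkSession.builder
session_id = "team-shared-session"  # placeholder
name = f"projects/my-project/locations/us-central1/sessions/{session_id}"

if builder._get_session_by_id(session_id) is None:
    # The ID is missing or in a non-reusable state (e.g. TERMINATING):
    # wait out any in-flight termination, then delete the record so the
    # same session ID becomes available again.
    builder._wait_for_termination(name)
    builder._delete_session(name)
```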
```diff
@@ -942,16 +1128,29 @@
     def stop(self) -> None:
         with DataprocSparkSession._lock:
             if DataprocSparkSession._active_s8s_session_id is not None:
-                terminate_s8s_session(
-                    DataprocSparkSession._project_id,
-                    DataprocSparkSession._region,
-                    DataprocSparkSession._active_s8s_session_id,
-                    self._client_options,
-                )
+                # Check if this is a managed session (auto-generated ID) or unmanaged session (custom ID)
+                if DataprocSparkSession._active_session_uses_custom_id:
+                    # Unmanaged session (custom ID): Only clean up client-side state
+                    # Don't terminate as it might be in use by other notebooks or clients
+                    logger.debug(
+                        f"Stopping unmanaged session {DataprocSparkSession._active_s8s_session_id} without termination"
+                    )
+                else:
+                    # Managed session (auto-generated ID): Use original behavior and terminate
+                    logger.debug(
+                        f"Terminating managed session {DataprocSparkSession._active_s8s_session_id}"
+                    )
+                    terminate_s8s_session(
+                        DataprocSparkSession._project_id,
+                        DataprocSparkSession._region,
+                        DataprocSparkSession._active_s8s_session_id,
+                        self._client_options,
+                    )
 
             self._remove_stopped_session_from_file()
             DataprocSparkSession._active_s8s_session_uuid = None
             DataprocSparkSession._active_s8s_session_id = None
+            DataprocSparkSession._active_session_uses_custom_id = False
             DataprocSparkSession._project_id = None
             DataprocSparkSession._region = None
             DataprocSparkSession._client_options = None
```
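The practical upshot of the `stop()` branch above, as a behavior sketch (session name is a placeholder):

```python
# Managed session: auto-generated ID, so stop() terminates it server-side.
spark = DataprocSparkSession.builder.getOrCreate()
spark.stop()  # calls terminate_s8s_session(...)

# Unmanaged session: custom ID via dataprocSessionId(), so stop() only clears
# local client state; the server-side session stays up for other clients.
spark = (
    DataprocSparkSession.builder
    .dataprocSessionId("team-shared-session")
    .getOrCreate()
)
spark.stop()  # no terminate call
```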
setup.py

```diff
@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()
 
 setup(
     name="dataproc-spark-connect",
-    version="1.0.0rc3",
+    version="1.0.0rc4",
     description="Dataproc client library for Spark Connect",
     long_description=long_description,
     author="Google LLC",
```