dataproc-spark-connect 0.7.5__tar.gz → 0.8.1__tar.gz
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registry.
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/PKG-INFO +9 -2
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/dataproc_spark_connect.egg-info/PKG-INFO +9 -2
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/google/cloud/dataproc_spark_connect/exceptions.py +1 -1
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/google/cloud/dataproc_spark_connect/session.py +174 -36
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/setup.py +1 -1
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/LICENSE +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/README.md +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/dataproc_spark_connect.egg-info/SOURCES.txt +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/dataproc_spark_connect.egg-info/requires.txt +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/google/cloud/dataproc_spark_connect/client/core.py +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/pyproject.toml +0 -0
- {dataproc_spark_connect-0.7.5 → dataproc_spark_connect-0.8.1}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 0.7.5
+Version: 0.8.1
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -12,6 +12,13 @@ Requires-Dist: packaging>=20.0
 Requires-Dist: pyspark[connect]~=3.5.1
 Requires-Dist: tqdm>=4.67
 Requires-Dist: websockets>=14.0
+Dynamic: author
+Dynamic: description
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: summary
 
 # Dataproc Spark Connect Client
 

dataproc_spark_connect.egg-info/PKG-INFO

(Same changes as the PKG-INFO diff above.)

google/cloud/dataproc_spark_connect/exceptions.py

@@ -16,7 +16,7 @@
 class DataprocSparkConnectException(Exception):
     """A custom exception class to only print the error messages.
     This would be used for exceptions where the stack trace
-    doesn't provide any additional information.
+    doesn't provide any additional information.
     """
 
     def __init__(self, message):

(Whitespace-only change.)

google/cloud/dataproc_spark_connect/session.py

@@ -18,14 +18,23 @@ import json
 import logging
 import os
 import random
+import re
 import string
 import threading
 import time
+from typing import Any, cast, ClassVar, Dict, Optional, Union
+import uuid
 import tqdm
 
 from google.api_core import retry
 from google.api_core.client_options import ClientOptions
-from google.api_core.exceptions import
+from google.api_core.exceptions import (
+    Aborted,
+    FailedPrecondition,
+    InvalidArgument,
+    NotFound,
+    PermissionDenied,
+)
 from google.api_core.future.polling import POLLING_PREDICATE
 from google.cloud.dataproc_spark_connect.client import DataprocChannelBuilder
 from google.cloud.dataproc_spark_connect.exceptions import DataprocSparkConnectException
@@ -41,13 +50,32 @@ from google.cloud.dataproc_v1 import (
 from google.cloud.dataproc_v1.types import sessions
 from pyspark.sql.connect.session import SparkSession
 from pyspark.sql.utils import to_str
-from typing import Any, cast, ClassVar, Dict, Optional
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
+def _is_valid_label_value(value: str) -> bool:
+    """
+    Validates if a string complies with Google Cloud label value format.
+    Only lowercase letters, numbers, and dashes are allowed.
+    The value must start with lowercase letter or number and end with a lowercase letter or number.
+    Maximum length is 63 characters.
+    """
+    if not value:
+        return False
+
+    # Check maximum length (63 characters for GCP label values)
+    if len(value) > 63:
+        return False
+
+    # Check if the value matches the pattern: starts and ends with alphanumeric,
+    # contains only lowercase letters, numbers, and dashes
+    pattern = r"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$"
+    return bool(re.match(pattern, value))
+
+
 class DataprocSparkSession(SparkSession):
     """The entry point to programming Spark with the Dataset and DataFrame API.
 
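
The new _is_valid_label_value helper enforces the GCP label-value rules: lowercase letters, digits, and dashes only, alphanumeric at both ends, at most 63 characters. A quick illustrative check of those rules (a standalone sketch mirroring the function above; the helper itself is module-private, not a supported API):

import re

def is_valid_label_value(value: str) -> bool:
    # Same logic as the diffed helper: non-empty, at most 63 chars,
    # lowercase alphanumerics/dashes, alphanumeric at both ends.
    if not value or len(value) > 63:
        return False
    return bool(re.match(r"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$", value))

assert is_valid_label_value("fine-label-1")       # lowercase with inner dashes: OK
assert not is_valid_label_value("Has-Caps")       # uppercase is rejected
assert not is_valid_label_value("-leading-dash")  # must start with a letter or digit
assert not is_valid_label_value("a" * 64)         # over the 63-character cap
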
@@ -77,16 +105,6 @@
 
     class Builder(SparkSession.Builder):
 
-        _session_static_configs = [
-            "spark.executor.cores",
-            "spark.executor.memoryOverhead",
-            "spark.executor.memory",
-            "spark.driver.memory",
-            "spark.driver.cores",
-            "spark.eventLog.dir",
-            "spark.history.fs.logDirectory",
-        ]
-
         def __init__(self):
             self._options: Dict[str, Any] = {}
             self._channel_builder: Optional[DataprocChannelBuilder] = None
@@ -100,15 +118,6 @@
             )
         )
 
-        def __apply_options(self, session: "SparkSession") -> None:
-            with self._lock:
-                self._options = {
-                    key: value
-                    for key, value in self._options.items()
-                    if key not in self._session_static_configs
-                }
-            self._apply_options(session)
-
         def projectId(self, project_id):
             self._project_id = project_id
             return self
@@ -166,7 +175,6 @@
             session = DataprocSparkSession(connection=self._channel_builder)
 
             DataprocSparkSession._set_default_and_active_session(session)
-            self.__apply_options(session)
             return session
 
         def __create(self) -> "DataprocSparkSession":
@@ -245,9 +253,9 @@
                 operation = SessionControllerClient(
                     client_options=self._client_options
                 ).create_session(session_request)
-
-
-                )
+                self._display_session_link_on_creation(session_id)
+                # TODO: Add the 'View Session Details' button once the UI changes are done.
+                # self._display_view_session_details_button(session_id)
                 create_session_pbar_thread.start()
                 session_response: Session = operation.result(
                     polling=retry.Retry(
@@ -260,7 +268,7 @@
                 )
                 stop_create_session_pbar_event.set()
                 create_session_pbar_thread.join()
-
+                self._print_session_created_message()
                 file_path = (
                     DataprocSparkSession._get_active_session_file_path()
                 )
@@ -305,6 +313,46 @@
                     session_response, dataproc_config.name
                 )
 
+        def _display_session_link_on_creation(self, session_id):
+            session_url = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
+            plain_message = f"Creating Dataproc Session: {session_url}"
+            html_element = f"""
+                <div>
+                    <p>Creating Dataproc Spark Session<p>
+                    <p><a href="{session_url}">Dataproc Session</a></p>
+                </div>
+            """
+
+            self._output_element_or_message(plain_message, html_element)
+
+        def _print_session_created_message(self):
+            plain_message = f"Dataproc Session was successfully created"
+            html_element = f"<div><p>{plain_message}</p></div>"
+
+            self._output_element_or_message(plain_message, html_element)
+
+        def _output_element_or_message(self, plain_message, html_element):
+            """
+            Display / print the needed rich HTML element or plain text depending
+            on whether rich element is supported or not.
+
+            :param plain_message: Message to print on non-IPython or
+                non-interactive shell
+            :param html_element: HTML element to display for interactive IPython
+                environment
+            """
+            try:
+                from IPython.display import display, HTML
+                from IPython.core.interactiveshell import InteractiveShell
+
+                if not InteractiveShell.initialized():
+                    raise DataprocSparkConnectException(
+                        "Not in an Interactive IPython Environment"
+                    )
+                display(HTML(html_element))
+            except (ImportError, DataprocSparkConnectException):
+                print(plain_message)
+
         def _get_exiting_active_session(
             self,
         ) -> Optional["DataprocSparkSession"]:
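
The _output_element_or_message helper added above follows a common try-rich, fall-back-to-plain pattern: render HTML when an initialized interactive IPython shell is available, otherwise print. A minimal standalone sketch of the same pattern (illustrative names, not the package's API):

def show(plain_message: str, html_element: str) -> None:
    try:
        from IPython.display import display, HTML
        from IPython.core.interactiveshell import InteractiveShell

        if not InteractiveShell.initialized():
            raise RuntimeError("not an interactive IPython environment")
        display(HTML(html_element))  # rich output in notebooks
    except (ImportError, RuntimeError):
        print(plain_message)  # plain fallback in terminals and scripts

show("Session created", "<b>Session created</b>")
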
@@ -325,6 +373,8 @@
                 print(
                     f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
                 )
+                # TODO: Add the 'View Session Details' button once the UI changes are done.
+                # self._display_view_session_details_button(s8s_session_id)
                 if session is None:
                     session = self.__create_spark_connect_session_from_s8s(
                         session_response, session_name
@@ -344,8 +394,6 @@
             session = self._get_exiting_active_session()
             if session is None:
                 session = self.__create()
-            if session:
-                self.__apply_options(session)
             return session
 
         def _get_dataproc_config(self):
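
With the __apply_options pass removed, getOrCreate() now simply reuses an existing active session or creates a new one. A typical usage sketch (the projectId setter appears in this diff; the import path follows the package layout, and the project id is a placeholder):

from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Reuses the active Dataproc session if one exists, otherwise creates one.
spark = DataprocSparkSession.builder.projectId("my-project").getOrCreate()
spark.sql("SELECT 1 AS ok").show()
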
@@ -400,14 +448,22 @@
                     os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
                 )
             }
-            if "
-
-
-            )
-
-
-
-
+            if "COLAB_NOTEBOOK_ID" in os.environ:
+                colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
+                # Extract the last part of the path, which is the ID
+                notebook_id = os.path.basename(colab_notebook_name)
+                if _is_valid_label_value(notebook_id):
+                    dataproc_config.labels["goog-colab-notebook-id"] = (
+                        notebook_id
+                    )
+                else:
+                    logger.warning(
+                        f"Warning while processing notebook ID: Notebook ID '{notebook_id}' is not compliant with label value format. "
+                        f"Only lowercase letters, numbers, and dashes are allowed. "
+                        f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
+                        f"Maximum length is 63 characters. "
+                        f"Skipping notebook ID label."
+                    )
             default_datasource = os.getenv(
                 "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
             )
@@ -434,6 +490,22 @@
             )
             return dataproc_config
 
+        def _display_view_session_details_button(self, session_id):
+            try:
+                session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
+                from IPython.core.interactiveshell import InteractiveShell
+
+                if not InteractiveShell.initialized():
+                    return
+
+                from google.cloud.aiplatform.utils import _ipython_utils
+
+                _ipython_utils.display_link(
+                    "View Session Details", f"{session_url}", "dashboard"
+                )
+            except ImportError as e:
+                logger.debug(f"Import error: {e}")
+
         @staticmethod
         def generate_dataproc_session_id():
             timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -445,6 +517,41 @@
             )
             return f"sc-{timestamp}-{random_suffix}"
 
+    def __init__(
+        self,
+        connection: Union[str, DataprocChannelBuilder],
+        user_id: Optional[str] = None,
+    ):
+        """
+        Creates a new DataprocSparkSession for the Spark Connect interface.
+
+        Parameters
+        ----------
+        connection : str or :class:`DataprocChannelBuilder`
+            Connection string that is used to extract the connection parameters
+            and configure the GRPC connection. Or instance of ChannelBuilder /
+            DataprocChannelBuilder that creates GRPC connection.
+        user_id : str, optional
+            If not set, will default to the $USER environment. Defining the user
+            ID as part of the connection string takes precedence.
+        """
+
+        super().__init__(connection, user_id)
+
+        base_method = self.client._execute_plan_request_with_metadata
+
+        def wrapped_method(*args, **kwargs):
+            req = base_method(*args, **kwargs)
+            if not req.operation_id:
+                req.operation_id = str(uuid.uuid4())
+                logger.debug(
+                    f"No operation_id found. Setting operation_id: {req.operation_id}"
+                )
+            self._display_operation_link(req.operation_id)
+            return req
+
+        self.client._execute_plan_request_with_metadata = wrapped_method
+
     def _repr_html_(self) -> str:
         if not self._active_s8s_session_id:
             return """
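
The new __init__ wraps the Spark Connect client's _execute_plan_request_with_metadata so every outgoing execute-plan request carries an operation_id, which then keys the Spark UI link. The same wrap-and-replace pattern in isolation (Client and build_request are hypothetical stand-ins, not this package's API):

import uuid

class Client:
    def build_request(self) -> dict:
        return {"operation_id": ""}

client = Client()
base_method = client.build_request  # keep a reference to the original bound method

def wrapped_method(*args, **kwargs):
    req = base_method(*args, **kwargs)
    if not req["operation_id"]:
        # Tag the request so a console link can reference this operation.
        req["operation_id"] = str(uuid.uuid4())
    return req

client.build_request = wrapped_method  # shadow the method on this instance only
print(client.build_request()["operation_id"])  # prints a fresh UUID
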
@@ -462,6 +569,37 @@
             </div>
             """
 
+    def _display_operation_link(self, operation_id: str):
+        assert all(
+            [
+                operation_id is not None,
+                self._region is not None,
+                self._active_s8s_session_id is not None,
+                self._project_id is not None,
+            ]
+        )
+
+        url = (
+            f"https://console.cloud.google.com/dataproc/interactive/{self._region}/"
+            f"{self._active_s8s_session_id}/sparkApplications/application;"
+            f"associatedSqlOperationId={operation_id}?project={self._project_id}"
+        )
+
+        try:
+            from IPython.display import display, HTML
+            from IPython.core.interactiveshell import InteractiveShell
+
+            if not InteractiveShell.initialized():
+                return
+            html_element = f"""
+                <div>
+                    <p><a href="{url}">Spark UI</a> (Operation: {operation_id})</p>
+                </div>
+            """
+            display(HTML(html_element))
+        except ImportError:
+            return
+
     @staticmethod
     def _remove_stopped_session_from_file():
         file_path = DataprocSparkSession._get_active_session_file_path()

setup.py

@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()
 
 setup(
     name="dataproc-spark-connect",
-    version="0.7.5",
+    version="0.8.1",
     description="Dataproc client library for Spark Connect",
     long_description=long_description,
     author="Google LLC",
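
To pick up these changes, a standard PyPI upgrade is enough: pip install --upgrade dataproc-spark-connect==0.8.1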