dataproc-spark-connect 0.7.4__tar.gz → 0.8.0__tar.gz
This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/PKG-INFO +10 -3
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/dataproc_spark_connect.egg-info/PKG-INFO +10 -3
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/dataproc_spark_connect.egg-info/requires.txt +1 -1
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/google/cloud/dataproc_spark_connect/exceptions.py +1 -1
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/google/cloud/dataproc_spark_connect/session.py +125 -32
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/setup.py +2 -2
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/LICENSE +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/README.md +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/dataproc_spark_connect.egg-info/SOURCES.txt +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/google/cloud/dataproc_spark_connect/client/core.py +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/pyproject.toml +0 -0
- {dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/setup.cfg +0 -0
{dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 0.7.4
+Version: 0.8.0
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -9,9 +9,16 @@ License-File: LICENSE
 Requires-Dist: google-api-core>=2.19
 Requires-Dist: google-cloud-dataproc>=5.18
 Requires-Dist: packaging>=20.0
-Requires-Dist: pyspark[connect]
+Requires-Dist: pyspark[connect]~=3.5.1
 Requires-Dist: tqdm>=4.67
 Requires-Dist: websockets>=14.0
+Dynamic: author
+Dynamic: description
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: summary
 
 # Dataproc Spark Connect Client
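The notable dependency change here is the new pyspark pin. `~=3.5.1` is a PEP 440 compatible-release clause, equivalent to `>=3.5.1, ==3.5.*`. A minimal sketch, using the `packaging` library that is already declared as a dependency above, shows which versions the new pin accepts:

```python
# A minimal sketch of what the new "~=3.5.1" pin accepts, using the packaging
# library that dataproc-spark-connect already depends on (packaging>=20.0).
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=3.5.1")  # compatible release: >=3.5.1, ==3.5.*
for candidate in ["3.5.0", "3.5.1", "3.5.4", "3.6.0", "4.0.0"]:
    print(candidate, candidate in spec)
# 3.5.0 False, 3.5.1 True, 3.5.4 True, 3.6.0 False, 4.0.0 False
```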
{dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/dataproc_spark_connect.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 0.7.4
+Version: 0.8.0
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -9,9 +9,16 @@ License-File: LICENSE
 Requires-Dist: google-api-core>=2.19
 Requires-Dist: google-cloud-dataproc>=5.18
 Requires-Dist: packaging>=20.0
-Requires-Dist: pyspark[connect]
+Requires-Dist: pyspark[connect]~=3.5.1
 Requires-Dist: tqdm>=4.67
 Requires-Dist: websockets>=14.0
+Dynamic: author
+Dynamic: description
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: summary
 
 # Dataproc Spark Connect Client
{dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/google/cloud/dataproc_spark_connect/exceptions.py

@@ -16,7 +16,7 @@
 class DataprocSparkConnectException(Exception):
     """A custom exception class to only print the error messages.
     This would be used for exceptions where the stack trace
-    doesn't provide any additional information.
+    doesn't provide any additional information.
     """
 
     def __init__(self, message):
{dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/google/cloud/dataproc_spark_connect/session.py

@@ -18,14 +18,23 @@ import json
 import logging
 import os
 import random
+import re
 import string
 import threading
 import time
+from typing import Any, cast, ClassVar, Dict, Optional, Union
+import uuid
 import tqdm
 
 from google.api_core import retry
 from google.api_core.client_options import ClientOptions
-from google.api_core.exceptions import
+from google.api_core.exceptions import (
+    Aborted,
+    FailedPrecondition,
+    InvalidArgument,
+    NotFound,
+    PermissionDenied,
+)
 from google.api_core.future.polling import POLLING_PREDICATE
 from google.cloud.dataproc_spark_connect.client import DataprocChannelBuilder
 from google.cloud.dataproc_spark_connect.exceptions import DataprocSparkConnectException
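The hunk above widens the `google.api_core.exceptions` import to five concrete error classes; their call sites fall outside this diff. A hedged sketch of the typical wrap-and-rethrow pattern (the function and call site below are hypothetical, not taken from the package):

```python
# Hypothetical sketch only: the hunk above imports these error classes, but
# how session.py uses them is not visible in this diff.
from google.api_core.exceptions import NotFound, PermissionDenied
from google.cloud.dataproc_spark_connect.exceptions import (
    DataprocSparkConnectException,
)


def terminate_session_sketch(client, session_name: str) -> None:
    try:
        client.terminate_session(name=session_name)  # hypothetical call site
    except (NotFound, PermissionDenied) as e:
        # Re-raise as the package's message-only exception type.
        raise DataprocSparkConnectException(f"Failed to terminate session: {e}")
```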
@@ -41,13 +50,32 @@ from google.cloud.dataproc_v1 import (
 from google.cloud.dataproc_v1.types import sessions
 from pyspark.sql.connect.session import SparkSession
 from pyspark.sql.utils import to_str
-from typing import Any, cast, ClassVar, Dict, Optional
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
+def _is_valid_label_value(value: str) -> bool:
+    """
+    Validates if a string complies with Google Cloud label value format.
+    Only lowercase letters, numbers, and dashes are allowed.
+    The value must start with lowercase letter or number and end with a lowercase letter or number.
+    Maximum length is 63 characters.
+    """
+    if not value:
+        return False
+
+    # Check maximum length (63 characters for GCP label values)
+    if len(value) > 63:
+        return False
+
+    # Check if the value matches the pattern: starts and ends with alphanumeric,
+    # contains only lowercase letters, numbers, and dashes
+    pattern = r"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$"
+    return bool(re.match(pattern, value))
+
+
 class DataprocSparkSession(SparkSession):
     """The entry point to programming Spark with the Dataset and DataFrame API.
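A few illustrative inputs (not part of the diff) showing how the new validator treats candidate GCP label values:

```python
# Illustrative checks, not part of the diff.
from google.cloud.dataproc_spark_connect.session import _is_valid_label_value

print(_is_valid_label_value("notebook-123"))   # True
print(_is_valid_label_value("My-Notebook"))    # False: uppercase not allowed
print(_is_valid_label_value("-leading"))       # False: must start with [a-z0-9]
print(_is_valid_label_value("trailing-"))      # False: must end with [a-z0-9]
print(_is_valid_label_value("a" * 64))         # False: longer than 63 characters
```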
@@ -77,16 +105,6 @@ class DataprocSparkSession(SparkSession):
 
     class Builder(SparkSession.Builder):
 
-        _session_static_configs = [
-            "spark.executor.cores",
-            "spark.executor.memoryOverhead",
-            "spark.executor.memory",
-            "spark.driver.memory",
-            "spark.driver.cores",
-            "spark.eventLog.dir",
-            "spark.history.fs.logDirectory",
-        ]
-
         def __init__(self):
             self._options: Dict[str, Any] = {}
             self._channel_builder: Optional[DataprocChannelBuilder] = None
@@ -100,15 +118,6 @@ class DataprocSparkSession(SparkSession):
                 )
             )
 
-        def __apply_options(self, session: "SparkSession") -> None:
-            with self._lock:
-                self._options = {
-                    key: value
-                    for key, value in self._options.items()
-                    if key not in self._session_static_configs
-                }
-            self._apply_options(session)
-
         def projectId(self, project_id):
             self._project_id = project_id
             return self
@@ -166,7 +175,6 @@ class DataprocSparkSession(SparkSession):
             session = DataprocSparkSession(connection=self._channel_builder)
 
             DataprocSparkSession._set_default_and_active_session(session)
-            self.__apply_options(session)
             return session
 
         def __create(self) -> "DataprocSparkSession":
@@ -248,6 +256,7 @@ class DataprocSparkSession(SparkSession):
             print(
                 f"Creating Dataproc Session: https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
             )
+            self._display_view_session_details_button(session_id)
             create_session_pbar_thread.start()
             session_response: Session = operation.result(
                 polling=retry.Retry(
@@ -325,6 +334,7 @@ class DataprocSparkSession(SparkSession):
                 print(
                     f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
                 )
+                self._display_view_session_details_button(s8s_session_id)
             if session is None:
                 session = self.__create_spark_connect_session_from_s8s(
                     session_response, session_name
@@ -344,8 +354,6 @@ class DataprocSparkSession(SparkSession):
             session = self._get_exiting_active_session()
             if session is None:
                 session = self.__create()
-            if session:
-                self.__apply_options(session)
             return session
 
         def _get_dataproc_config(self):
@@ -400,14 +408,22 @@ class DataprocSparkSession(SparkSession):
                     os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
                 )
             }
-            if "
-
-
-            )
-
-
-
-
+            if "COLAB_NOTEBOOK_ID" in os.environ:
+                colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
+                # Extract the last part of the path, which is the ID
+                notebook_id = os.path.basename(colab_notebook_name)
+                if _is_valid_label_value(notebook_id):
+                    dataproc_config.labels["goog-colab-notebook-id"] = (
+                        notebook_id
+                    )
+                else:
+                    logger.warning(
+                        f"Warning while processing notebook ID: Notebook ID '{notebook_id}' is not compliant with label value format. "
+                        f"Only lowercase letters, numbers, and dashes are allowed. "
+                        f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
+                        f"Maximum length is 63 characters. "
+                        f"Skipping notebook ID label."
+                    )
             default_datasource = os.getenv(
                 "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
             )
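The effect of the new Colab branch, in isolation (the notebook path below is a made-up example; the real value comes from the `COLAB_NOTEBOOK_ID` environment variable):

```python
import os

from google.cloud.dataproc_spark_connect.session import _is_valid_label_value

# Made-up example value; the real code reads os.environ["COLAB_NOTEBOOK_ID"].
colab_notebook_name = "projects/my-project/notebooks/1abc-notebook-id"
notebook_id = os.path.basename(colab_notebook_name)
print(notebook_id)                         # 1abc-notebook-id
print(_is_valid_label_value(notebook_id))  # True -> set as goog-colab-notebook-id label
```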
@@ -434,6 +450,17 @@ class DataprocSparkSession(SparkSession):
             )
             return dataproc_config
 
+        def _display_view_session_details_button(self, session_id):
+            try:
+                session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
+                from google.cloud.aiplatform.utils import _ipython_utils
+
+                _ipython_utils.display_link(
+                    "View Session Details", f"{session_url}", "dashboard"
+                )
+            except ImportError as e:
+                logger.debug(f"Import error: {e}")
+
         @staticmethod
         def generate_dataproc_session_id():
             timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -445,6 +472,41 @@ class DataprocSparkSession(SparkSession):
             )
             return f"sc-{timestamp}-{random_suffix}"
 
+    def __init__(
+        self,
+        connection: Union[str, DataprocChannelBuilder],
+        user_id: Optional[str] = None,
+    ):
+        """
+        Creates a new DataprocSparkSession for the Spark Connect interface.
+
+        Parameters
+        ----------
+        connection : str or :class:`DataprocChannelBuilder`
+            Connection string that is used to extract the connection parameters
+            and configure the GRPC connection. Or instance of ChannelBuilder /
+            DataprocChannelBuilder that creates GRPC connection.
+        user_id : str, optional
+            If not set, will default to the $USER environment. Defining the user
+            ID as part of the connection string takes precedence.
+        """
+
+        super().__init__(connection, user_id)
+
+        base_method = self.client._execute_plan_request_with_metadata
+
+        def wrapped_method(*args, **kwargs):
+            req = base_method(*args, **kwargs)
+            if not req.operation_id:
+                req.operation_id = str(uuid.uuid4())
+                logger.debug(
+                    f"No operation_id found. Setting operation_id: {req.operation_id}"
+                )
+            self._display_operation_link(req.operation_id)
+            return req
+
+        self.client._execute_plan_request_with_metadata = wrapped_method
+
     def _repr_html_(self) -> str:
         if not self._active_s8s_session_id:
             return """
@@ -462,6 +524,37 @@ class DataprocSparkSession(SparkSession):
             </div>
             """
 
+    def _display_operation_link(self, operation_id: str):
+        assert all(
+            [
+                operation_id is not None,
+                self._region is not None,
+                self._active_s8s_session_id is not None,
+                self._project_id is not None,
+            ]
+        )
+
+        url = (
+            f"https://console.cloud.google.com/dataproc/interactive/{self._region}/"
+            f"{self._active_s8s_session_id}/sparkApplications/application;"
+            f"associatedSqlOperationId={operation_id}?project={self._project_id}"
+        )
+
+        try:
+            from IPython.display import display, HTML
+            from IPython.core.interactiveshell import InteractiveShell
+
+            if not InteractiveShell.initialized():
+                return
+            html_element = f"""
+            <div>
+                <p><a href="{url}">Spark UI</a> (Operation: {operation_id})</p>
+            </div>
+            """
+            display(HTML(html_element))
+        except ImportError:
+            return
+
     @staticmethod
     def _remove_stopped_session_from_file():
         file_path = DataprocSparkSession._get_active_session_file_path()
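Taken together, the new `__init__` wraps the client's `_execute_plan_request_with_metadata` so every execution carries an `operation_id`, and in an IPython environment `_display_operation_link` renders a per-operation Spark UI link. A hypothetical notebook usage (the project ID is a placeholder, not from the diff):

```python
# Hypothetical notebook usage; the project ID is a placeholder.
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = (
    DataprocSparkSession.builder
    .projectId("my-project")  # builder method shown earlier in this diff
    .getOrCreate()
)

# Each execution now carries an operation_id (generated client-side via
# uuid.uuid4() when absent), and in an IPython shell a "Spark UI" link is
# rendered for that operation by _display_operation_link().
spark.sql("SELECT 1 AS one").show()
```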
{dataproc_spark_connect-0.7.4 → dataproc_spark_connect-0.8.0}/setup.py

@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()
 
 setup(
     name="dataproc-spark-connect",
-    version="0.7.4",
+    version="0.8.0",
     description="Dataproc client library for Spark Connect",
     long_description=long_description,
     author="Google LLC",
@@ -31,7 +31,7 @@ setup(
         "google-api-core>=2.19",
         "google-cloud-dataproc>=5.18",
         "packaging>=20.0",
-        "pyspark[connect]",
+        "pyspark[connect]~=3.5.1",
         "tqdm>=4.67",
         "websockets>=14.0",
     ],
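The setup.py pin mirrors the PKG-INFO change above. A quick post-upgrade sanity check (a sketch, assuming pyspark is importable in the target environment):

```python
# A sketch, assuming pyspark was installed alongside dataproc-spark-connect
# 0.8.0: the ~=3.5.1 pin keeps pyspark on the 3.5 line, never 3.6+ or 4.x.
import pyspark

assert pyspark.__version__.startswith("3.5."), pyspark.__version__
print("pyspark", pyspark.__version__, "satisfies ~=3.5.1")
```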