dataproc-spark-connect 0.7.4__py2.py3-none-any.whl → 0.8.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataproc_spark_connect-0.8.0.dist-info/METADATA (modified)
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 0.7.4
+ Version: 0.8.0
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
@@ -9,9 +9,16 @@ License-File: LICENSE
  Requires-Dist: google-api-core>=2.19
  Requires-Dist: google-cloud-dataproc>=5.18
  Requires-Dist: packaging>=20.0
- Requires-Dist: pyspark[connect]>=3.5
+ Requires-Dist: pyspark[connect]~=3.5.1
  Requires-Dist: tqdm>=4.67
  Requires-Dist: websockets>=14.0
+ Dynamic: author
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: summary

  # Dataproc Spark Connect Client

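Note: the pyspark pin tightened from ">=3.5" to the PEP 440 compatible-release specifier "~=3.5.1", which is equivalent to ">=3.5.1, <3.6" and therefore excludes pyspark 3.6+ and 4.x. A quick check (not part of the package) using the packaging library, which is already a declared dependency:

    from packaging.specifiers import SpecifierSet

    # PEP 440 compatible release: ~=3.5.1 means >=3.5.1, <3.6
    new_pin = SpecifierSet("~=3.5.1")
    old_pin = SpecifierSet(">=3.5")

    for version in ["3.5.0", "3.5.1", "3.5.5", "3.6.0", "4.0.0"]:
        print(version, version in old_pin, version in new_pin)
    # 3.5.0 True False
    # 3.5.1 True True
    # 3.5.5 True True
    # 3.6.0 True False
    # 4.0.0 True False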
dataproc_spark_connect-0.8.0.dist-info/RECORD (added)
@@ -0,0 +1,12 @@
+ dataproc_spark_connect-0.8.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
+ google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
+ google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
+ google/cloud/dataproc_spark_connect/session.py,sha256=sX6uLFsm6wJWnE6zSNFCh5P5lJCoOhZpSfyUzFgwthw,30023
+ google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
+ google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
+ google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
+ dataproc_spark_connect-0.8.0.dist-info/METADATA,sha256=xgmdQGsugNmxvYq9NwKS-FonzZmYUyK4HyWFA_xZnv0,3465
+ dataproc_spark_connect-0.8.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+ dataproc_spark_connect-0.8.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-0.8.0.dist-info/RECORD,,
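Each RECORD row has the form path,sha256=<digest>,size-in-bytes; the RECORD entry itself carries empty hash and size fields. Note the LICENSE file moved under dist-info/licenses/ and the METADATA and session.py hashes changed. A minimal sketch for reading such rows (the file path here is hypothetical):

    import csv

    # A wheel RECORD is CSV with three columns: path, "sha256=<digest>", size.
    with open("dataproc_spark_connect-0.8.0.dist-info/RECORD", newline="") as f:
        for path, digest, size in csv.reader(f):
            if digest:  # skip the self-referencing RECORD row (empty fields)
                print(path, digest, size)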
dataproc_spark_connect-0.8.0.dist-info/WHEEL (modified)
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.3.0)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py2-none-any
  Tag: py3-none-any
google/cloud/dataproc_spark_connect/exceptions.py (modified)
@@ -16,7 +16,7 @@
  class DataprocSparkConnectException(Exception):
      """A custom exception class to only print the error messages.
      This would be used for exceptions where the stack trace
-     doesn't provide any additional information.h
+     doesn't provide any additional information.
      """

      def __init__(self, message):
google/cloud/dataproc_spark_connect/session.py (modified)
@@ -18,14 +18,23 @@ import json
  import logging
  import os
  import random
+ import re
  import string
  import threading
  import time
+ from typing import Any, cast, ClassVar, Dict, Optional, Union
+ import uuid
  import tqdm

  from google.api_core import retry
  from google.api_core.client_options import ClientOptions
- from google.api_core.exceptions import Aborted, FailedPrecondition, InvalidArgument, NotFound, PermissionDenied
+ from google.api_core.exceptions import (
+     Aborted,
+     FailedPrecondition,
+     InvalidArgument,
+     NotFound,
+     PermissionDenied,
+ )
  from google.api_core.future.polling import POLLING_PREDICATE
  from google.cloud.dataproc_spark_connect.client import DataprocChannelBuilder
  from google.cloud.dataproc_spark_connect.exceptions import DataprocSparkConnectException
@@ -41,13 +50,32 @@ from google.cloud.dataproc_v1 import (
  from google.cloud.dataproc_v1.types import sessions
  from pyspark.sql.connect.session import SparkSession
  from pyspark.sql.utils import to_str
- from typing import Any, cast, ClassVar, Dict, Optional

  # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)


+ def _is_valid_label_value(value: str) -> bool:
+     """
+     Validates if a string complies with Google Cloud label value format.
+     Only lowercase letters, numbers, and dashes are allowed.
+     The value must start with lowercase letter or number and end with a lowercase letter or number.
+     Maximum length is 63 characters.
+     """
+     if not value:
+         return False
+
+     # Check maximum length (63 characters for GCP label values)
+     if len(value) > 63:
+         return False
+
+     # Check if the value matches the pattern: starts and ends with alphanumeric,
+     # contains only lowercase letters, numbers, and dashes
+     pattern = r"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$"
+     return bool(re.match(pattern, value))
+
+
  class DataprocSparkSession(SparkSession):
      """The entry point to programming Spark with the Dataset and DataFrame API.

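For reference, the helper's rules can be spot-checked in isolation; this standalone sketch reuses the same regex and limits (the sample values are illustrative, not from the package):

    import re

    # Same constraints as _is_valid_label_value: lowercase alphanumerics and
    # dashes, starting and ending alphanumeric, at most 63 characters.
    _LABEL_RE = re.compile(r"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$")

    def is_valid_label_value(value: str) -> bool:
        return bool(value) and len(value) <= 63 and bool(_LABEL_RE.match(value))

    assert is_valid_label_value("notebook-123")
    assert not is_valid_label_value("Has-Uppercase")  # uppercase rejected
    assert not is_valid_label_value("-leading-dash")  # must start alphanumeric
    assert not is_valid_label_value("a" * 64)         # over the 63-char limit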
@@ -77,16 +105,6 @@ class DataprocSparkSession(SparkSession):

      class Builder(SparkSession.Builder):

-         _session_static_configs = [
-             "spark.executor.cores",
-             "spark.executor.memoryOverhead",
-             "spark.executor.memory",
-             "spark.driver.memory",
-             "spark.driver.cores",
-             "spark.eventLog.dir",
-             "spark.history.fs.logDirectory",
-         ]
-
          def __init__(self):
              self._options: Dict[str, Any] = {}
              self._channel_builder: Optional[DataprocChannelBuilder] = None
@@ -100,15 +118,6 @@ class DataprocSparkSession(SparkSession):
                  )
              )

-         def __apply_options(self, session: "SparkSession") -> None:
-             with self._lock:
-                 self._options = {
-                     key: value
-                     for key, value in self._options.items()
-                     if key not in self._session_static_configs
-                 }
-                 self._apply_options(session)
-
          def projectId(self, project_id):
              self._project_id = project_id
              return self
@@ -166,7 +175,6 @@ class DataprocSparkSession(SparkSession):
              session = DataprocSparkSession(connection=self._channel_builder)

              DataprocSparkSession._set_default_and_active_session(session)
-             self.__apply_options(session)
              return session

          def __create(self) -> "DataprocSparkSession":
@@ -248,6 +256,7 @@ class DataprocSparkSession(SparkSession):
              print(
                  f"Creating Dataproc Session: https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
              )
+             self._display_view_session_details_button(session_id)
              create_session_pbar_thread.start()
              session_response: Session = operation.result(
                  polling=retry.Retry(
@@ -325,6 +334,7 @@ class DataprocSparkSession(SparkSession):
              print(
                  f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
              )
+             self._display_view_session_details_button(s8s_session_id)
              if session is None:
                  session = self.__create_spark_connect_session_from_s8s(
                      session_response, session_name
@@ -344,8 +354,6 @@ class DataprocSparkSession(SparkSession):
                  session = self._get_exiting_active_session()
                  if session is None:
                      session = self.__create()
-                 if session:
-                     self.__apply_options(session)
                  return session

          def _get_dataproc_config(self):
@@ -400,14 +408,22 @@ class DataprocSparkSession(SparkSession):
                      os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
                  )
              }
-             if "COLAB_NOTEBOOK_RUNTIME_ID" in os.environ:
-                 dataproc_config.labels["colab-notebook-runtime-id"] = (
-                     os.environ["COLAB_NOTEBOOK_RUNTIME_ID"]
-                 )
-             if "COLAB_NOTEBOOK_KERNEL_ID" in os.environ:
-                 dataproc_config.labels["colab-notebook-kernel-id"] = os.environ[
-                     "COLAB_NOTEBOOK_KERNEL_ID"
-                 ]
+             if "COLAB_NOTEBOOK_ID" in os.environ:
+                 colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
+                 # Extract the last part of the path, which is the ID
+                 notebook_id = os.path.basename(colab_notebook_name)
+                 if _is_valid_label_value(notebook_id):
+                     dataproc_config.labels["goog-colab-notebook-id"] = (
+                         notebook_id
+                     )
+                 else:
+                     logger.warning(
+                         f"Warning while processing notebook ID: Notebook ID '{notebook_id}' is not compliant with label value format. "
+                         f"Only lowercase letters, numbers, and dashes are allowed. "
+                         f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
+                         f"Maximum length is 63 characters. "
+                         f"Skipping notebook ID label."
+                     )
              default_datasource = os.getenv(
                  "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
              )
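The two unvalidated Colab labels are replaced by a single validated goog-colab-notebook-id label. A minimal sketch of the flow with a hypothetical COLAB_NOTEBOOK_ID value: the variable may hold a path, so only its basename becomes the label, and only if it passes validation.

    import os

    # Hypothetical value; COLAB_NOTEBOOK_ID may be a full path.
    os.environ["COLAB_NOTEBOOK_ID"] = "/notebooks/projects/demo-notebook-1"
    notebook_id = os.path.basename(os.environ["COLAB_NOTEBOOK_ID"])
    print(notebook_id)  # demo-notebook-1 -> labels["goog-colab-notebook-id"]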
@@ -434,6 +450,17 @@ class DataprocSparkSession(SparkSession):
              )
              return dataproc_config

+         def _display_view_session_details_button(self, session_id):
+             try:
+                 session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
+                 from google.cloud.aiplatform.utils import _ipython_utils
+
+                 _ipython_utils.display_link(
+                     "View Session Details", f"{session_url}", "dashboard"
+                 )
+             except ImportError as e:
+                 logger.debug(f"Import error: {e}")
+
          @staticmethod
          def generate_dataproc_session_id():
              timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -445,6 +472,41 @@ class DataprocSparkSession(SparkSession):
              )
              return f"sc-{timestamp}-{random_suffix}"

+     def __init__(
+         self,
+         connection: Union[str, DataprocChannelBuilder],
+         user_id: Optional[str] = None,
+     ):
+         """
+         Creates a new DataprocSparkSession for the Spark Connect interface.
+
+         Parameters
+         ----------
+         connection : str or :class:`DataprocChannelBuilder`
+             Connection string that is used to extract the connection parameters
+             and configure the GRPC connection. Or instance of ChannelBuilder /
+             DataprocChannelBuilder that creates GRPC connection.
+         user_id : str, optional
+             If not set, will default to the $USER environment. Defining the user
+             ID as part of the connection string takes precedence.
+         """
+
+         super().__init__(connection, user_id)
+
+         base_method = self.client._execute_plan_request_with_metadata
+
+         def wrapped_method(*args, **kwargs):
+             req = base_method(*args, **kwargs)
+             if not req.operation_id:
+                 req.operation_id = str(uuid.uuid4())
+                 logger.debug(
+                     f"No operation_id found. Setting operation_id: {req.operation_id}"
+                 )
+             self._display_operation_link(req.operation_id)
+             return req
+
+         self.client._execute_plan_request_with_metadata = wrapped_method
+
      def _repr_html_(self) -> str:
          if not self._active_s8s_session_id:
              return """
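The new constructor wraps the Spark Connect client's internal request factory so that every execute-plan request carries an operation ID the console link can target. A stripped-down sketch of the same wrap-and-replace pattern, using stand-in classes rather than the real client (DemoRequest and DemoClient are illustrative only):

    import uuid

    class DemoRequest:
        def __init__(self):
            self.operation_id = ""  # empty until assigned

    class DemoClient:
        def _execute_plan_request_with_metadata(self):
            return DemoRequest()

    client = DemoClient()
    base_method = client._execute_plan_request_with_metadata

    def wrapped_method(*args, **kwargs):
        req = base_method(*args, **kwargs)
        if not req.operation_id:
            req.operation_id = str(uuid.uuid4())  # tag request for the UI link
        return req

    # Replace the bound method on the instance, as the session __init__ does.
    client._execute_plan_request_with_metadata = wrapped_method
    print(client._execute_plan_request_with_metadata().operation_id)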
@@ -462,6 +524,37 @@ class DataprocSparkSession(SparkSession):
              </div>
              """

+     def _display_operation_link(self, operation_id: str):
+         assert all(
+             [
+                 operation_id is not None,
+                 self._region is not None,
+                 self._active_s8s_session_id is not None,
+                 self._project_id is not None,
+             ]
+         )
+
+         url = (
+             f"https://console.cloud.google.com/dataproc/interactive/{self._region}/"
+             f"{self._active_s8s_session_id}/sparkApplications/application;"
+             f"associatedSqlOperationId={operation_id}?project={self._project_id}"
+         )
+
+         try:
+             from IPython.display import display, HTML
+             from IPython.core.interactiveshell import InteractiveShell
+
+             if not InteractiveShell.initialized():
+                 return
+             html_element = f"""
+             <div>
+                 <p><a href="{url}">Spark UI</a> (Operation: {operation_id})</p>
+             </div>
+             """
+             display(HTML(html_element))
+         except ImportError:
+             return
+
      @staticmethod
      def _remove_stopped_session_from_file():
          file_path = DataprocSparkSession._get_active_session_file_path()
dataproc_spark_connect-0.7.4.dist-info/RECORD (removed)
@@ -1,12 +0,0 @@
- google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
- google/cloud/dataproc_spark_connect/exceptions.py,sha256=ilGyHD5M_yBQ3IC58-Y5miRGIQVJsLaNKvEGcHuk_BE,969
- google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
- google/cloud/dataproc_spark_connect/session.py,sha256=kMCZWmi_-ScJy9NO7NFrHaHDTXKxMwaCSDbdqGxEngk,26390
- google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
- google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
- google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
- dataproc_spark_connect-0.7.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- dataproc_spark_connect-0.7.4.dist-info/METADATA,sha256=viQwCWio0b-xja72qtR447f9Ol7nl0k5d6bx1j2BAEk,3328
- dataproc_spark_connect-0.7.4.dist-info/WHEEL,sha256=OpXWERl2xLPRHTvd2ZXo_iluPEQd8uSbYkJ53NAER_Y,109
- dataproc_spark_connect-0.7.4.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
- dataproc_spark_connect-0.7.4.dist-info/RECORD,,