dataproc-spark-connect 0.7.5__py2.py3-none-any.whl → 0.8.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataproc_spark_connect-0.7.5.dist-info/METADATA → dataproc_spark_connect-0.8.1.dist-info/METADATA

@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 0.7.5
+ Version: 0.8.1
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
@@ -12,6 +12,13 @@ Requires-Dist: packaging>=20.0
  Requires-Dist: pyspark[connect]~=3.5.1
  Requires-Dist: tqdm>=4.67
  Requires-Dist: websockets>=14.0
+ Dynamic: author
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: summary

  # Dataproc Spark Connect Client

dataproc_spark_connect-0.8.1.dist-info/RECORD (new file)

@@ -0,0 +1,12 @@
+ dataproc_spark_connect-0.8.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
+ google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
+ google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
+ google/cloud/dataproc_spark_connect/session.py,sha256=o7N68KddasHkANrNPxsq4Uij711eAaNz4oKa7jGdsaY,32017
+ google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
+ google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
+ google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
+ dataproc_spark_connect-0.8.1.dist-info/METADATA,sha256=vrvJk30s46CeD-1AZC05zaUtX4ENurK8Ee90Bsb4XEQ,3465
+ dataproc_spark_connect-0.8.1.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+ dataproc_spark_connect-0.8.1.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-0.8.1.dist-info/RECORD,,
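
Each RECORD entry has the form path,sha256=<digest>,size, where the digest is the urlsafe-base64 SHA-256 of the file with the trailing '=' padding stripped, as the wheel format specifies. A small sketch for recomputing one entry locally (the file path is only illustrative):

import base64
import hashlib

def record_digest(path: str) -> str:
    # urlsafe base64 of the SHA-256 digest, with '=' padding removed,
    # matching the encoding used in wheel RECORD files.
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Compare against the corresponding RECORD line, e.g. for session.py:
print(record_digest("google/cloud/dataproc_spark_connect/session.py"))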

dataproc_spark_connect-0.7.5.dist-info/WHEEL → dataproc_spark_connect-0.8.1.dist-info/WHEEL

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.3.0)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py2-none-any
  Tag: py3-none-any

google/cloud/dataproc_spark_connect/exceptions.py

@@ -16,7 +16,7 @@
  class DataprocSparkConnectException(Exception):
      """A custom exception class to only print the error messages.
      This would be used for exceptions where the stack trace
-     doesn't provide any additional information.h
+     doesn't provide any additional information.
      """

      def __init__(self, message):

google/cloud/dataproc_spark_connect/session.py

@@ -18,14 +18,23 @@ import json
  import logging
  import os
  import random
+ import re
  import string
  import threading
  import time
+ from typing import Any, cast, ClassVar, Dict, Optional, Union
+ import uuid
  import tqdm

  from google.api_core import retry
  from google.api_core.client_options import ClientOptions
- from google.api_core.exceptions import Aborted, FailedPrecondition, InvalidArgument, NotFound, PermissionDenied
+ from google.api_core.exceptions import (
+     Aborted,
+     FailedPrecondition,
+     InvalidArgument,
+     NotFound,
+     PermissionDenied,
+ )
  from google.api_core.future.polling import POLLING_PREDICATE
  from google.cloud.dataproc_spark_connect.client import DataprocChannelBuilder
  from google.cloud.dataproc_spark_connect.exceptions import DataprocSparkConnectException
@@ -41,13 +50,32 @@ from google.cloud.dataproc_v1 import (
  from google.cloud.dataproc_v1.types import sessions
  from pyspark.sql.connect.session import SparkSession
  from pyspark.sql.utils import to_str
- from typing import Any, cast, ClassVar, Dict, Optional

  # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)


+ def _is_valid_label_value(value: str) -> bool:
+     """
+     Validates if a string complies with Google Cloud label value format.
+     Only lowercase letters, numbers, and dashes are allowed.
+     The value must start with lowercase letter or number and end with a lowercase letter or number.
+     Maximum length is 63 characters.
+     """
+     if not value:
+         return False
+
+     # Check maximum length (63 characters for GCP label values)
+     if len(value) > 63:
+         return False
+
+     # Check if the value matches the pattern: starts and ends with alphanumeric,
+     # contains only lowercase letters, numbers, and dashes
+     pattern = r"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$"
+     return bool(re.match(pattern, value))
+
+
  class DataprocSparkSession(SparkSession):
      """The entry point to programming Spark with the Dataset and DataFrame API.

@@ -77,16 +105,6 @@ class DataprocSparkSession(SparkSession):

      class Builder(SparkSession.Builder):

-         _session_static_configs = [
-             "spark.executor.cores",
-             "spark.executor.memoryOverhead",
-             "spark.executor.memory",
-             "spark.driver.memory",
-             "spark.driver.cores",
-             "spark.eventLog.dir",
-             "spark.history.fs.logDirectory",
-         ]
-
          def __init__(self):
              self._options: Dict[str, Any] = {}
              self._channel_builder: Optional[DataprocChannelBuilder] = None
@@ -100,15 +118,6 @@ class DataprocSparkSession(SparkSession):
                  )
              )

-         def __apply_options(self, session: "SparkSession") -> None:
-             with self._lock:
-                 self._options = {
-                     key: value
-                     for key, value in self._options.items()
-                     if key not in self._session_static_configs
-                 }
-             self._apply_options(session)
-
          def projectId(self, project_id):
              self._project_id = project_id
              return self
@@ -166,7 +175,6 @@ class DataprocSparkSession(SparkSession):
              session = DataprocSparkSession(connection=self._channel_builder)

              DataprocSparkSession._set_default_and_active_session(session)
-             self.__apply_options(session)
              return session

          def __create(self) -> "DataprocSparkSession":
@@ -245,9 +253,9 @@ class DataprocSparkSession(SparkSession):
                  operation = SessionControllerClient(
                      client_options=self._client_options
                  ).create_session(session_request)
-                 print(
-                     f"Creating Dataproc Session: https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
-                 )
+                 self._display_session_link_on_creation(session_id)
+                 # TODO: Add the 'View Session Details' button once the UI changes are done.
+                 # self._display_view_session_details_button(session_id)
                  create_session_pbar_thread.start()
                  session_response: Session = operation.result(
                      polling=retry.Retry(
@@ -260,7 +268,7 @@ class DataprocSparkSession(SparkSession):
                  )
                  stop_create_session_pbar_event.set()
                  create_session_pbar_thread.join()
-                 print("Dataproc Session was successfully created")
+                 self._print_session_created_message()
                  file_path = (
                      DataprocSparkSession._get_active_session_file_path()
                  )
@@ -305,6 +313,46 @@ class DataprocSparkSession(SparkSession):
                      session_response, dataproc_config.name
                  )

+         def _display_session_link_on_creation(self, session_id):
+             session_url = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
+             plain_message = f"Creating Dataproc Session: {session_url}"
+             html_element = f"""
+             <div>
+                 <p>Creating Dataproc Spark Session<p>
+                 <p><a href="{session_url}">Dataproc Session</a></p>
+             </div>
+             """
+
+             self._output_element_or_message(plain_message, html_element)
+
+         def _print_session_created_message(self):
+             plain_message = f"Dataproc Session was successfully created"
+             html_element = f"<div><p>{plain_message}</p></div>"
+
+             self._output_element_or_message(plain_message, html_element)
+
+         def _output_element_or_message(self, plain_message, html_element):
+             """
+             Display / print the needed rich HTML element or plain text depending
+             on whether rich element is supported or not.
+
+             :param plain_message: Message to print on non-IPython or
+                 non-interactive shell
+             :param html_element: HTML element to display for interactive IPython
+                 environment
+             """
+             try:
+                 from IPython.display import display, HTML
+                 from IPython.core.interactiveshell import InteractiveShell
+
+                 if not InteractiveShell.initialized():
+                     raise DataprocSparkConnectException(
+                         "Not in an Interactive IPython Environment"
+                     )
+                 display(HTML(html_element))
+             except (ImportError, DataprocSparkConnectException):
+                 print(plain_message)
+
          def _get_exiting_active_session(
              self,
          ) -> Optional["DataprocSparkSession"]:
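
The new output helpers above share one pattern: render an HTML element when an interactive IPython shell is available, otherwise fall back to plain print. A minimal standalone sketch of that pattern (function name and messages here are illustrative, not taken from the package):

def show(plain_message: str, html_element: str) -> None:
    # Render HTML inside an interactive IPython shell, else print plain text.
    try:
        from IPython.core.interactiveshell import InteractiveShell
        from IPython.display import HTML, display

        if not InteractiveShell.initialized():
            raise RuntimeError("not running inside IPython")
        display(HTML(html_element))
    except (ImportError, RuntimeError):
        print(plain_message)

show("Dataproc Session created", "<div><p>Dataproc Session created</p></div>")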
@@ -325,6 +373,8 @@ class DataprocSparkSession(SparkSession):
                  print(
                      f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
                  )
+                 # TODO: Add the 'View Session Details' button once the UI changes are done.
+                 # self._display_view_session_details_button(s8s_session_id)
                  if session is None:
                      session = self.__create_spark_connect_session_from_s8s(
                          session_response, session_name
@@ -344,8 +394,6 @@ class DataprocSparkSession(SparkSession):
              session = self._get_exiting_active_session()
              if session is None:
                  session = self.__create()
-             if session:
-                 self.__apply_options(session)
              return session

          def _get_dataproc_config(self):
@@ -400,14 +448,22 @@ class DataprocSparkSession(SparkSession):
                      os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
                  )
              }
-             if "COLAB_NOTEBOOK_RUNTIME_ID" in os.environ:
-                 dataproc_config.labels["colab-notebook-runtime-id"] = (
-                     os.environ["COLAB_NOTEBOOK_RUNTIME_ID"]
-                 )
-             if "COLAB_NOTEBOOK_KERNEL_ID" in os.environ:
-                 dataproc_config.labels["colab-notebook-kernel-id"] = os.environ[
-                     "COLAB_NOTEBOOK_KERNEL_ID"
-                 ]
+             if "COLAB_NOTEBOOK_ID" in os.environ:
+                 colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
+                 # Extract the last part of the path, which is the ID
+                 notebook_id = os.path.basename(colab_notebook_name)
+                 if _is_valid_label_value(notebook_id):
+                     dataproc_config.labels["goog-colab-notebook-id"] = (
+                         notebook_id
+                     )
+                 else:
+                     logger.warning(
+                         f"Warning while processing notebook ID: Notebook ID '{notebook_id}' is not compliant with label value format. "
+                         f"Only lowercase letters, numbers, and dashes are allowed. "
+                         f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
+                         f"Maximum length is 63 characters. "
+                         f"Skipping notebook ID label."
+                     )
              default_datasource = os.getenv(
                  "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
              )
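
Taken together with the module-level helper added earlier, the Colab handling reads COLAB_NOTEBOOK_ID, keeps only the last path component, and attaches it as the goog-colab-notebook-id label only when it passes validation. A self-contained sketch of that flow (the environment value is made up for illustration):

import os
import re

def is_valid_label_value(value: str) -> bool:
    # Same constraints as the helper added in session.py: at most 63 chars,
    # lowercase letters, digits, and dashes, alphanumeric at both ends.
    return bool(value) and len(value) <= 63 and bool(
        re.match(r"^[a-z0-9]([a-z0-9-]*[a-z0-9])?$", value)
    )

# Hypothetical value; in Colab the runtime sets COLAB_NOTEBOOK_ID itself.
os.environ["COLAB_NOTEBOOK_ID"] = "/notebooks/projects/abc123-my-notebook"

labels = {}
notebook_id = os.path.basename(os.environ["COLAB_NOTEBOOK_ID"])
if is_valid_label_value(notebook_id):
    labels["goog-colab-notebook-id"] = notebook_id

print(labels)  # {'goog-colab-notebook-id': 'abc123-my-notebook'}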
@@ -434,6 +490,22 @@ class DataprocSparkSession(SparkSession):
              )
              return dataproc_config

+         def _display_view_session_details_button(self, session_id):
+             try:
+                 session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
+                 from IPython.core.interactiveshell import InteractiveShell
+
+                 if not InteractiveShell.initialized():
+                     return
+
+                 from google.cloud.aiplatform.utils import _ipython_utils
+
+                 _ipython_utils.display_link(
+                     "View Session Details", f"{session_url}", "dashboard"
+                 )
+             except ImportError as e:
+                 logger.debug(f"Import error: {e}")
+
          @staticmethod
          def generate_dataproc_session_id():
              timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -445,6 +517,41 @@ class DataprocSparkSession(SparkSession):
              )
              return f"sc-{timestamp}-{random_suffix}"

+     def __init__(
+         self,
+         connection: Union[str, DataprocChannelBuilder],
+         user_id: Optional[str] = None,
+     ):
+         """
+         Creates a new DataprocSparkSession for the Spark Connect interface.
+
+         Parameters
+         ----------
+         connection : str or :class:`DataprocChannelBuilder`
+             Connection string that is used to extract the connection parameters
+             and configure the GRPC connection. Or instance of ChannelBuilder /
+             DataprocChannelBuilder that creates GRPC connection.
+         user_id : str, optional
+             If not set, will default to the $USER environment. Defining the user
+             ID as part of the connection string takes precedence.
+         """
+
+         super().__init__(connection, user_id)
+
+         base_method = self.client._execute_plan_request_with_metadata
+
+         def wrapped_method(*args, **kwargs):
+             req = base_method(*args, **kwargs)
+             if not req.operation_id:
+                 req.operation_id = str(uuid.uuid4())
+                 logger.debug(
+                     f"No operation_id found. Setting operation_id: {req.operation_id}"
+                 )
+             self._display_operation_link(req.operation_id)
+             return req
+
+         self.client._execute_plan_request_with_metadata = wrapped_method
+

      def _repr_html_(self) -> str:
          if not self._active_s8s_session_id:
@@ -462,6 +569,37 @@ class DataprocSparkSession(SparkSession):
              </div>
          """

+     def _display_operation_link(self, operation_id: str):
+         assert all(
+             [
+                 operation_id is not None,
+                 self._region is not None,
+                 self._active_s8s_session_id is not None,
+                 self._project_id is not None,
+             ]
+         )
+
+         url = (
+             f"https://console.cloud.google.com/dataproc/interactive/{self._region}/"
+             f"{self._active_s8s_session_id}/sparkApplications/application;"
+             f"associatedSqlOperationId={operation_id}?project={self._project_id}"
+         )
+
+         try:
+             from IPython.display import display, HTML
+             from IPython.core.interactiveshell import InteractiveShell
+
+             if not InteractiveShell.initialized():
+                 return
+             html_element = f"""
+             <div>
+                 <p><a href="{url}">Spark UI</a> (Operation: {operation_id})</p>
+             </div>
+             """
+             display(HTML(html_element))
+         except ImportError:
+             return
+
      @staticmethod
      def _remove_stopped_session_from_file():
          file_path = DataprocSparkSession._get_active_session_file_path()

dataproc_spark_connect-0.7.5.dist-info/RECORD (removed)

@@ -1,12 +0,0 @@
- google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
- google/cloud/dataproc_spark_connect/exceptions.py,sha256=ilGyHD5M_yBQ3IC58-Y5miRGIQVJsLaNKvEGcHuk_BE,969
- google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
- google/cloud/dataproc_spark_connect/session.py,sha256=kMCZWmi_-ScJy9NO7NFrHaHDTXKxMwaCSDbdqGxEngk,26390
- google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
- google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
- google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
- dataproc_spark_connect-0.7.5.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- dataproc_spark_connect-0.7.5.dist-info/METADATA,sha256=byc2dTo3PdkmbMZHyaJ9A-WVhSQTHn1ZAX_Jqz9jmd0,3330
- dataproc_spark_connect-0.7.5.dist-info/WHEEL,sha256=OpXWERl2xLPRHTvd2ZXo_iluPEQd8uSbYkJ53NAER_Y,109
- dataproc_spark_connect-0.7.5.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
- dataproc_spark_connect-0.7.5.dist-info/RECORD,,
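
For orientation, a rough usage sketch of the builder surface these changes touch. It assumes DataprocSparkSession is exported from the package root (the diff only shows it defined in session.py) and that getOrCreate() comes from pyspark's SparkSession.Builder; a region must also be configured in practice (the code references self._region), through a builder method or environment setting not shown in this diff.

from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = (
    DataprocSparkSession.builder
    .projectId("my-gcp-project")  # hypothetical project ID
    .getOrCreate()
)

spark.range(10).show()
spark.stop()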