dataproc-spark-connect 0.8.2__py2.py3-none-any.whl → 0.9.0__py2.py3-none-any.whl

This diff compares the contents of package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
--- dataproc_spark_connect-0.8.2.dist-info/METADATA
+++ dataproc_spark_connect-0.9.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 0.8.2
+ Version: 0.9.0
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
--- dataproc_spark_connect-0.8.2.dist-info/RECORD
+++ dataproc_spark_connect-0.9.0.dist-info/RECORD
@@ -1,12 +1,13 @@
- dataproc_spark_connect-0.8.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ dataproc_spark_connect-0.9.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
  google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
+ google/cloud/dataproc_spark_connect/environment.py,sha256=UICy9XyqAxL-cryVWx7GZPRAxoir5LKk0dtqqY_l--c,2307
  google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
  google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
- google/cloud/dataproc_spark_connect/session.py,sha256=Sr9ISKIJ6U5dJ13FzKQ8UC_pGeFXbchc7X3d9U5Hj48,32144
+ google/cloud/dataproc_spark_connect/session.py,sha256=ELj5hDhofK1967eE5YaG_LP5B80KWFQWJn5gxi9yYt0,38577
  google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
  google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
  google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
- dataproc_spark_connect-0.8.2.dist-info/METADATA,sha256=2PCMrKtuuab4232elYKFHiTdaJcqiM4N38ceD_AhS-E,3465
- dataproc_spark_connect-0.8.2.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
- dataproc_spark_connect-0.8.2.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
- dataproc_spark_connect-0.8.2.dist-info/RECORD,,
+ dataproc_spark_connect-0.9.0.dist-info/METADATA,sha256=1z8Ag1P_Lh9db0Rk9nGFoOu6sdeRs0UlrgtOqN_OhIQ,3465
+ dataproc_spark_connect-0.9.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+ dataproc_spark_connect-0.9.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-0.9.0.dist-info/RECORD,,
--- /dev/null
+++ google/cloud/dataproc_spark_connect/environment.py
@@ -0,0 +1,76 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ from typing import Callable, Tuple, List
+
+
+ def is_vscode() -> bool:
+     """True if running inside VS Code at all."""
+     return os.getenv("VSCODE_PID") is not None
+
+
+ def is_jupyter() -> bool:
+     """True if running in a Jupyter environment."""
+     return os.getenv("JPY_PARENT_PID") is not None
+
+
+ def is_colab_enterprise() -> bool:
+     """True if running in Colab Enterprise (Vertex AI)."""
+     return os.getenv("VERTEX_PRODUCT") == "COLAB_ENTERPRISE"
+
+
+ def is_colab() -> bool:
+     """True if running in Google Colab."""
+     return os.getenv("COLAB_RELEASE_TAG") is not None
+
+
+ def is_workbench() -> bool:
+     """True if running in AI Workbench (managed Jupyter)."""
+     return os.getenv("VERTEX_PRODUCT") == "WORKBENCH_INSTANCE"
+
+
+ def is_jetbrains_ide() -> bool:
+     """True if running inside any JetBrains IDE."""
+     return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()
+
+
+ def get_client_environment_label() -> str:
+     """
+     Map current environment to a standardized client label.
+
+     Priority order:
+     1. Colab Enterprise ("colab-enterprise")
+     2. Colab ("colab")
+     3. Workbench ("workbench-jupyter")
+     4. VS Code ("vscode")
+     5. JetBrains IDE ("jetbrains")
+     6. Jupyter ("jupyter")
+     7. Unknown ("unknown")
+     """
+     checks: List[Tuple[Callable[[], bool], str]] = [
+         (is_colab_enterprise, "colab-enterprise"),
+         (is_colab, "colab"),
+         (is_workbench, "workbench-jupyter"),
+         (is_vscode, "vscode"),
+         (is_jetbrains_ide, "jetbrains"),
+         (is_jupyter, "jupyter"),
+     ]
+     for detector, label in checks:
+         try:
+             if detector():
+                 return label
+         except Exception:
+             pass
+     return "unknown"
--- google/cloud/dataproc_spark_connect/session.py
+++ google/cloud/dataproc_spark_connect/session.py
@@ -22,9 +22,10 @@ import re
  import string
  import threading
  import time
- from typing import Any, cast, ClassVar, Dict, Optional, Union
  import uuid
  import tqdm
+ from types import MethodType
+ from typing import Any, cast, ClassVar, Dict, Optional, Union

  from google.api_core import retry
  from google.api_core.client_options import ClientOptions
@@ -48,6 +49,7 @@ from google.cloud.dataproc_v1 import (
      TerminateSessionRequest,
  )
  from google.cloud.dataproc_v1.types import sessions
+ from google.cloud.dataproc_spark_connect import environment
  from pyspark.sql.connect.session import SparkSession
  from pyspark.sql.utils import to_str

@@ -55,6 +57,12 @@ from pyspark.sql.utils import to_str
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # System labels that should not be overridden by user
+ SYSTEM_LABELS = {
+     "dataproc-session-client",
+     "goog-colab-notebook-id",
+ }
+

  def _is_valid_label_value(value: str) -> bool:
      """
@@ -131,11 +139,95 @@ class DataprocSparkSession(SparkSession):
          return self

      def dataprocSessionConfig(self, dataproc_config: Session):
+         self._dataproc_config = dataproc_config
+         for k, v in dataproc_config.runtime_config.properties.items():
+             self._options[cast(str, k)] = to_str(v)
+         return self
+
+     @property
+     def dataproc_config(self):
          with self._lock:
-             self._dataproc_config = dataproc_config
-             for k, v in dataproc_config.runtime_config.properties.items():
-                 self._options[cast(str, k)] = to_str(v)
-             return self
+             self._dataproc_config = self._dataproc_config or Session()
+             return self._dataproc_config
+
+     def runtimeVersion(self, version: str):
+         self.dataproc_config.runtime_config.version = version
+         return self
+
+     def serviceAccount(self, account: str):
+         self.dataproc_config.environment_config.execution_config.service_account = (
+             account
+         )
+         # Automatically set auth type to SERVICE_ACCOUNT when service account is provided
+         # This overrides any env var setting to simplify user experience
+         self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
+             AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
+         )
+         return self
+
+     def authType(
+         self, auth_type: "AuthenticationConfig.AuthenticationType"
+     ):
+         self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
+             auth_type
+         )
+         return self
+
+     def subnetwork(self, subnet: str):
+         self.dataproc_config.environment_config.execution_config.subnetwork_uri = (
+             subnet
+         )
+         return self
+
+     def ttl(self, duration: datetime.timedelta):
+         """Set the time-to-live (TTL) for the session using a timedelta object."""
+         self.dataproc_config.environment_config.execution_config.ttl = {
+             "seconds": int(duration.total_seconds())
+         }
+         return self
+
+     def ttlSeconds(self, seconds: int):
+         """Set the time-to-live (TTL) for the session in seconds."""
+         self.dataproc_config.environment_config.execution_config.ttl = {
+             "seconds": seconds
+         }
+         return self
+
+     def idleTtl(self, duration: datetime.timedelta):
+         """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
+         self.dataproc_config.environment_config.execution_config.idle_ttl = {
+             "seconds": int(duration.total_seconds())
+         }
+         return self
+
+     def idleTtlSeconds(self, seconds: int):
+         """Set the idle time-to-live (idle TTL) for the session in seconds."""
+         self.dataproc_config.environment_config.execution_config.idle_ttl = {
+             "seconds": seconds
+         }
+         return self
+
+     def sessionTemplate(self, template: str):
+         self.dataproc_config.session_template = template
+         return self
+
+     def label(self, key: str, value: str):
+         """Add a single label to the session."""
+         return self.labels({key: value})
+
+     def labels(self, labels: Dict[str, str]):
+         # Filter out system labels and warn user
+         filtered_labels = {}
+         for key, value in labels.items():
+             if key in SYSTEM_LABELS:
+                 logger.warning(
+                     f"Label '{key}' is a system label and cannot be overridden by user. Ignoring."
+                 )
+             else:
+                 filtered_labels[key] = value
+
+         self.dataproc_config.labels.update(filtered_labels)
+         return self

      def remote(self, url: Optional[str] = None) -> "SparkSession.Builder":
          if url:
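Taken together, these additions give the builder a fluent surface for settings that previously required constructing a full Session proto and passing it to dataprocSessionConfig. A hedged usage sketch follows; the project, subnetwork, and service-account values are placeholders, it assumes project and region are supplied through the library's usual configuration, and getOrCreate() is the standard SparkSession.Builder entry point rather than anything added in this release:

```python
import datetime

from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Illustrative sketch only: all identifiers below are placeholders.
spark = (
    DataprocSparkSession.builder
    .runtimeVersion("2.3")
    .serviceAccount("sa@my-project.iam.gserviceaccount.com")  # also switches auth type to SERVICE_ACCOUNT
    .subnetwork("projects/my-project/regions/us-central1/subnetworks/default")
    .ttl(datetime.timedelta(hours=2))
    .idleTtlSeconds(900)
    .label("team", "data-eng")                        # user label, passed through
    .labels({"dataproc-session-client": "spoofed"})   # system label, dropped with a warning
    .getOrCreate()
)
```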
@@ -257,8 +349,7 @@ class DataprocSparkSession(SparkSession):
                  client_options=self._client_options
              ).create_session(session_request)
              self._display_session_link_on_creation(session_id)
-             # TODO: Add the 'View Session Details' button once the UI changes are done.
-             # self._display_view_session_details_button(session_id)
+             self._display_view_session_details_button(session_id)
              create_session_pbar_thread.start()
              session_response: Session = operation.result(
                  polling=retry.Retry(
@@ -376,8 +467,7 @@ class DataprocSparkSession(SparkSession):
              print(
                  f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
              )
-             # TODO: Add the 'View Session Details' button once the UI changes are done.
-             # self._display_view_session_details_button(s8s_session_id)
+             self._display_view_session_details_button(s8s_session_id)
          if session is None:
              session = self.__create_spark_connect_session_from_s8s(
                  session_response, session_name
@@ -400,11 +490,10 @@ class DataprocSparkSession(SparkSession):
          return session

      def _get_dataproc_config(self):
-         dataproc_config = Session()
-         if self._dataproc_config:
-             dataproc_config = self._dataproc_config
-         for k, v in self._options.items():
-             dataproc_config.runtime_config.properties[k] = v
+         # Use the property to ensure we always have a config
+         dataproc_config = self.dataproc_config
+         for k, v in self._options.items():
+             dataproc_config.runtime_config.properties[k] = v
          dataproc_config.spark_connect_session = (
              sessions.SparkConnectConfig()
          )
@@ -412,6 +501,11 @@ class DataprocSparkSession(SparkSession):
              dataproc_config.runtime_config.version = (
                  DataprocSparkSession._DEFAULT_RUNTIME_VERSION
              )
+
+         # Check for Python version mismatch with runtime for UDF compatibility
+         self._check_python_version_compatibility(
+             dataproc_config.runtime_config.version
+         )
          if (
              not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
              and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
@@ -451,6 +545,10 @@ class DataprocSparkSession(SparkSession):
                      os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
                  )
              }
+         client_environment = environment.get_client_environment_label()
+         dataproc_config.labels["dataproc-session-client"] = (
+             client_environment
+         )
          if "COLAB_NOTEBOOK_ID" in os.environ:
              colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
              # Extract the last part of the path, which is the ID
@@ -465,7 +563,7 @@ class DataprocSparkSession(SparkSession):
                      f"Only lowercase letters, numbers, and dashes are allowed. "
                      f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
                      f"Maximum length is 63 characters. "
-                     f"Skipping notebook ID label."
+                     f"Ignoring notebook ID label."
                  )
          default_datasource = os.getenv(
              "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
@@ -493,6 +591,32 @@ class DataprocSparkSession(SparkSession):
          )
          return dataproc_config

+     def _check_python_version_compatibility(self, runtime_version):
+         """Check if client Python version matches server Python version for UDF compatibility."""
+         import sys
+         import warnings
+
+         # Runtime version to server Python version mapping
+         RUNTIME_PYTHON_MAP = {
+             "1.2": (3, 12),
+             "2.2": (3, 12),
+             "2.3": (3, 11),
+         }
+
+         client_python = sys.version_info[:2]  # (major, minor)
+
+         if runtime_version in RUNTIME_PYTHON_MAP:
+             server_python = RUNTIME_PYTHON_MAP[runtime_version]
+
+             if client_python != server_python:
+                 warnings.warn(
+                     f"Python version mismatch detected: Client is using Python {client_python[0]}.{client_python[1]}, "
+                     f"but Dataproc runtime {runtime_version} uses Python {server_python[0]}.{server_python[1]}. "
+                     f"This mismatch may cause issues with Python UDF (User Defined Function) compatibility. "
+                     f"Consider using Python {server_python[0]}.{server_python[1]} for optimal UDF execution.",
+                     stacklevel=3,
+                 )
+
      def _display_view_session_details_button(self, session_id):
          try:
              session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
@@ -541,19 +665,57 @@ class DataprocSparkSession(SparkSession):

          super().__init__(connection, user_id)

-         base_method = self.client._execute_plan_request_with_metadata
+         execute_plan_request_base_method = (
+             self.client._execute_plan_request_with_metadata
+         )
+         execute_base_method = self.client._execute
+         execute_and_fetch_as_iterator_base_method = (
+             self.client._execute_and_fetch_as_iterator
+         )

-         def wrapped_method(*args, **kwargs):
-             req = base_method(*args, **kwargs)
+         def execute_plan_request_wrapped_method(*args, **kwargs):
+             req = execute_plan_request_base_method(*args, **kwargs)
              if not req.operation_id:
                  req.operation_id = str(uuid.uuid4())
                  logger.debug(
                      f"No operation_id found. Setting operation_id: {req.operation_id}"
                  )
-             self._display_operation_link(req.operation_id)
              return req

-         self.client._execute_plan_request_with_metadata = wrapped_method
+         self.client._execute_plan_request_with_metadata = (
+             execute_plan_request_wrapped_method
+         )
+
+         def execute_wrapped_method(client_self, req, *args, **kwargs):
+             if not self._sql_lazy_transformation(req):
+                 self._display_operation_link(req.operation_id)
+             execute_base_method(req, *args, **kwargs)
+
+         self.client._execute = MethodType(execute_wrapped_method, self.client)
+
+         def execute_and_fetch_as_iterator_wrapped_method(
+             client_self, req, *args, **kwargs
+         ):
+             if not self._sql_lazy_transformation(req):
+                 self._display_operation_link(req.operation_id)
+             return execute_and_fetch_as_iterator_base_method(
+                 req, *args, **kwargs
+             )
+
+         self.client._execute_and_fetch_as_iterator = MethodType(
+             execute_and_fetch_as_iterator_wrapped_method, self.client
+         )
+
+     @staticmethod
+     def _sql_lazy_transformation(req):
+         # Select SQL command
+         if req.plan and req.plan.command and req.plan.command.sql_command:
+             return (
+                 "select"
+                 in req.plan.command.sql_command.sql.strip().lower().split()
+             )
+
+         return False

      def _repr_html_(self) -> str:
          if not self._active_s8s_session_id:
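The net effect of the rewrapped client methods is that the operation link is only rendered for commands that do real work; _sql_lazy_transformation suppresses it when the SQL text contains a bare select token. Here is a hedged illustration of just that string heuristic, with the request plumbing omitted:

```python
def looks_lazy(sql: str) -> bool:
    # Mirrors the token test in _sql_lazy_transformation: any whitespace-separated
    # token equal to "select" marks the SQL command as a lazy transformation.
    return "select" in sql.strip().lower().split()

print(looks_lazy("SELECT * FROM sales"))           # True  -> operation link suppressed
print(looks_lazy("select id from t where x > 1"))  # True
print(looks_lazy("CREATE TABLE t AS SELECT 1"))    # True  ("select" is still a token)
print(looks_lazy("SHOW TABLES"))                   # False -> operation link displayed
```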
@@ -596,7 +758,7 @@ class DataprocSparkSession(SparkSession):
              return
          html_element = f"""
          <div>
-             <p><a href="{url}">Spark UI</a> (Operation: {operation_id})</p>
+             <p><a href="{url}">Spark Query</a> (Operation: {operation_id})</p>
          </div>
          """
          display(HTML(html_element))