dataproc-spark-connect 0.8.3-py2.py3-none-any.whl → 0.9.0-py2.py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 0.8.3
+Version: 0.9.0
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -1,12 +1,13 @@
-dataproc_spark_connect-0.8.3.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+dataproc_spark_connect-0.9.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
+google/cloud/dataproc_spark_connect/environment.py,sha256=UICy9XyqAxL-cryVWx7GZPRAxoir5LKk0dtqqY_l--c,2307
 google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
 google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
-google/cloud/dataproc_spark_connect/session.py,sha256=ZWoW9-otaCJnttPt7h9W3pmhHpdbQsAOl8ypOX3fVbo,33556
+google/cloud/dataproc_spark_connect/session.py,sha256=ELj5hDhofK1967eE5YaG_LP5B80KWFQWJn5gxi9yYt0,38577
 google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
 google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
 google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
-dataproc_spark_connect-0.8.3.dist-info/METADATA,sha256=croGipnWGtSrd2NLyMCHrcVagYCk9yJ6cEOqCEAm-Qc,3465
-dataproc_spark_connect-0.8.3.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
-dataproc_spark_connect-0.8.3.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
-dataproc_spark_connect-0.8.3.dist-info/RECORD,,
+dataproc_spark_connect-0.9.0.dist-info/METADATA,sha256=1z8Ag1P_Lh9db0Rk9nGFoOu6sdeRs0UlrgtOqN_OhIQ,3465
+dataproc_spark_connect-0.9.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+dataproc_spark_connect-0.9.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+dataproc_spark_connect-0.9.0.dist-info/RECORD,,
@@ -0,0 +1,76 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Callable, Tuple, List
+
+
+def is_vscode() -> bool:
+    """True if running inside VS Code at all."""
+    return os.getenv("VSCODE_PID") is not None
+
+
+def is_jupyter() -> bool:
+    """True if running in a Jupyter environment."""
+    return os.getenv("JPY_PARENT_PID") is not None
+
+
+def is_colab_enterprise() -> bool:
+    """True if running in Colab Enterprise (Vertex AI)."""
+    return os.getenv("VERTEX_PRODUCT") == "COLAB_ENTERPRISE"
+
+
+def is_colab() -> bool:
+    """True if running in Google Colab."""
+    return os.getenv("COLAB_RELEASE_TAG") is not None
+
+
+def is_workbench() -> bool:
+    """True if running in AI Workbench (managed Jupyter)."""
+    return os.getenv("VERTEX_PRODUCT") == "WORKBENCH_INSTANCE"
+
+
+def is_jetbrains_ide() -> bool:
+    """True if running inside any JetBrains IDE."""
+    return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()
+
+
+def get_client_environment_label() -> str:
+    """
+    Map current environment to a standardized client label.
+
+    Priority order:
+    1. Colab Enterprise ("colab-enterprise")
+    2. Colab ("colab")
+    3. Workbench ("workbench-jupyter")
+    4. VS Code ("vscode")
+    5. JetBrains IDE ("jetbrains")
+    6. Jupyter ("jupyter")
+    7. Unknown ("unknown")
+    """
+    checks: List[Tuple[Callable[[], bool], str]] = [
+        (is_colab_enterprise, "colab-enterprise"),
+        (is_colab, "colab"),
+        (is_workbench, "workbench-jupyter"),
+        (is_vscode, "vscode"),
+        (is_jetbrains_ide, "jetbrains"),
+        (is_jupyter, "jupyter"),
+    ]
+    for detector, label in checks:
+        try:
+            if detector():
+                return label
+        except Exception:
+            pass
+    return "unknown"
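
Since detection is driven entirely by environment variables, the new module can be exercised outside any notebook. A minimal sketch (the variable names come straight from `environment.py` above; the test harness itself is illustrative):

```python
import os
from unittest import mock

from google.cloud.dataproc_spark_connect import environment

# Colab Enterprise outranks plain Jupyter because it is checked first.
with mock.patch.dict(
    os.environ, {"VERTEX_PRODUCT": "COLAB_ENTERPRISE", "JPY_PARENT_PID": "42"}
):
    assert environment.get_client_environment_label() == "colab-enterprise"

# With no recognized variables present, the label falls back to "unknown".
with mock.patch.dict(os.environ, {}, clear=True):
    assert environment.get_client_environment_label() == "unknown"
```
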
@@ -49,6 +49,7 @@ from google.cloud.dataproc_v1 import (
     TerminateSessionRequest,
 )
 from google.cloud.dataproc_v1.types import sessions
+from google.cloud.dataproc_spark_connect import environment
 from pyspark.sql.connect.session import SparkSession
 from pyspark.sql.utils import to_str
 
@@ -56,6 +57,12 @@ from pyspark.sql.utils import to_str
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# System labels that should not be overridden by user
+SYSTEM_LABELS = {
+    "dataproc-session-client",
+    "goog-colab-notebook-id",
+}
+
 
 def _is_valid_label_value(value: str) -> bool:
     """
@@ -132,11 +139,95 @@ class DataprocSparkSession(SparkSession):
         return self
 
     def dataprocSessionConfig(self, dataproc_config: Session):
+        self._dataproc_config = dataproc_config
+        for k, v in dataproc_config.runtime_config.properties.items():
+            self._options[cast(str, k)] = to_str(v)
+        return self
+
+    @property
+    def dataproc_config(self):
         with self._lock:
-            self._dataproc_config = dataproc_config
-            for k, v in dataproc_config.runtime_config.properties.items():
-                self._options[cast(str, k)] = to_str(v)
-            return self
+            self._dataproc_config = self._dataproc_config or Session()
+            return self._dataproc_config
+
+    def runtimeVersion(self, version: str):
+        self.dataproc_config.runtime_config.version = version
+        return self
+
+    def serviceAccount(self, account: str):
+        self.dataproc_config.environment_config.execution_config.service_account = (
+            account
+        )
+        # Automatically set auth type to SERVICE_ACCOUNT when service account is provided
+        # This overrides any env var setting to simplify user experience
+        self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
+            AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
+        )
+        return self
+
+    def authType(
+        self, auth_type: "AuthenticationConfig.AuthenticationType"
+    ):
+        self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
+            auth_type
+        )
+        return self
+
+    def subnetwork(self, subnet: str):
+        self.dataproc_config.environment_config.execution_config.subnetwork_uri = (
+            subnet
+        )
+        return self
+
+    def ttl(self, duration: datetime.timedelta):
+        """Set the time-to-live (TTL) for the session using a timedelta object."""
+        self.dataproc_config.environment_config.execution_config.ttl = {
+            "seconds": int(duration.total_seconds())
+        }
+        return self
+
+    def ttlSeconds(self, seconds: int):
+        """Set the time-to-live (TTL) for the session in seconds."""
+        self.dataproc_config.environment_config.execution_config.ttl = {
+            "seconds": seconds
+        }
+        return self
+
+    def idleTtl(self, duration: datetime.timedelta):
+        """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
+        self.dataproc_config.environment_config.execution_config.idle_ttl = {
+            "seconds": int(duration.total_seconds())
+        }
+        return self
+
+    def idleTtlSeconds(self, seconds: int):
+        """Set the idle time-to-live (idle TTL) for the session in seconds."""
+        self.dataproc_config.environment_config.execution_config.idle_ttl = {
+            "seconds": seconds
+        }
+        return self
+
+    def sessionTemplate(self, template: str):
+        self.dataproc_config.session_template = template
+        return self
+
+    def label(self, key: str, value: str):
+        """Add a single label to the session."""
+        return self.labels({key: value})
+
+    def labels(self, labels: Dict[str, str]):
+        # Filter out system labels and warn user
+        filtered_labels = {}
+        for key, value in labels.items():
+            if key in SYSTEM_LABELS:
+                logger.warning(
+                    f"Label '{key}' is a system label and cannot be overridden by user. Ignoring."
+                )
+            else:
+                filtered_labels[key] = value
+
+        self.dataproc_config.labels.update(filtered_labels)
+        return self
 
     def remote(self, url: Optional[str] = None) -> "SparkSession.Builder":
         if url:
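
Together, these methods form a fluent configuration API, so common settings no longer require hand-building a `Session` proto. A sketch of typical usage (the project, subnetwork, and service-account values are placeholders, and the `builder`/`getOrCreate` entry points are assumed from the package's existing API):

```python
import datetime

from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = (
    DataprocSparkSession.builder
    .runtimeVersion("2.3")
    .serviceAccount("my-sa@my-project.iam.gserviceaccount.com")  # also forces SERVICE_ACCOUNT auth
    .subnetwork("projects/my-project/regions/us-central1/subnetworks/default")
    .ttl(datetime.timedelta(hours=2))  # hard cap on session lifetime
    .idleTtlSeconds(900)               # terminate after 15 idle minutes
    .label("team", "data-eng")
    .getOrCreate()
)
```
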
@@ -258,8 +349,7 @@ class DataprocSparkSession(SparkSession):
                 client_options=self._client_options
             ).create_session(session_request)
             self._display_session_link_on_creation(session_id)
-            # TODO: Add the 'View Session Details' button once the UI changes are done.
-            # self._display_view_session_details_button(session_id)
+            self._display_view_session_details_button(session_id)
             create_session_pbar_thread.start()
             session_response: Session = operation.result(
                 polling=retry.Retry(
@@ -377,8 +467,7 @@ class DataprocSparkSession(SparkSession):
             print(
                 f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
             )
-            # TODO: Add the 'View Session Details' button once the UI changes are done.
-            # self._display_view_session_details_button(s8s_session_id)
+            self._display_view_session_details_button(s8s_session_id)
             if session is None:
                 session = self.__create_spark_connect_session_from_s8s(
                     session_response, session_name
@@ -401,11 +490,10 @@ class DataprocSparkSession(SparkSession):
             return session
 
     def _get_dataproc_config(self):
-        dataproc_config = Session()
-        if self._dataproc_config:
-            dataproc_config = self._dataproc_config
-        for k, v in self._options.items():
-            dataproc_config.runtime_config.properties[k] = v
+        # Use the property to ensure we always have a config
+        dataproc_config = self.dataproc_config
+        for k, v in self._options.items():
+            dataproc_config.runtime_config.properties[k] = v
         dataproc_config.spark_connect_session = (
             sessions.SparkConnectConfig()
         )
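
This refactor works because the new `dataproc_config` property lazily creates the `Session` under the builder lock, so every caller shares one config object. The pattern in isolation (a generic sketch, not the package's exact class):

```python
import threading


class LazyConfigHolder:
    def __init__(self):
        self._lock = threading.Lock()
        self._config = None

    @property
    def config(self):
        # Create on first access; the lock keeps concurrent callers
        # from racing to create two different configs.
        with self._lock:
            if self._config is None:
                self._config = {}  # stands in for dataproc_v1.Session()
            return self._config
```
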
@@ -413,6 +501,11 @@ class DataprocSparkSession(SparkSession):
             dataproc_config.runtime_config.version = (
                 DataprocSparkSession._DEFAULT_RUNTIME_VERSION
             )
+
+        # Check for Python version mismatch with runtime for UDF compatibility
+        self._check_python_version_compatibility(
+            dataproc_config.runtime_config.version
+        )
         if (
             not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
             and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
@@ -452,6 +545,10 @@ class DataprocSparkSession(SparkSession):
                     os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
                 )
             }
+        client_environment = environment.get_client_environment_label()
+        dataproc_config.labels["dataproc-session-client"] = (
+            client_environment
+        )
         if "COLAB_NOTEBOOK_ID" in os.environ:
            colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
            # Extract the last part of the path, which is the ID
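
Net effect on labels: every new session now carries a `dataproc-session-client` label, and in Colab the notebook ID is attached as well. An illustrative, hypothetical result for a Colab Enterprise session:

```python
# Hypothetical labels on the created session, assuming
# COLAB_NOTEBOOK_ID="projects/p/locations/l/notebooks/abc123":
{
    "dataproc-session-client": "colab-enterprise",
    "goog-colab-notebook-id": "abc123",  # last path segment, if it passes label validation
}
```
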
@@ -466,7 +563,7 @@ class DataprocSparkSession(SparkSession):
                     f"Only lowercase letters, numbers, and dashes are allowed. "
                     f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
                     f"Maximum length is 63 characters. "
-                    f"Skipping notebook ID label."
+                    f"Ignoring notebook ID label."
                 )
         default_datasource = os.getenv(
             "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
@@ -494,6 +591,32 @@ class DataprocSparkSession(SparkSession):
             )
         return dataproc_config
 
+    def _check_python_version_compatibility(self, runtime_version):
+        """Check if client Python version matches server Python version for UDF compatibility."""
+        import sys
+        import warnings
+
+        # Runtime version to server Python version mapping
+        RUNTIME_PYTHON_MAP = {
+            "1.2": (3, 12),
+            "2.2": (3, 12),
+            "2.3": (3, 11),
+        }
+
+        client_python = sys.version_info[:2]  # (major, minor)
+
+        if runtime_version in RUNTIME_PYTHON_MAP:
+            server_python = RUNTIME_PYTHON_MAP[runtime_version]
+
+            if client_python != server_python:
+                warnings.warn(
+                    f"Python version mismatch detected: Client is using Python {client_python[0]}.{client_python[1]}, "
+                    f"but Dataproc runtime {runtime_version} uses Python {server_python[0]}.{server_python[1]}. "
+                    f"This mismatch may cause issues with Python UDF (User Defined Function) compatibility. "
+                    f"Consider using Python {server_python[0]}.{server_python[1]} for optimal UDF execution.",
+                    stacklevel=3,
+                )
+
     def _display_view_session_details_button(self, session_id):
         try:
             session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
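
The version check is advisory: it warns rather than failing, and only for runtime versions it knows about. Its decision logic, reduced to a standalone sketch (the version map is copied from the method above; the function name here is illustrative):

```python
import sys
import warnings

RUNTIME_PYTHON_MAP = {"1.2": (3, 12), "2.2": (3, 12), "2.3": (3, 11)}


def check_udf_compat(runtime_version: str) -> None:
    client = sys.version_info[:2]
    server = RUNTIME_PYTHON_MAP.get(runtime_version)
    if server is not None and client != server:
        warnings.warn(
            f"Client Python {client[0]}.{client[1]} differs from runtime "
            f"{runtime_version} Python {server[0]}.{server[1]}; UDFs may break."
        )


check_udf_compat("2.3")  # warns on e.g. a Python 3.12 client, since runtime 2.3 ships 3.11
```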