dataproc-spark-connect 0.8.3__tar.gz → 0.9.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/PKG-INFO +1 -1
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/dataproc_spark_connect.egg-info/PKG-INFO +1 -1
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/dataproc_spark_connect.egg-info/SOURCES.txt +1 -0
- dataproc_spark_connect-0.9.0/google/cloud/dataproc_spark_connect/environment.py +76 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/google/cloud/dataproc_spark_connect/session.py +137 -14
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/setup.py +1 -1
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/LICENSE +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/README.md +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/dataproc_spark_connect.egg-info/requires.txt +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/google/cloud/dataproc_spark_connect/client/core.py +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/google/cloud/dataproc_spark_connect/exceptions.py +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/pyproject.toml +0 -0
- {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-0.9.0}/setup.cfg +0 -0
dataproc_spark_connect.egg-info/SOURCES.txt
@@ -9,6 +9,7 @@ dataproc_spark_connect.egg-info/dependency_links.txt
 dataproc_spark_connect.egg-info/requires.txt
 dataproc_spark_connect.egg-info/top_level.txt
 google/cloud/dataproc_spark_connect/__init__.py
+google/cloud/dataproc_spark_connect/environment.py
 google/cloud/dataproc_spark_connect/exceptions.py
 google/cloud/dataproc_spark_connect/pypi_artifacts.py
 google/cloud/dataproc_spark_connect/session.py
google/cloud/dataproc_spark_connect/environment.py (new file)
@@ -0,0 +1,76 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Callable, Tuple, List
+
+
+def is_vscode() -> bool:
+    """True if running inside VS Code at all."""
+    return os.getenv("VSCODE_PID") is not None
+
+
+def is_jupyter() -> bool:
+    """True if running in a Jupyter environment."""
+    return os.getenv("JPY_PARENT_PID") is not None
+
+
+def is_colab_enterprise() -> bool:
+    """True if running in Colab Enterprise (Vertex AI)."""
+    return os.getenv("VERTEX_PRODUCT") == "COLAB_ENTERPRISE"
+
+
+def is_colab() -> bool:
+    """True if running in Google Colab."""
+    return os.getenv("COLAB_RELEASE_TAG") is not None
+
+
+def is_workbench() -> bool:
+    """True if running in AI Workbench (managed Jupyter)."""
+    return os.getenv("VERTEX_PRODUCT") == "WORKBENCH_INSTANCE"
+
+
+def is_jetbrains_ide() -> bool:
+    """True if running inside any JetBrains IDE."""
+    return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()
+
+
+def get_client_environment_label() -> str:
+    """
+    Map current environment to a standardized client label.
+
+    Priority order:
+    1. Colab Enterprise ("colab-enterprise")
+    2. Colab ("colab")
+    3. Workbench ("workbench-jupyter")
+    4. VS Code ("vscode")
+    5. JetBrains IDE ("jetbrains")
+    6. Jupyter ("jupyter")
+    7. Unknown ("unknown")
+    """
+    checks: List[Tuple[Callable[[], bool], str]] = [
+        (is_colab_enterprise, "colab-enterprise"),
+        (is_colab, "colab"),
+        (is_workbench, "workbench-jupyter"),
+        (is_vscode, "vscode"),
+        (is_jetbrains_ide, "jetbrains"),
+        (is_jupyter, "jupyter"),
+    ]
+    for detector, label in checks:
+        try:
+            if detector():
+                return label
+        except Exception:
+            pass
+    return "unknown"
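Since every detector in the new module is a plain environment-variable probe, the label logic can be exercised without any Google Cloud setup. A minimal sketch (the simulated variable values are taken from the detectors above; it assumes the package is installed):

    import os
    from google.cloud.dataproc_spark_connect import environment

    # Simulate Colab Enterprise via the variable probed by is_colab_enterprise().
    os.environ["VERTEX_PRODUCT"] = "COLAB_ENTERPRISE"
    print(environment.get_client_environment_label())  # -> "colab-enterprise"

    # Detectors run in priority order, so adding a Jupyter marker
    # does not change the result.
    os.environ["JPY_PARENT_PID"] = "1234"
    print(environment.get_client_environment_label())  # -> "colab-enterprise"

Note that each detector call is wrapped in a try/except, so a failing probe degrades to "unknown" rather than breaking session creation.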
google/cloud/dataproc_spark_connect/session.py
@@ -49,6 +49,7 @@ from google.cloud.dataproc_v1 import (
     TerminateSessionRequest,
 )
 from google.cloud.dataproc_v1.types import sessions
+from google.cloud.dataproc_spark_connect import environment
 from pyspark.sql.connect.session import SparkSession
 from pyspark.sql.utils import to_str
 
@@ -56,6 +57,12 @@ from pyspark.sql.utils import to_str
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# System labels that should not be overridden by user
+SYSTEM_LABELS = {
+    "dataproc-session-client",
+    "goog-colab-notebook-id",
+}
+
 
 def _is_valid_label_value(value: str) -> bool:
     """
@@ -132,11 +139,95 @@ class DataprocSparkSession(SparkSession):
             return self
 
         def dataprocSessionConfig(self, dataproc_config: Session):
+            self._dataproc_config = dataproc_config
+            for k, v in dataproc_config.runtime_config.properties.items():
+                self._options[cast(str, k)] = to_str(v)
+            return self
+
+        @property
+        def dataproc_config(self):
             with self._lock:
-                self._dataproc_config = dataproc_config
-                for k, v in dataproc_config.runtime_config.properties.items():
-                    self._options[cast(str, k)] = to_str(v)
-                return self
+                self._dataproc_config = self._dataproc_config or Session()
+                return self._dataproc_config
+
+        def runtimeVersion(self, version: str):
+            self.dataproc_config.runtime_config.version = version
+            return self
+
+        def serviceAccount(self, account: str):
+            self.dataproc_config.environment_config.execution_config.service_account = (
+                account
+            )
+            # Automatically set auth type to SERVICE_ACCOUNT when service account is provided
+            # This overrides any env var setting to simplify user experience
+            self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
+                AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
+            )
+            return self
+
+        def authType(
+            self, auth_type: "AuthenticationConfig.AuthenticationType"
+        ):
+            self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
+                auth_type
+            )
+            return self
+
+        def subnetwork(self, subnet: str):
+            self.dataproc_config.environment_config.execution_config.subnetwork_uri = (
+                subnet
+            )
+            return self
+
+        def ttl(self, duration: datetime.timedelta):
+            """Set the time-to-live (TTL) for the session using a timedelta object."""
+            self.dataproc_config.environment_config.execution_config.ttl = {
+                "seconds": int(duration.total_seconds())
+            }
+            return self
+
+        def ttlSeconds(self, seconds: int):
+            """Set the time-to-live (TTL) for the session in seconds."""
+            self.dataproc_config.environment_config.execution_config.ttl = {
+                "seconds": seconds
+            }
+            return self
+
+        def idleTtl(self, duration: datetime.timedelta):
+            """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
+            self.dataproc_config.environment_config.execution_config.idle_ttl = {
+                "seconds": int(duration.total_seconds())
+            }
+            return self
+
+        def idleTtlSeconds(self, seconds: int):
+            """Set the idle time-to-live (idle TTL) for the session in seconds."""
+            self.dataproc_config.environment_config.execution_config.idle_ttl = {
+                "seconds": seconds
+            }
+            return self
+
+        def sessionTemplate(self, template: str):
+            self.dataproc_config.session_template = template
+            return self
+
+        def label(self, key: str, value: str):
+            """Add a single label to the session."""
+            return self.labels({key: value})
+
+        def labels(self, labels: Dict[str, str]):
+            # Filter out system labels and warn user
+            filtered_labels = {}
+            for key, value in labels.items():
+                if key in SYSTEM_LABELS:
+                    logger.warning(
+                        f"Label '{key}' is a system label and cannot be overridden by user. Ignoring."
+                    )
+                else:
+                    filtered_labels[key] = value
+
+            self.dataproc_config.labels.update(filtered_labels)
+            return self
 
         def remote(self, url: Optional[str] = None) -> "SparkSession.Builder":
             if url:
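Taken together, these methods give the builder a fluent configuration surface. A hypothetical usage sketch (the `builder` entry point follows the usual SparkSession pattern; the runtime version, service account, and label values are placeholders, not from this diff):

    import datetime
    from google.cloud.dataproc_spark_connect import DataprocSparkSession

    spark = (
        DataprocSparkSession.builder
        .runtimeVersion("2.3")
        # serviceAccount() also switches the auth type to SERVICE_ACCOUNT.
        .serviceAccount("sa@example-project.iam.gserviceaccount.com")
        .ttl(datetime.timedelta(hours=2))
        .idleTtlSeconds(900)
        # "dataproc-session-client" is in SYSTEM_LABELS, so it is logged
        # as a warning and dropped; "team" is kept.
        .labels({"team": "analytics", "dataproc-session-client": "x"})
        .getOrCreate()
    )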
@@ -258,8 +349,7 @@ class DataprocSparkSession(SparkSession):
                 client_options=self._client_options
             ).create_session(session_request)
             self._display_session_link_on_creation(session_id)
-
-            # self._display_view_session_details_button(session_id)
+            self._display_view_session_details_button(session_id)
             create_session_pbar_thread.start()
             session_response: Session = operation.result(
                 polling=retry.Retry(
@@ -377,8 +467,7 @@ class DataprocSparkSession(SparkSession):
             print(
                 f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
             )
-
-            # self._display_view_session_details_button(s8s_session_id)
+            self._display_view_session_details_button(s8s_session_id)
             if session is None:
                 session = self.__create_spark_connect_session_from_s8s(
                     session_response, session_name
@@ -401,11 +490,10 @@ class DataprocSparkSession(SparkSession):
             return session
 
         def _get_dataproc_config(self):
-            dataproc_config = self._dataproc_config
-            if not dataproc_config:
-                dataproc_config = Session()
-            for k, v in self._options.items():
-                dataproc_config.runtime_config.properties[k] = v
+            # Use the property to ensure we always have a config
+            dataproc_config = self.dataproc_config
+            for k, v in self._options.items():
+                dataproc_config.runtime_config.properties[k] = v
             dataproc_config.spark_connect_session = (
                 sessions.SparkConnectConfig()
             )
@@ -413,6 +501,11 @@ class DataprocSparkSession(SparkSession):
                 dataproc_config.runtime_config.version = (
                     DataprocSparkSession._DEFAULT_RUNTIME_VERSION
                 )
+
+            # Check for Python version mismatch with runtime for UDF compatibility
+            self._check_python_version_compatibility(
+                dataproc_config.runtime_config.version
+            )
             if (
                 not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
                 and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
@@ -452,6 +545,10 @@ class DataprocSparkSession(SparkSession):
                     os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
                 )
             }
+            client_environment = environment.get_client_environment_label()
+            dataproc_config.labels["dataproc-session-client"] = (
+                client_environment
+            )
             if "COLAB_NOTEBOOK_ID" in os.environ:
                 colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
                 # Extract the last part of the path, which is the ID
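With this hunk, every session created by 0.9.0 carries a provenance label automatically. As an illustration (the value follows from the environment module above), a session started from a plain Jupyter notebook would end up with:

    dataproc_config.labels["dataproc-session-client"]  # -> "jupyter"

And because the key is listed in SYSTEM_LABELS, users cannot overwrite it through the labels() builder method.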
@@ -466,7 +563,7 @@ class DataprocSparkSession(SparkSession):
                         f"Only lowercase letters, numbers, and dashes are allowed. "
                         f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
                         f"Maximum length is 63 characters. "
-                        f"
+                        f"Ignoring notebook ID label."
                     )
             default_datasource = os.getenv(
                 "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
@@ -494,6 +591,32 @@ class DataprocSparkSession(SparkSession):
             )
             return dataproc_config
 
+        def _check_python_version_compatibility(self, runtime_version):
+            """Check if client Python version matches server Python version for UDF compatibility."""
+            import sys
+            import warnings
+
+            # Runtime version to server Python version mapping
+            RUNTIME_PYTHON_MAP = {
+                "1.2": (3, 12),
+                "2.2": (3, 12),
+                "2.3": (3, 11),
+            }
+
+            client_python = sys.version_info[:2]  # (major, minor)
+
+            if runtime_version in RUNTIME_PYTHON_MAP:
+                server_python = RUNTIME_PYTHON_MAP[runtime_version]
+
+                if client_python != server_python:
+                    warnings.warn(
+                        f"Python version mismatch detected: Client is using Python {client_python[0]}.{client_python[1]}, "
+                        f"but Dataproc runtime {runtime_version} uses Python {server_python[0]}.{server_python[1]}. "
+                        f"This mismatch may cause issues with Python UDF (User Defined Function) compatibility. "
+                        f"Consider using Python {server_python[0]}.{server_python[1]} for optimal UDF execution.",
+                        stacklevel=3,
+                    )
+
         def _display_view_session_details_button(self, session_id):
             try:
                 session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
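The check is advisory only: it warns, and only for runtime versions present in the map. A standalone sketch that mirrors the logic above, handy for verifying a local interpreter before starting a session (RUNTIME_PYTHON_MAP is copied from the diff; the helper name and message wording are illustrative):

    import sys
    import warnings

    RUNTIME_PYTHON_MAP = {"1.2": (3, 12), "2.2": (3, 12), "2.3": (3, 11)}

    def check_runtime(runtime_version: str) -> None:
        client = sys.version_info[:2]
        server = RUNTIME_PYTHON_MAP.get(runtime_version)
        if server and client != server:
            warnings.warn(
                f"Client Python {client[0]}.{client[1]} differs from "
                f"runtime {runtime_version} Python {server[0]}.{server[1]}; "
                f"Python UDFs may misbehave."
            )

    check_runtime("2.3")  # warns on any client that is not Python 3.11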
setup.py
@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()
 
 setup(
     name="dataproc-spark-connect",
-    version="0.8.3",
+    version="0.9.0",
     description="Dataproc client library for Spark Connect",
     long_description=long_description,
     author="Google LLC",