dataproc-spark-connect 0.8.3__tar.gz → 1.0.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/PKG-INFO +2 -2
  2. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/PKG-INFO +2 -2
  3. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/SOURCES.txt +1 -0
  4. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/requires.txt +1 -1
  5. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/client/core.py +5 -3
  6. dataproc_spark_connect-1.0.0rc1/google/cloud/dataproc_spark_connect/environment.py +76 -0
  7. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/session.py +130 -31
  8. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/setup.py +2 -2
  9. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/LICENSE +0 -0
  10. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/README.md +0 -0
  11. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
  12. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
  13. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
  14. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
  15. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
  16. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/exceptions.py +0 -0
  17. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
  18. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/pyproject.toml +0 -0
  19. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/setup.cfg +0 -0
--- dataproc_spark_connect-0.8.3/PKG-INFO
+++ dataproc_spark_connect-1.0.0rc1/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 0.8.3
+ Version: 1.0.0rc1
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
@@ -9,7 +9,7 @@ License-File: LICENSE
  Requires-Dist: google-api-core>=2.19
  Requires-Dist: google-cloud-dataproc>=5.18
  Requires-Dist: packaging>=20.0
- Requires-Dist: pyspark[connect]~=3.5.1
+ Requires-Dist: pyspark[connect]~=4.0.0
  Requires-Dist: tqdm>=4.67
  Requires-Dist: websockets>=14.0
  Dynamic: author
--- dataproc_spark_connect-0.8.3/dataproc_spark_connect.egg-info/PKG-INFO
+++ dataproc_spark_connect-1.0.0rc1/dataproc_spark_connect.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 0.8.3
+ Version: 1.0.0rc1
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
@@ -9,7 +9,7 @@ License-File: LICENSE
  Requires-Dist: google-api-core>=2.19
  Requires-Dist: google-cloud-dataproc>=5.18
  Requires-Dist: packaging>=20.0
- Requires-Dist: pyspark[connect]~=3.5.1
+ Requires-Dist: pyspark[connect]~=4.0.0
  Requires-Dist: tqdm>=4.67
  Requires-Dist: websockets>=14.0
  Dynamic: author
--- dataproc_spark_connect-0.8.3/dataproc_spark_connect.egg-info/SOURCES.txt
+++ dataproc_spark_connect-1.0.0rc1/dataproc_spark_connect.egg-info/SOURCES.txt
@@ -9,6 +9,7 @@ dataproc_spark_connect.egg-info/dependency_links.txt
  dataproc_spark_connect.egg-info/requires.txt
  dataproc_spark_connect.egg-info/top_level.txt
  google/cloud/dataproc_spark_connect/__init__.py
+ google/cloud/dataproc_spark_connect/environment.py
  google/cloud/dataproc_spark_connect/exceptions.py
  google/cloud/dataproc_spark_connect/pypi_artifacts.py
  google/cloud/dataproc_spark_connect/session.py
--- dataproc_spark_connect-0.8.3/dataproc_spark_connect.egg-info/requires.txt
+++ dataproc_spark_connect-1.0.0rc1/dataproc_spark_connect.egg-info/requires.txt
@@ -1,6 +1,6 @@
  google-api-core>=2.19
  google-cloud-dataproc>=5.18
  packaging>=20.0
- pyspark[connect]~=3.5.1
+ pyspark[connect]~=4.0.0
  tqdm>=4.67
  websockets>=14.0
--- dataproc_spark_connect-0.8.3/google/cloud/dataproc_spark_connect/client/core.py
+++ dataproc_spark_connect-1.0.0rc1/google/cloud/dataproc_spark_connect/client/core.py
@@ -15,14 +15,14 @@ import logging

  import google
  import grpc
- from pyspark.sql.connect.client import ChannelBuilder
+ from pyspark.sql.connect.client import DefaultChannelBuilder

  from . import proxy

  logger = logging.getLogger(__name__)


- class DataprocChannelBuilder(ChannelBuilder):
+ class DataprocChannelBuilder(DefaultChannelBuilder):
      """
      This is a helper class that is used to create a GRPC channel based on the given
      connection string per the documentation of Spark Connect.
@@ -88,7 +88,9 @@ class ProxiedChannel(grpc.Channel):
          self._proxy = proxy.DataprocSessionProxy(0, target_host)
          self._proxy.start()
          self._proxied_connect_url = f"sc://localhost:{self._proxy.port}"
-         self._wrapped = ChannelBuilder(self._proxied_connect_url).toChannel()
+         self._wrapped = DefaultChannelBuilder(
+             self._proxied_connect_url
+         ).toChannel()

      def __enter__(self):
          return self
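
These changes track the renamed Spark Connect channel builder that ships with the pyspark[connect]~=4.0.0 pin above: code that previously instantiated ChannelBuilder now goes through DefaultChannelBuilder. A minimal sketch of the renamed builder on its own, mirroring the ProxiedChannel change (the sc:// endpoint is a placeholder, not something this package hard-codes):

    # Illustrative only: build a gRPC channel with the PySpark 4.x builder name.
    from pyspark.sql.connect.client import DefaultChannelBuilder

    channel = DefaultChannelBuilder("sc://localhost:15002").toChannel()
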
--- /dev/null
+++ dataproc_spark_connect-1.0.0rc1/google/cloud/dataproc_spark_connect/environment.py
@@ -0,0 +1,76 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ from typing import Callable, Tuple, List
+
+
+ def is_vscode() -> bool:
+     """True if running inside VS Code at all."""
+     return os.getenv("VSCODE_PID") is not None
+
+
+ def is_jupyter() -> bool:
+     """True if running in a Jupyter environment."""
+     return os.getenv("JPY_PARENT_PID") is not None
+
+
+ def is_colab_enterprise() -> bool:
+     """True if running in Colab Enterprise (Vertex AI)."""
+     return os.getenv("VERTEX_PRODUCT") == "COLAB_ENTERPRISE"
+
+
+ def is_colab() -> bool:
+     """True if running in Google Colab."""
+     return os.getenv("COLAB_RELEASE_TAG") is not None
+
+
+ def is_workbench() -> bool:
+     """True if running in AI Workbench (managed Jupyter)."""
+     return os.getenv("VERTEX_PRODUCT") == "WORKBENCH_INSTANCE"
+
+
+ def is_jetbrains_ide() -> bool:
+     """True if running inside any JetBrains IDE."""
+     return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()
+
+
+ def get_client_environment_label() -> str:
+     """
+     Map current environment to a standardized client label.
+
+     Priority order:
+     1. Colab Enterprise ("colab-enterprise")
+     2. Colab ("colab")
+     3. Workbench ("workbench-jupyter")
+     4. VS Code ("vscode")
+     5. JetBrains IDE ("jetbrains")
+     6. Jupyter ("jupyter")
+     7. Unknown ("unknown")
+     """
+     checks: List[Tuple[Callable[[], bool], str]] = [
+         (is_colab_enterprise, "colab-enterprise"),
+         (is_colab, "colab"),
+         (is_workbench, "workbench-jupyter"),
+         (is_vscode, "vscode"),
+         (is_jetbrains_ide, "jetbrains"),
+         (is_jupyter, "jupyter"),
+     ]
+     for detector, label in checks:
+         try:
+             if detector():
+                 return label
+         except Exception:
+             pass
+     return "unknown"
--- dataproc_spark_connect-0.8.3/google/cloud/dataproc_spark_connect/session.py
+++ dataproc_spark_connect-1.0.0rc1/google/cloud/dataproc_spark_connect/session.py
@@ -49,6 +49,7 @@ from google.cloud.dataproc_v1 import (
      TerminateSessionRequest,
  )
  from google.cloud.dataproc_v1.types import sessions
+ from google.cloud.dataproc_spark_connect import environment
  from pyspark.sql.connect.session import SparkSession
  from pyspark.sql.utils import to_str

@@ -56,6 +57,12 @@ from pyspark.sql.utils import to_str
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # System labels that should not be overridden by user
+ SYSTEM_LABELS = {
+     "dataproc-session-client",
+     "goog-colab-notebook-id",
+ }
+

  def _is_valid_label_value(value: str) -> bool:
      """
@@ -96,7 +103,7 @@ class DataprocSparkSession(SparkSession):
      ... ) # doctest: +SKIP
      """

-     _DEFAULT_RUNTIME_VERSION = "2.3"
+     _DEFAULT_RUNTIME_VERSION = "3.0"

      _active_s8s_session_uuid: ClassVar[Optional[str]] = None
      _project_id = None
@@ -132,11 +139,76 @@
          return self

      def dataprocSessionConfig(self, dataproc_config: Session):
+         self._dataproc_config = dataproc_config
+         for k, v in dataproc_config.runtime_config.properties.items():
+             self._options[cast(str, k)] = to_str(v)
+         return self
+
+     @property
+     def dataproc_config(self):
          with self._lock:
-             self._dataproc_config = dataproc_config
-             for k, v in dataproc_config.runtime_config.properties.items():
-                 self._options[cast(str, k)] = to_str(v)
-             return self
+             self._dataproc_config = self._dataproc_config or Session()
+             return self._dataproc_config
+
+     def runtimeVersion(self, version: str):
+         self.dataproc_config.runtime_config.version = version
+         return self
+
+     def serviceAccount(self, account: str):
+         self.dataproc_config.environment_config.execution_config.service_account = (
+             account
+         )
+         return self
+
+     def subnetwork(self, subnet: str):
+         self.dataproc_config.environment_config.execution_config.subnetwork_uri = (
+             subnet
+         )
+         return self
+
+     def ttl(self, duration: datetime.timedelta):
+         """Set the time-to-live (TTL) for the session using a timedelta object."""
+         return self.ttlSeconds(int(duration.total_seconds()))
+
+     def ttlSeconds(self, seconds: int):
+         """Set the time-to-live (TTL) for the session in seconds."""
+         self.dataproc_config.environment_config.execution_config.ttl = {
+             "seconds": seconds
+         }
+         return self
+
+     def idleTtl(self, duration: datetime.timedelta):
+         """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
+         return self.idleTtlSeconds(int(duration.total_seconds()))
+
+     def idleTtlSeconds(self, seconds: int):
+         """Set the idle time-to-live (idle TTL) for the session in seconds."""
+         self.dataproc_config.environment_config.execution_config.idle_ttl = {
+             "seconds": seconds
+         }
+         return self
+
+     def sessionTemplate(self, template: str):
+         self.dataproc_config.session_template = template
+         return self
+
+     def label(self, key: str, value: str):
+         """Add a single label to the session."""
+         return self.labels({key: value})
+
+     def labels(self, labels: Dict[str, str]):
+         # Filter out system labels and warn user
+         filtered_labels = {}
+         for key, value in labels.items():
+             if key in SYSTEM_LABELS:
+                 logger.warning(
+                     f"Label '{key}' is a system label and cannot be overridden by user. Ignoring."
+                 )
+             else:
+                 filtered_labels[key] = value
+
+         self.dataproc_config.labels.update(filtered_labels)
+         return self

      def remote(self, url: Optional[str] = None) -> "SparkSession.Builder":
          if url:
@@ -258,8 +330,7 @@
              client_options=self._client_options
          ).create_session(session_request)
          self._display_session_link_on_creation(session_id)
-         # TODO: Add the 'View Session Details' button once the UI changes are done.
-         # self._display_view_session_details_button(session_id)
+         self._display_view_session_details_button(session_id)
          create_session_pbar_thread.start()
          session_response: Session = operation.result(
              polling=retry.Retry(
@@ -377,8 +448,7 @@
              print(
                  f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
              )
-             # TODO: Add the 'View Session Details' button once the UI changes are done.
-             # self._display_view_session_details_button(s8s_session_id)
+             self._display_view_session_details_button(s8s_session_id)
              if session is None:
                  session = self.__create_spark_connect_session_from_s8s(
                      session_response, session_name
@@ -401,11 +471,10 @@
              return session

      def _get_dataproc_config(self):
-         dataproc_config = Session()
-         if self._dataproc_config:
-             dataproc_config = self._dataproc_config
-         for k, v in self._options.items():
-             dataproc_config.runtime_config.properties[k] = v
+         # Use the property to ensure we always have a config
+         dataproc_config = self.dataproc_config
+         for k, v in self._options.items():
+             dataproc_config.runtime_config.properties[k] = v
          dataproc_config.spark_connect_session = (
              sessions.SparkConnectConfig()
          )
@@ -413,6 +482,11 @@
              dataproc_config.runtime_config.version = (
                  DataprocSparkSession._DEFAULT_RUNTIME_VERSION
              )
+
+         # Check for Python version mismatch with runtime for UDF compatibility
+         self._check_python_version_compatibility(
+             dataproc_config.runtime_config.version
+         )
          if (
              not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
              and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
@@ -452,6 +526,10 @@
                  os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
              )
          }
+         client_environment = environment.get_client_environment_label()
+         dataproc_config.labels["dataproc-session-client"] = (
+             client_environment
+         )
          if "COLAB_NOTEBOOK_ID" in os.environ:
              colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
              # Extract the last part of the path, which is the ID
@@ -466,34 +544,55 @@
                      f"Only lowercase letters, numbers, and dashes are allowed. "
                      f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
                      f"Maximum length is 63 characters. "
-                     f"Skipping notebook ID label."
+                     f"Ignoring notebook ID label."
                  )
          default_datasource = os.getenv(
              "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
          )
-         if (
-             default_datasource
-             and dataproc_config.runtime_config.version == "2.3"
-         ):
-             if default_datasource == "bigquery":
-                 bq_datasource_properties = {
+         match default_datasource:
+             case "bigquery":
+                 # Merge default configs with existing properties,
+                 # user configs take precedence
+                 for k, v in {
                      "spark.datasource.bigquery.viewsEnabled": "true",
                      "spark.datasource.bigquery.writeMethod": "direct",
                      "spark.sql.catalog.spark_catalog": "com.google.cloud.spark.bigquery.BigQuerySparkSessionCatalog",
-                     "spark.sql.legacy.createHiveTableByDefault": "false",
                      "spark.sql.sources.default": "bigquery",
-                 }
-                 # Merge default configs with existing properties, user configs take precedence
-                 for k, v in bq_datasource_properties.items():
+                 }.items():
                      if k not in dataproc_config.runtime_config.properties:
                          dataproc_config.runtime_config.properties[k] = v
-             else:
-                 logger.warning(
-                     f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
-                     f" {default_datasource}. Supported value is 'bigquery'."
-                 )
+             case _:
+                 if default_datasource:
+                     logger.warning(
+                         f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
+                         f" {default_datasource}. Supported value is 'bigquery'."
+                     )
          return dataproc_config

+     def _check_python_version_compatibility(self, runtime_version):
+         """Check if client Python version matches server Python version for UDF compatibility."""
+         import sys
+         import warnings
+
+         # Runtime version to server Python version mapping
+         RUNTIME_PYTHON_MAP = {
+             "3.0": (3, 11),
+         }
+
+         client_python = sys.version_info[:2]  # (major, minor)
+
+         if runtime_version in RUNTIME_PYTHON_MAP:
+             server_python = RUNTIME_PYTHON_MAP[runtime_version]
+
+             if client_python != server_python:
+                 warnings.warn(
+                     f"Python version mismatch detected: Client is using Python {client_python[0]}.{client_python[1]}, "
+                     f"but Dataproc runtime {runtime_version} uses Python {server_python[0]}.{server_python[1]}. "
+                     f"This mismatch may cause issues with Python UDF (User Defined Function) compatibility. "
+                     f"Consider using Python {server_python[0]}.{server_python[1]} for optimal UDF execution.",
+                     stacklevel=3,
+                 )
+
      def _display_view_session_details_button(self, session_id):
          try:
              session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
@@ -690,7 +789,7 @@
          This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws
          an exception.
          Regarding pypi: Popular packages are already pre-installed in s8s runtime.
-         https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.2#python_libraries
+         https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.3#python_libraries
          If there are conflicts/package doesn't exist, it throws an exception.
          """
          if sum([pypi, file, pyfile, archive]) > 1:
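
Taken together, the session.py additions replace hand-assembled Session protos with a fluent builder API. A hypothetical end-to-end sketch (the DataprocSparkSession.builder entry point, getOrCreate(), and all argument values are assumptions for illustration; only the method names come from this diff):

    # Hypothetical usage of the new builder methods; values are placeholders.
    import datetime
    from google.cloud.dataproc_spark_connect import DataprocSparkSession

    spark = (
        DataprocSparkSession.builder
        .runtimeVersion("3.0")
        .serviceAccount("sa-name@example-project.iam.gserviceaccount.com")
        .ttl(datetime.timedelta(hours=2))
        .idleTtlSeconds(900)
        .label("team", "analytics")
        .getOrCreate()
    )

Per the labels() change above, any key listed in SYSTEM_LABELS (for example dataproc-session-client) is dropped with a warning rather than applied to the session.
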
@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()
20
20
 
21
21
  setup(
22
22
  name="dataproc-spark-connect",
23
- version="0.8.3",
23
+ version="1.0.0rc1",
24
24
  description="Dataproc client library for Spark Connect",
25
25
  long_description=long_description,
26
26
  author="Google LLC",
@@ -31,7 +31,7 @@ setup(
31
31
  "google-api-core>=2.19",
32
32
  "google-cloud-dataproc>=5.18",
33
33
  "packaging>=20.0",
34
- "pyspark[connect]~=3.5.1",
34
+ "pyspark[connect]~=4.0.0",
35
35
  "tqdm>=4.67",
36
36
  "websockets>=14.0",
37
37
  ],