dataproc-spark-connect 1.0.0rc6__tar.gz → 1.0.0rc7__tar.gz

This diff shows the publicly available contents of the two package versions as released to their public registry, and is provided for informational purposes only.
Files changed (20)
  1. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/PKG-INFO +66 -18
  2. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/README.md +63 -16
  3. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/PKG-INFO +66 -18
  4. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/requires.txt +1 -1
  5. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/session.py +100 -33
  6. dataproc_spark_connect-1.0.0rc7/setup.cfg +14 -0
  7. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/setup.py +2 -2
  8. dataproc_spark_connect-1.0.0rc6/setup.cfg +0 -7
  9. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/LICENSE +0 -0
  10. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/SOURCES.txt +0 -0
  11. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
  12. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
  13. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
  14. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
  15. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/client/core.py +0 -0
  16. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
  17. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/environment.py +0 -0
  18. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/exceptions.py +0 -0
  19. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
  20. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/pyproject.toml +0 -0
{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/PKG-INFO
@@ -1,15 +1,16 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 1.0.0rc6
+ Version: 1.0.0rc7
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
  License: Apache 2.0
+ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: google-api-core>=2.19
  Requires-Dist: google-cloud-dataproc>=5.18
  Requires-Dist: packaging>=20.0
- Requires-Dist: pyspark[connect]~=4.0.0
+ Requires-Dist: pyspark-client~=4.0.0
  Requires-Dist: tqdm>=4.67
  Requires-Dist: websockets>=14.0
  Dynamic: author
@@ -43,39 +44,86 @@ pip uninstall dataproc_spark_connect

  This client requires permissions to
  manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
- If you are running the client outside of Google Cloud, you must set following
- environment variables:

- * `GOOGLE_CLOUD_PROJECT` - The Google Cloud project you use to run Spark
- workloads
- * `GOOGLE_CLOUD_REGION` - The Compute
- Engine [region](https://cloud.google.com/compute/docs/regions-zones#available)
- where you run the Spark workload.
- * `GOOGLE_APPLICATION_CREDENTIALS` -
- Your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ If you are running the client outside of Google Cloud, you need to provide
+ authentication credentials. Set the `GOOGLE_APPLICATION_CREDENTIALS` environment
+ variable to point to
+ your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ file.
+
+ You can specify the project and region either via environment variables or directly
+ in your code using the builder API:
+
+ * Environment variables: `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_REGION`
+ * Builder API: `.projectId()` and `.location()` methods (recommended)

  ## Usage

- 1. Install the latest version of Dataproc Python client and Dataproc Spark
- Connect modules:
+ 1. Install the latest version of Dataproc Spark Connect:

  ```sh
- pip install google_cloud_dataproc dataproc_spark_connect --force-reinstall
+ pip install -U dataproc-spark-connect
  ```

  2. Add the required imports into your PySpark application or notebook and start
- a Spark session with the following code instead of using
- environment variables:
+ a Spark session using the fluent API:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.getOrCreate()
+ ```
+
+ 3. You can configure Spark properties using the `.config()` method:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.config('spark.executor.memory', '4g').config('spark.executor.cores', '2').getOrCreate()
+ ```
+
+ 4. For advanced configuration, you can use the `Session` class to customize
+ settings like subnetwork or other environment configurations:

  ```python
  from google.cloud.dataproc_spark_connect import DataprocSparkSession
  from google.cloud.dataproc_v1 import Session
  session_config = Session()
  session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
- session_config.runtime_config.version = '2.2'
- spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
+ session_config.runtime_config.version = '3.0'
+ spark = DataprocSparkSession.builder.projectId('my-project').location('us-central1').dataprocSessionConfig(session_config).getOrCreate()
+ ```
+
+ ### Reusing Named Sessions Across Notebooks
+
+ Named sessions allow you to share a single Spark session across multiple notebooks, improving efficiency by avoiding repeated session startup times and reducing costs.
+
+ To create or connect to a named session:
+
+ 1. Create a session with a custom ID in your first notebook:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(1, 'data')], ['id', 'value'])
+ df.show()
+ ```
+
+ 2. Reuse the same session in another notebook by specifying the same session ID:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(2, 'more-data')], ['id', 'value'])
+ df.show()
  ```

+ 3. Session IDs must be 4-63 characters long, start with a lowercase letter, contain only lowercase letters, numbers, and hyphens, and not end with a hyphen.
+
+ 4. Named sessions persist until explicitly terminated or reach their configured TTL.
+
+ 5. A session with a given ID that is in a TERMINATED state cannot be reused. It must be deleted before a new session with the same ID can be created.
+
  ### Using Spark SQL Magic Commands (Jupyter Notebooks)

  The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/README.md
@@ -21,39 +21,86 @@ pip uninstall dataproc_spark_connect
(The README.md changes are identical to the README portion of the PKG-INFO diff above, at different line offsets because README.md has no metadata header.)
{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/PKG-INFO
@@ -1,15 +1,16 @@
@@ -43,39 +44,86 @@ pip uninstall dataproc_spark_connect
(Identical to the PKG-INFO diff above; the egg-info copy carries the same metadata and README changes.)
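Taken together, the README changes above describe a builder-first workflow. The sketch below combines those pieces — project and region via the builder, a named session ID, and a Spark property; the project ID, region, and session name are placeholders, not values from this release:

```python
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Placeholders for illustration: substitute your own project, region, and session ID.
spark = (
    DataprocSparkSession.builder
    .projectId("my-project")                 # instead of GOOGLE_CLOUD_PROJECT
    .location("us-central1")                 # instead of GOOGLE_CLOUD_REGION
    .dataprocSessionId("my-shared-session")  # named session, reusable from other notebooks
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

spark.range(5).show()
spark.stop()  # for a named session this only cleans up client-side state by default
```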
{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/requires.txt
@@ -1,6 +1,6 @@
  google-api-core>=2.19
  google-cloud-dataproc>=5.18
  packaging>=20.0
- pyspark[connect]~=4.0.0
+ pyspark-client~=4.0.0
  tqdm>=4.67
  websockets>=14.0
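The dependency change swaps the full `pyspark[connect]` distribution for `pyspark-client`, the slimmer Spark Connect-only client package. A quick way to confirm which distributions ended up installed after upgrading (an illustrative check, not part of the package):

```python
from importlib.metadata import PackageNotFoundError, version

# List the Spark-related distributions present in the current environment.
for dist in ("dataproc-spark-connect", "pyspark-client", "pyspark"):
    try:
        print(f"{dist}=={version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
```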
{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/session.py
@@ -14,6 +14,7 @@

  import atexit
  import datetime
+ import functools
  import json
  import logging
  import os
@@ -25,8 +26,6 @@ import time
  import uuid
  import tqdm
  from packaging import version
- from tqdm import tqdm as cli_tqdm
- from tqdm.notebook import tqdm as notebook_tqdm
  from types import MethodType
  from typing import Any, cast, ClassVar, Dict, Iterable, Optional, Union

@@ -67,6 +66,10 @@ SYSTEM_LABELS = {
  "goog-colab-notebook-id",
  }

+ _DATAPROC_SESSIONS_BASE_URL = (
+ "https://console.cloud.google.com/dataproc/interactive"
+ )
+

  def _is_valid_label_value(value: str) -> bool:
  """
@@ -494,15 +497,21 @@ class DataprocSparkSession(SparkSession):
  )

  def _display_session_link_on_creation(self, session_id):
- session_url = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
+ session_url = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{session_id}?project={self._project_id}"
  plain_message = f"Creating Dataproc Session: {session_url}"
- html_element = f"""
+ if environment.is_colab_enterprise():
+ html_element = f"""
  <div>
  <p>Creating Dataproc Spark Session<p>
- <p><a href="{session_url}">Dataproc Session</a></p>
  </div>
- """
-
+ """
+ else:
+ html_element = f"""
+ <div>
+ <p>Creating Dataproc Spark Session<p>
+ <p><a href="{session_url}">Dataproc Session</a></p>
+ </div>
+ """
  self._output_element_or_message(plain_message, html_element)

  def _print_session_created_message(self):
@@ -554,7 +563,7 @@ class DataprocSparkSession(SparkSession):

  if session_response is not None:
  print(
- f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
+ f"Using existing Dataproc Session (configuration changes may not be applied): {_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{s8s_session_id}?project={self._project_id}"
  )
  self._display_view_session_details_button(s8s_session_id)
  if session is None:
@@ -711,8 +720,6 @@ class DataprocSparkSession(SparkSession):
  # Merge default configs with existing properties,
  # user configs take precedence
  for k, v in {
- "spark.datasource.bigquery.viewsEnabled": "true",
- "spark.datasource.bigquery.writeMethod": "direct",
  "spark.sql.catalog.spark_catalog": "com.google.cloud.spark.bigquery.BigQuerySparkSessionCatalog",
  "spark.sql.sources.default": "bigquery",
  }.items():
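With `spark.datasource.bigquery.viewsEnabled` and `spark.datasource.bigquery.writeMethod` no longer injected as session defaults, a workload that depended on them can set them explicitly. A minimal sketch using the values that were previously applied by default:

```python
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Re-apply the two BigQuery connector properties that are no longer set by default.
spark = (
    DataprocSparkSession.builder
    .config("spark.datasource.bigquery.viewsEnabled", "true")
    .config("spark.datasource.bigquery.writeMethod", "direct")
    .getOrCreate()
)
```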
@@ -734,7 +741,7 @@ class DataprocSparkSession(SparkSession):

  # Runtime version to server Python version mapping
  RUNTIME_PYTHON_MAP = {
- "3.0": (3, 11),
+ "3.0": (3, 12),
  }

  client_python = sys.version_info[:2]  # (major, minor)
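The mapping update pairs Dataproc runtime `3.0` with a Python 3.12 server instead of 3.11. A standalone sketch of the kind of client/server version comparison this mapping enables (the names mirror the diff, but the snippet is not library code):

```python
import sys

# Mirrors RUNTIME_PYTHON_MAP from the diff above; illustration only.
RUNTIME_PYTHON_MAP = {"3.0": (3, 12)}

client_python = sys.version_info[:2]  # (major, minor)
server_python = RUNTIME_PYTHON_MAP["3.0"]
if client_python != server_python:
    print(
        f"Client Python {client_python[0]}.{client_python[1]} differs from "
        f"server Python {server_python[0]}.{server_python[1]} for runtime 3.0."
    )
```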
@@ -798,7 +805,7 @@ class DataprocSparkSession(SparkSession):
  return

  try:
- session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
+ session_url = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{session_id}?project={self._project_id}"
  from IPython.core.interactiveshell import InteractiveShell

  if not InteractiveShell.initialized():
@@ -981,6 +988,28 @@ class DataprocSparkSession(SparkSession):
  clearProgressHandlers_wrapper_method, self
  )

+ @staticmethod
+ @functools.lru_cache(maxsize=1)
+ def get_tqdm_bar():
+ """
+ Return a tqdm implementation that works in the current environment.
+
+ - Uses CLI tqdm for interactive terminals.
+ - Uses the notebook tqdm if available, otherwise falls back to CLI tqdm.
+ """
+ from tqdm import tqdm as cli_tqdm
+
+ if environment.is_interactive_terminal():
+ return cli_tqdm
+
+ try:
+ import ipywidgets
+ from tqdm.notebook import tqdm as notebook_tqdm
+
+ return notebook_tqdm
+ except ImportError:
+ return cli_tqdm
+
  def _register_progress_execution_handler(self):
  from pyspark.sql.connect.shell.progress import StageInfo
@@ -1005,9 +1034,12 @@ class DataprocSparkSession(SparkSession):
  total_tasks += stage.num_tasks
  completed_tasks += stage.num_completed_tasks

- tqdm_pbar = notebook_tqdm
- if environment.is_interactive_terminal():
- tqdm_pbar = cli_tqdm
+ # Don't show progress bar till we receive some tasks
+ if total_tasks == 0:
+ return
+
+ # Get correct tqdm (notebook or CLI)
+ tqdm_pbar = self.get_tqdm_bar()

  # Use a lock to ensure only one thread can access and modify
  # the shared dictionaries at a time.
@@ -1044,13 +1076,11 @@ class DataprocSparkSession(SparkSession):
  @staticmethod
  def _sql_lazy_transformation(req):
  # Select SQL command
- if req.plan and req.plan.command and req.plan.command.sql_command:
- return (
- "select"
- in req.plan.command.sql_command.sql.strip().lower().split()
- )
-
- return False
+ try:
+ query = req.plan.command.sql_command.input.sql.query
+ return "select" in query.strip().lower().split()
+ except AttributeError:
+ return False

  def _repr_html_(self) -> str:
  if not self._active_s8s_session_id:
@@ -1058,7 +1088,7 @@ class DataprocSparkSession(SparkSession):
  <div>No Active Dataproc Session</div>
  """

- s8s_session = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{self._active_s8s_session_id}"
+ s8s_session = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{self._active_s8s_session_id}"
  ui = f"{s8s_session}/sparkApplications/applications"
  return f"""
  <div>
@@ -1085,7 +1115,7 @@ class DataprocSparkSession(SparkSession):
  )

  url = (
- f"https://console.cloud.google.com/dataproc/interactive/{self._region}/"
+ f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/"
  f"{self._active_s8s_session_id}/sparkApplications/application;"
  f"associatedSqlOperationId={operation_id}?project={self._project_id}"
  )
@@ -1177,20 +1207,52 @@ class DataprocSparkSession(SparkSession):
  def _get_active_session_file_path():
  return os.getenv("DATAPROC_SPARK_CONNECT_ACTIVE_SESSION_FILE_PATH")

- def stop(self) -> None:
+ def stop(self, terminate: Optional[bool] = None) -> None:
+ """
+ Stop the Spark session and optionally terminate the server-side session.
+
+ Parameters
+ ----------
+ terminate : bool, optional
+ Control server-side termination behavior.
+
+ - None (default): Auto-detect based on session type
+
+ - Managed sessions (auto-generated ID): terminate server
+ - Named sessions (custom ID): client-side cleanup only
+
+ - True: Always terminate the server-side session
+ - False: Never terminate the server-side session (client cleanup only)
+
+ Examples
+ --------
+ Auto-detect termination behavior (existing behavior):
+
+ >>> spark.stop()
+
+ Force terminate a named session:
+
+ >>> spark.stop(terminate=True)
+
+ Prevent termination of a managed session:
+
+ >>> spark.stop(terminate=False)
+ """
  with DataprocSparkSession._lock:
  if DataprocSparkSession._active_s8s_session_id is not None:
- # Check if this is a managed session (auto-generated ID) or unmanaged session (custom ID)
- if DataprocSparkSession._active_session_uses_custom_id:
- # Unmanaged session (custom ID): Only clean up client-side state
- # Don't terminate as it might be in use by other notebooks or clients
- logger.debug(
- f"Stopping unmanaged session {DataprocSparkSession._active_s8s_session_id} without termination"
+ # Determine if we should terminate the server-side session
+ if terminate is None:
+ # Auto-detect: managed sessions terminate, named sessions don't
+ should_terminate = (
+ not DataprocSparkSession._active_session_uses_custom_id
  )
  else:
- # Managed session (auto-generated ID): Use original behavior and terminate
+ should_terminate = terminate
+
+ if should_terminate:
+ # Terminate the server-side session
  logger.debug(
- f"Terminating managed session {DataprocSparkSession._active_s8s_session_id}"
+ f"Terminating session {DataprocSparkSession._active_s8s_session_id}"
  )
  terminate_s8s_session(
  DataprocSparkSession._project_id,
@@ -1198,6 +1260,11 @@ class DataprocSparkSession(SparkSession):
  DataprocSparkSession._active_s8s_session_id,
  self._client_options,
  )
+ else:
+ # Client-side cleanup only
+ logger.debug(
+ f"Stopping session {DataprocSparkSession._active_s8s_session_id} without termination"
+ )

  self._remove_stopped_session_from_file()
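The reworked `stop()` keeps the old auto-detect behavior as the default and adds an explicit override, which matters most for named sessions. A short sketch (the session ID is illustrative):

```python
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.dataprocSessionId("reporting-session").getOrCreate()
spark.stop()  # named session: client-side cleanup only, the server session keeps running

# Reattach later and terminate the server-side session once the pipeline is done.
spark = DataprocSparkSession.builder.dataprocSessionId("reporting-session").getOrCreate()
spark.stop(terminate=True)
```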
dataproc_spark_connect-1.0.0rc7/setup.cfg
@@ -0,0 +1,14 @@
+ [bdist_wheel]
+ universal = 1
+
+ [check-manifest]
+ ignore =
+ .github/**
+
+ [metadata]
+ long_description_content_type = text/markdown
+
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/setup.py
@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()

  setup(
  name="dataproc-spark-connect",
- version="1.0.0rc6",
+ version="1.0.0rc7",
  description="Dataproc client library for Spark Connect",
  long_description=long_description,
  author="Google LLC",
@@ -31,7 +31,7 @@ setup(
  "google-api-core>=2.19",
  "google-cloud-dataproc>=5.18",
  "packaging>=20.0",
- "pyspark[connect]~=4.0.0",
+ "pyspark-client~=4.0.0",
  "tqdm>=4.67",
  "websockets>=14.0",
  ],
dataproc_spark_connect-1.0.0rc6/setup.cfg
@@ -1,7 +0,0 @@
- [bdist_wheel]
- universal = 1
-
- [egg_info]
- tag_build =
- tag_date = 0
-