dataproc-spark-connect 1.0.0rc6__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/PKG-INFO +65 -17
  2. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/README.md +63 -16
  3. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/dataproc_spark_connect.egg-info/PKG-INFO +65 -17
  4. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/google/cloud/dataproc_spark_connect/session.py +110 -33
  5. dataproc_spark_connect-1.0.1/setup.cfg +14 -0
  6. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/setup.py +1 -1
  7. dataproc_spark_connect-1.0.0rc6/setup.cfg +0 -7
  8. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/LICENSE +0 -0
  9. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/dataproc_spark_connect.egg-info/SOURCES.txt +0 -0
  10. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
  11. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/dataproc_spark_connect.egg-info/requires.txt +0 -0
  12. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
  13. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
  14. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
  15. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/google/cloud/dataproc_spark_connect/client/core.py +0 -0
  16. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
  17. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/google/cloud/dataproc_spark_connect/environment.py +0 -0
  18. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/google/cloud/dataproc_spark_connect/exceptions.py +0 -0
  19. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
  20. {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/pyproject.toml +0 -0

{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/PKG-INFO

@@ -1,10 +1,11 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 1.0.0rc6
+ Version: 1.0.1
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
  License: Apache 2.0
+ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: google-api-core>=2.19
  Requires-Dist: google-cloud-dataproc>=5.18
@@ -43,39 +44,86 @@ pip uninstall dataproc_spark_connect

  This client requires permissions to
  manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
- If you are running the client outside of Google Cloud, you must set following
- environment variables:

- * `GOOGLE_CLOUD_PROJECT` - The Google Cloud project you use to run Spark
- workloads
- * `GOOGLE_CLOUD_REGION` - The Compute
- Engine [region](https://cloud.google.com/compute/docs/regions-zones#available)
- where you run the Spark workload.
- * `GOOGLE_APPLICATION_CREDENTIALS` -
- Your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ If you are running the client outside of Google Cloud, you need to provide
+ authentication credentials. Set the `GOOGLE_APPLICATION_CREDENTIALS` environment
+ variable to point to
+ your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ file.
+
+ You can specify the project and region either via environment variables or directly
+ in your code using the builder API:
+
+ * Environment variables: `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_REGION`
+ * Builder API: `.projectId()` and `.location()` methods (recommended)

  ## Usage

- 1. Install the latest version of Dataproc Python client and Dataproc Spark
- Connect modules:
+ 1. Install the latest version of Dataproc Spark Connect:

  ```sh
- pip install google_cloud_dataproc dataproc_spark_connect --force-reinstall
+ pip install -U dataproc-spark-connect
  ```

  2. Add the required imports into your PySpark application or notebook and start
- a Spark session with the following code instead of using
- environment variables:
+ a Spark session using the fluent API:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.getOrCreate()
+ ```
+
+ 3. You can configure Spark properties using the `.config()` method:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.config('spark.executor.memory', '4g').config('spark.executor.cores', '2').getOrCreate()
+ ```
+
+ 4. For advanced configuration, you can use the `Session` class to customize
+ settings like subnetwork or other environment configurations:

  ```python
  from google.cloud.dataproc_spark_connect import DataprocSparkSession
  from google.cloud.dataproc_v1 import Session
  session_config = Session()
  session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
- session_config.runtime_config.version = '2.2'
- spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
+ session_config.runtime_config.version = '3.0'
+ spark = DataprocSparkSession.builder.projectId('my-project').location('us-central1').dataprocSessionConfig(session_config).getOrCreate()
+ ```
+
+ ### Reusing Named Sessions Across Notebooks
+
+ Named sessions allow you to share a single Spark session across multiple notebooks, improving efficiency by avoiding repeated session startup times and reducing costs.
+
+ To create or connect to a named session:
+
+ 1. Create a session with a custom ID in your first notebook:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(1, 'data')], ['id', 'value'])
+ df.show()
+ ```
+
+ 2. Reuse the same session in another notebook by specifying the same session ID:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(2, 'more-data')], ['id', 'value'])
+ df.show()
  ```

+ 3. Session IDs must be 4-63 characters long, start with a lowercase letter, contain only lowercase letters, numbers, and hyphens, and not end with a hyphen.
+
+ 4. Named sessions persist until explicitly terminated or reach their configured TTL.
+
+ 5. A session with a given ID that is in a TERMINATED state cannot be reused. It must be deleted before a new session with the same ID can be created.
+
  ### Using Spark SQL Magic Commands (Jupyter Notebooks)

  The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.

{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/README.md

@@ -21,39 +21,86 @@ pip uninstall dataproc_spark_connect

  This client requires permissions to
  manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
- If you are running the client outside of Google Cloud, you must set following
- environment variables:

- * `GOOGLE_CLOUD_PROJECT` - The Google Cloud project you use to run Spark
- workloads
- * `GOOGLE_CLOUD_REGION` - The Compute
- Engine [region](https://cloud.google.com/compute/docs/regions-zones#available)
- where you run the Spark workload.
- * `GOOGLE_APPLICATION_CREDENTIALS` -
- Your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ If you are running the client outside of Google Cloud, you need to provide
+ authentication credentials. Set the `GOOGLE_APPLICATION_CREDENTIALS` environment
+ variable to point to
+ your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ file.
+
+ You can specify the project and region either via environment variables or directly
+ in your code using the builder API:
+
+ * Environment variables: `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_REGION`
+ * Builder API: `.projectId()` and `.location()` methods (recommended)

  ## Usage

- 1. Install the latest version of Dataproc Python client and Dataproc Spark
- Connect modules:
+ 1. Install the latest version of Dataproc Spark Connect:

  ```sh
- pip install google_cloud_dataproc dataproc_spark_connect --force-reinstall
+ pip install -U dataproc-spark-connect
  ```

  2. Add the required imports into your PySpark application or notebook and start
- a Spark session with the following code instead of using
- environment variables:
+ a Spark session using the fluent API:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.getOrCreate()
+ ```
+
+ 3. You can configure Spark properties using the `.config()` method:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.config('spark.executor.memory', '4g').config('spark.executor.cores', '2').getOrCreate()
+ ```
+
+ 4. For advanced configuration, you can use the `Session` class to customize
+ settings like subnetwork or other environment configurations:

  ```python
  from google.cloud.dataproc_spark_connect import DataprocSparkSession
  from google.cloud.dataproc_v1 import Session
  session_config = Session()
  session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
- session_config.runtime_config.version = '2.2'
- spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
+ session_config.runtime_config.version = '3.0'
+ spark = DataprocSparkSession.builder.projectId('my-project').location('us-central1').dataprocSessionConfig(session_config).getOrCreate()
+ ```
+
+ ### Reusing Named Sessions Across Notebooks
+
+ Named sessions allow you to share a single Spark session across multiple notebooks, improving efficiency by avoiding repeated session startup times and reducing costs.
+
+ To create or connect to a named session:
+
+ 1. Create a session with a custom ID in your first notebook:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(1, 'data')], ['id', 'value'])
+ df.show()
+ ```
+
+ 2. Reuse the same session in another notebook by specifying the same session ID:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(2, 'more-data')], ['id', 'value'])
+ df.show()
  ```

+ 3. Session IDs must be 4-63 characters long, start with a lowercase letter, contain only lowercase letters, numbers, and hyphens, and not end with a hyphen.
+
+ 4. Named sessions persist until explicitly terminated or reach their configured TTL.
+
+ 5. A session with a given ID that is in a TERMINATED state cannot be reused. It must be deleted before a new session with the same ID can be created.
+
  ### Using Spark SQL Magic Commands (Jupyter Notebooks)

  The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
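
The named-session workflow added to the README above pairs with the optional `terminate` flag that `stop()` gains in `session.py` later in this diff. A minimal sketch, assuming the last notebook in the pipeline wants to tear the shared session down (the session ID is the illustrative one from the README examples):

```python
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Reconnect to the shared named session used by the earlier notebooks.
spark = DataprocSparkSession.builder.dataprocSessionId('my-ml-pipeline-session').getOrCreate()

# A plain stop() leaves a named session running for other notebooks;
# terminate=True also terminates the server-side session.
spark.stop(terminate=True)
```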

{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/dataproc_spark_connect.egg-info/PKG-INFO

@@ -1,10 +1,11 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 1.0.0rc6
+ Version: 1.0.1
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
  License: Apache 2.0
+ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: google-api-core>=2.19
  Requires-Dist: google-cloud-dataproc>=5.18
@@ -43,39 +44,86 @@ pip uninstall dataproc_spark_connect

  This client requires permissions to
  manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
- If you are running the client outside of Google Cloud, you must set following
- environment variables:

- * `GOOGLE_CLOUD_PROJECT` - The Google Cloud project you use to run Spark
- workloads
- * `GOOGLE_CLOUD_REGION` - The Compute
- Engine [region](https://cloud.google.com/compute/docs/regions-zones#available)
- where you run the Spark workload.
- * `GOOGLE_APPLICATION_CREDENTIALS` -
- Your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ If you are running the client outside of Google Cloud, you need to provide
+ authentication credentials. Set the `GOOGLE_APPLICATION_CREDENTIALS` environment
+ variable to point to
+ your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ file.
+
+ You can specify the project and region either via environment variables or directly
+ in your code using the builder API:
+
+ * Environment variables: `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_REGION`
+ * Builder API: `.projectId()` and `.location()` methods (recommended)

  ## Usage

- 1. Install the latest version of Dataproc Python client and Dataproc Spark
- Connect modules:
+ 1. Install the latest version of Dataproc Spark Connect:

  ```sh
- pip install google_cloud_dataproc dataproc_spark_connect --force-reinstall
+ pip install -U dataproc-spark-connect
  ```

  2. Add the required imports into your PySpark application or notebook and start
- a Spark session with the following code instead of using
- environment variables:
+ a Spark session using the fluent API:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.getOrCreate()
+ ```
+
+ 3. You can configure Spark properties using the `.config()` method:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.config('spark.executor.memory', '4g').config('spark.executor.cores', '2').getOrCreate()
+ ```
+
+ 4. For advanced configuration, you can use the `Session` class to customize
+ settings like subnetwork or other environment configurations:

  ```python
  from google.cloud.dataproc_spark_connect import DataprocSparkSession
  from google.cloud.dataproc_v1 import Session
  session_config = Session()
  session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
- session_config.runtime_config.version = '2.2'
- spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
+ session_config.runtime_config.version = '3.0'
+ spark = DataprocSparkSession.builder.projectId('my-project').location('us-central1').dataprocSessionConfig(session_config).getOrCreate()
+ ```
+
+ ### Reusing Named Sessions Across Notebooks
+
+ Named sessions allow you to share a single Spark session across multiple notebooks, improving efficiency by avoiding repeated session startup times and reducing costs.
+
+ To create or connect to a named session:
+
+ 1. Create a session with a custom ID in your first notebook:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(1, 'data')], ['id', 'value'])
+ df.show()
+ ```
+
+ 2. Reuse the same session in another notebook by specifying the same session ID:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(2, 'more-data')], ['id', 'value'])
+ df.show()
  ```

+ 3. Session IDs must be 4-63 characters long, start with a lowercase letter, contain only lowercase letters, numbers, and hyphens, and not end with a hyphen.
+
+ 4. Named sessions persist until explicitly terminated or reach their configured TTL.
+
+ 5. A session with a given ID that is in a TERMINATED state cannot be reused. It must be deleted before a new session with the same ID can be created.
+
  ### Using Spark SQL Magic Commands (Jupyter Notebooks)

  The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.

{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/google/cloud/dataproc_spark_connect/session.py

@@ -14,6 +14,7 @@

  import atexit
  import datetime
+ import functools
  import json
  import logging
  import os
@@ -25,8 +26,6 @@ import time
  import uuid
  import tqdm
  from packaging import version
- from tqdm import tqdm as cli_tqdm
- from tqdm.notebook import tqdm as notebook_tqdm
  from types import MethodType
  from typing import Any, cast, ClassVar, Dict, Iterable, Optional, Union

@@ -67,6 +66,10 @@ SYSTEM_LABELS = {
  "goog-colab-notebook-id",
  }

+ _DATAPROC_SESSIONS_BASE_URL = (
+ "https://console.cloud.google.com/dataproc/interactive"
+ )
+

  def _is_valid_label_value(value: str) -> bool:
  """
@@ -494,15 +497,21 @@ class DataprocSparkSession(SparkSession):
  )

  def _display_session_link_on_creation(self, session_id):
- session_url = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
+ session_url = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{session_id}?project={self._project_id}"
  plain_message = f"Creating Dataproc Session: {session_url}"
- html_element = f"""
+ if environment.is_colab_enterprise():
+ html_element = f"""
  <div>
  <p>Creating Dataproc Spark Session<p>
- <p><a href="{session_url}">Dataproc Session</a></p>
  </div>
- """
-
+ """
+ else:
+ html_element = f"""
+ <div>
+ <p>Creating Dataproc Spark Session<p>
+ <p><a href="{session_url}">Dataproc Session</a></p>
+ </div>
+ """
  self._output_element_or_message(plain_message, html_element)

  def _print_session_created_message(self):
@@ -554,7 +563,7 @@

  if session_response is not None:
  print(
- f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
+ f"Using existing Dataproc Session (configuration changes may not be applied): {_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{s8s_session_id}?project={self._project_id}"
  )
  self._display_view_session_details_button(s8s_session_id)
  if session is None:
@@ -583,6 +592,16 @@
  session = PySparkSQLSession.builder.getOrCreate()
  return session # type: ignore

+ if self._project_id is None:
+ raise DataprocSparkConnectException(
+ f"Error while creating Dataproc Session: project ID is not set"
+ )
+
+ if self._region is None:
+ raise DataprocSparkConnectException(
+ f"Error while creating Dataproc Session: location is not set"
+ )
+
  # Handle custom session ID by setting it early and letting existing logic handle it
  if self._custom_session_id:
  self._handle_custom_session_id()
@@ -711,8 +730,6 @@
  # Merge default configs with existing properties,
  # user configs take precedence
  for k, v in {
- "spark.datasource.bigquery.viewsEnabled": "true",
- "spark.datasource.bigquery.writeMethod": "direct",
  "spark.sql.catalog.spark_catalog": "com.google.cloud.spark.bigquery.BigQuerySparkSessionCatalog",
  "spark.sql.sources.default": "bigquery",
  }.items():
@@ -734,7 +751,7 @@

  # Runtime version to server Python version mapping
  RUNTIME_PYTHON_MAP = {
- "3.0": (3, 11),
+ "3.0": (3, 12),
  }

  client_python = sys.version_info[:2] # (major, minor)
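
For context, the mapping above feeds a client-versus-runtime Python comparison (the surrounding hunk shows `client_python = sys.version_info[:2]`). A rough sketch of that kind of check, with the helper name and warning text invented for illustration:

```python
import sys
import warnings

# Mirrors the mapping in the diff above: server runtime version -> expected client Python.
RUNTIME_PYTHON_MAP = {"3.0": (3, 12)}

def warn_on_python_mismatch(runtime_version: str) -> None:
    """Hypothetical helper: warn if the local Python differs from the server runtime's."""
    expected = RUNTIME_PYTHON_MAP.get(runtime_version)
    client_python = sys.version_info[:2]  # (major, minor), as in the surrounding code
    if expected and client_python != expected:
        warnings.warn(
            f"Local Python {client_python[0]}.{client_python[1]} does not match "
            f"Python {expected[0]}.{expected[1]} used by Dataproc runtime {runtime_version}; "
            "UDFs may behave inconsistently."
        )
```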
@@ -798,7 +815,7 @@
  return

  try:
- session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
+ session_url = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{session_id}?project={self._project_id}"
  from IPython.core.interactiveshell import InteractiveShell

  if not InteractiveShell.initialized():
@@ -981,6 +998,28 @@
  clearProgressHandlers_wrapper_method, self
  )

+ @staticmethod
+ @functools.lru_cache(maxsize=1)
+ def get_tqdm_bar():
+ """
+ Return a tqdm implementation that works in the current environment.
+
+ - Uses CLI tqdm for interactive terminals.
+ - Uses the notebook tqdm if available, otherwise falls back to CLI tqdm.
+ """
+ from tqdm import tqdm as cli_tqdm
+
+ if environment.is_interactive_terminal():
+ return cli_tqdm
+
+ try:
+ import ipywidgets
+ from tqdm.notebook import tqdm as notebook_tqdm
+
+ return notebook_tqdm
+ except ImportError:
+ return cli_tqdm
+
  def _register_progress_execution_handler(self):
  from pyspark.sql.connect.shell.progress import StageInfo

@@ -1005,9 +1044,12 @@
  total_tasks += stage.num_tasks
  completed_tasks += stage.num_completed_tasks

- tqdm_pbar = notebook_tqdm
- if environment.is_interactive_terminal():
- tqdm_pbar = cli_tqdm
+ # Don't show progress bar till we receive some tasks
+ if total_tasks == 0:
+ return
+
+ # Get correct tqdm (notebook or CLI)
+ tqdm_pbar = self.get_tqdm_bar()

  # Use a lock to ensure only one thread can access and modify
  # the shared dictionaries at a time.
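
Note that `get_tqdm_bar()` returns a tqdm *class* rather than an instance, and `functools.lru_cache(maxsize=1)` ensures the environment detection and optional notebook imports run only once. A small illustrative usage (the task counts are made up; the real caller is the progress handler shown in the hunk above):

```python
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Pick the appropriate tqdm implementation once, then construct bars as needed.
tqdm_cls = DataprocSparkSession.get_tqdm_bar()
with tqdm_cls(total=100, desc="Spark tasks") as pbar:
    pbar.update(40)  # tasks completed in an early progress update
    pbar.update(60)  # tasks completed by the final update
```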
@@ -1044,13 +1086,11 @@
  @staticmethod
  def _sql_lazy_transformation(req):
  # Select SQL command
- if req.plan and req.plan.command and req.plan.command.sql_command:
- return (
- "select"
- in req.plan.command.sql_command.sql.strip().lower().split()
- )
-
- return False
+ try:
+ query = req.plan.command.sql_command.input.sql.query
+ return "select" in query.strip().lower().split()
+ except AttributeError:
+ return False

  def _repr_html_(self) -> str:
  if not self._active_s8s_session_id:
@@ -1058,7 +1098,7 @@
  <div>No Active Dataproc Session</div>
  """

- s8s_session = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{self._active_s8s_session_id}"
+ s8s_session = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{self._active_s8s_session_id}"
  ui = f"{s8s_session}/sparkApplications/applications"
  return f"""
  <div>
@@ -1085,7 +1125,7 @@
  )

  url = (
- f"https://console.cloud.google.com/dataproc/interactive/{self._region}/"
+ f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/"
  f"{self._active_s8s_session_id}/sparkApplications/application;"
  f"associatedSqlOperationId={operation_id}?project={self._project_id}"
  )
@@ -1177,20 +1217,52 @@
  def _get_active_session_file_path():
  return os.getenv("DATAPROC_SPARK_CONNECT_ACTIVE_SESSION_FILE_PATH")

- def stop(self) -> None:
+ def stop(self, terminate: Optional[bool] = None) -> None:
+ """
+ Stop the Spark session and optionally terminate the server-side session.
+
+ Parameters
+ ----------
+ terminate : bool, optional
+ Control server-side termination behavior.
+
+ - None (default): Auto-detect based on session type
+
+ - Managed sessions (auto-generated ID): terminate server
+ - Named sessions (custom ID): client-side cleanup only
+
+ - True: Always terminate the server-side session
+ - False: Never terminate the server-side session (client cleanup only)
+
+ Examples
+ --------
+ Auto-detect termination behavior (existing behavior):
+
+ >>> spark.stop()
+
+ Force terminate a named session:
+
+ >>> spark.stop(terminate=True)
+
+ Prevent termination of a managed session:
+
+ >>> spark.stop(terminate=False)
+ """
  with DataprocSparkSession._lock:
  if DataprocSparkSession._active_s8s_session_id is not None:
- # Check if this is a managed session (auto-generated ID) or unmanaged session (custom ID)
- if DataprocSparkSession._active_session_uses_custom_id:
- # Unmanaged session (custom ID): Only clean up client-side state
- # Don't terminate as it might be in use by other notebooks or clients
- logger.debug(
- f"Stopping unmanaged session {DataprocSparkSession._active_s8s_session_id} without termination"
+ # Determine if we should terminate the server-side session
+ if terminate is None:
+ # Auto-detect: managed sessions terminate, named sessions don't
+ should_terminate = (
+ not DataprocSparkSession._active_session_uses_custom_id
  )
  else:
- # Managed session (auto-generated ID): Use original behavior and terminate
+ should_terminate = terminate
+
+ if should_terminate:
+ # Terminate the server-side session
  logger.debug(
- f"Terminating managed session {DataprocSparkSession._active_s8s_session_id}"
+ f"Terminating session {DataprocSparkSession._active_s8s_session_id}"
  )
  terminate_s8s_session(
  DataprocSparkSession._project_id,
@@ -1198,6 +1270,11 @@
  DataprocSparkSession._active_s8s_session_id,
  self._client_options,
  )
+ else:
+ # Client-side cleanup only
+ logger.debug(
+ f"Stopping session {DataprocSparkSession._active_s8s_session_id} without termination"
+ )

  self._remove_stopped_session_from_file()


dataproc_spark_connect-1.0.1/setup.cfg

@@ -0,0 +1,14 @@
+ [bdist_wheel]
+ universal = 1
+
+ [check-manifest]
+ ignore =
+ .github/**
+
+ [metadata]
+ long_description_content_type = text/markdown
+
+ [egg_info]
+ tag_build =
+ tag_date = 0
+

{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.1}/setup.py

@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()

  setup(
  name="dataproc-spark-connect",
- version="1.0.0rc6",
+ version="1.0.1",
  description="Dataproc client library for Spark Connect",
  long_description=long_description,
  author="Google LLC",

dataproc_spark_connect-1.0.0rc6/setup.cfg

@@ -1,7 +0,0 @@
- [bdist_wheel]
- universal = 1
-
- [egg_info]
- tag_build =
- tag_date = 0
-