dataproc-spark-connect 1.0.0rc6__tar.gz → 1.0.0rc7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/PKG-INFO +66 -18
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/README.md +63 -16
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/PKG-INFO +66 -18
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/requires.txt +1 -1
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/session.py +100 -33
- dataproc_spark_connect-1.0.0rc7/setup.cfg +14 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/setup.py +2 -2
- dataproc_spark_connect-1.0.0rc6/setup.cfg +0 -7
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/LICENSE +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/SOURCES.txt +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/client/core.py +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/environment.py +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/exceptions.py +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
- {dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/pyproject.toml +0 -0
{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/PKG-INFO

@@ -1,15 +1,16 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 1.0.0rc6
+Version: 1.0.0rc7
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
 License: Apache 2.0
+Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: google-api-core>=2.19
 Requires-Dist: google-cloud-dataproc>=5.18
 Requires-Dist: packaging>=20.0
-Requires-Dist: pyspark
+Requires-Dist: pyspark-client~=4.0.0
 Requires-Dist: tqdm>=4.67
 Requires-Dist: websockets>=14.0
 Dynamic: author
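The dependency switch from `pyspark` to `pyspark-client~=4.0.0` is the most consequential metadata change: the client-only distribution ships the Spark Connect Python API without the full JVM-backed runtime. A quick post-upgrade sanity check, a sketch that assumes nothing beyond the distribution names appearing in this diff:

```python
# Report which of the relevant distributions are installed after upgrading.
from importlib.metadata import PackageNotFoundError, version

for dist in ("pyspark", "pyspark-client", "dataproc-spark-connect"):
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
```

If the old full `pyspark` distribution is still present alongside `pyspark-client`, uninstalling it avoids ambiguity over which `pyspark` package gets imported.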
@@ -43,39 +44,86 @@ pip uninstall dataproc_spark_connect
 
 This client requires permissions to
 manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
-If you are running the client outside of Google Cloud, you must set following
-environment variables:
 
-
-
-
-
-
-
-
+If you are running the client outside of Google Cloud, you need to provide
+authentication credentials. Set the `GOOGLE_APPLICATION_CREDENTIALS` environment
+variable to point to
+your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+file.
+
+You can specify the project and region either via environment variables or directly
+in your code using the builder API:
+
+* Environment variables: `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_REGION`
+* Builder API: `.projectId()` and `.location()` methods (recommended)
 
 ## Usage
 
-1. Install the latest version of Dataproc
-   Connect modules:
+1. Install the latest version of Dataproc Spark Connect:
 
    ```sh
-   pip install
+   pip install -U dataproc-spark-connect
   ```
 
 2. Add the required imports into your PySpark application or notebook and start
-   a Spark session
-
+   a Spark session using the fluent API:
+
+   ```python
+   from google.cloud.dataproc_spark_connect import DataprocSparkSession
+   spark = DataprocSparkSession.builder.getOrCreate()
+   ```
+
+3. You can configure Spark properties using the `.config()` method:
+
+   ```python
+   from google.cloud.dataproc_spark_connect import DataprocSparkSession
+   spark = DataprocSparkSession.builder.config('spark.executor.memory', '4g').config('spark.executor.cores', '2').getOrCreate()
+   ```
+
+4. For advanced configuration, you can use the `Session` class to customize
+   settings like subnetwork or other environment configurations:
 
   ```python
   from google.cloud.dataproc_spark_connect import DataprocSparkSession
   from google.cloud.dataproc_v1 import Session
   session_config = Session()
   session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
-   session_config.runtime_config.version = '
-   spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
+   session_config.runtime_config.version = '3.0'
+   spark = DataprocSparkSession.builder.projectId('my-project').location('us-central1').dataprocSessionConfig(session_config).getOrCreate()
+   ```
+
+### Reusing Named Sessions Across Notebooks
+
+Named sessions allow you to share a single Spark session across multiple notebooks, improving efficiency by avoiding repeated session startup times and reducing costs.
+
+To create or connect to a named session:
+
+1. Create a session with a custom ID in your first notebook:
+
+   ```python
+   from google.cloud.dataproc_spark_connect import DataprocSparkSession
+   session_id = 'my-ml-pipeline-session'
+   spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+   df = spark.createDataFrame([(1, 'data')], ['id', 'value'])
+   df.show()
+   ```
+
+2. Reuse the same session in another notebook by specifying the same session ID:
+
+   ```python
+   from google.cloud.dataproc_spark_connect import DataprocSparkSession
+   session_id = 'my-ml-pipeline-session'
+   spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+   df = spark.createDataFrame([(2, 'more-data')], ['id', 'value'])
+   df.show()
   ```
 
+3. Session IDs must be 4-63 characters long, start with a lowercase letter, contain only lowercase letters, numbers, and hyphens, and not end with a hyphen.
+
+4. Named sessions persist until explicitly terminated or reach their configured TTL.
+
+5. A session with a given ID that is in a TERMINATED state cannot be reused. It must be deleted before a new session with the same ID can be created.
+
 ### Using Spark SQL Magic Commands (Jupyter Notebooks)
 
 The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
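Point 5 of the named-session notes implies a cleanup step the README does not spell out: deleting a TERMINATED named session so its ID can be reused. Below is a sketch of one way to do that with the `google-cloud-dataproc` Sessions API; the `SessionControllerClient` class, regional endpoint format, and resource-name pattern are assumptions to verify against your library version, and the project, region, and ID values are illustrative:

```python
# Delete a TERMINATED named session so its ID can be reused (hypothetical
# values; the client class and resource-name format are assumed, not taken
# from this diff).
from google.cloud import dataproc_v1

project, region = "my-project", "us-central1"
session_id = "my-ml-pipeline-session"

client = dataproc_v1.SessionControllerClient(
    client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)
name = f"projects/{project}/locations/{region}/sessions/{session_id}"
operation = client.delete_session(name=name)
operation.result()  # wait for deletion; afterwards the ID is free again
```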
{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/README.md

@@ -21,39 +21,86 @@ pip uninstall dataproc_spark_connect
(identical to the README-body hunk shown above for PKG-INFO; README.md and the PKG-INFO long description carry the same content, offset by the metadata header)
{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/dataproc_spark_connect.egg-info/PKG-INFO

@@ -1,15 +1,16 @@ and @@ -43,39 +44,86 @@
(identical to the PKG-INFO changes shown above; the egg-info copy is regenerated from the same metadata)
{dataproc_spark_connect-1.0.0rc6 → dataproc_spark_connect-1.0.0rc7}/google/cloud/dataproc_spark_connect/session.py

@@ -14,6 +14,7 @@
 
 import atexit
 import datetime
+import functools
 import json
 import logging
 import os
@@ -25,8 +26,6 @@ import time
 import uuid
 import tqdm
 from packaging import version
-from tqdm import tqdm as cli_tqdm
-from tqdm.notebook import tqdm as notebook_tqdm
 from types import MethodType
 from typing import Any, cast, ClassVar, Dict, Iterable, Optional, Union
 
@@ -67,6 +66,10 @@ SYSTEM_LABELS = {
     "goog-colab-notebook-id",
 }
 
+_DATAPROC_SESSIONS_BASE_URL = (
+    "https://console.cloud.google.com/dataproc/interactive"
+)
+
 
 def _is_valid_label_value(value: str) -> bool:
     """
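The new module-level constant replaces hard-coded console URLs at every call site changed below. How the pieces combine, with illustrative region, session ID, and project values:

```python
# Assemble the Dataproc console link the way the updated call sites do.
_DATAPROC_SESSIONS_BASE_URL = "https://console.cloud.google.com/dataproc/interactive"

region, session_id, project_id = "us-central1", "my-session", "my-project"
session_url = f"{_DATAPROC_SESSIONS_BASE_URL}/{region}/{session_id}?project={project_id}"
# -> https://console.cloud.google.com/dataproc/interactive/us-central1/my-session?project=my-project
```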
@@ -494,15 +497,21 @@ class DataprocSparkSession(SparkSession):
         )
 
     def _display_session_link_on_creation(self, session_id):
-        session_url = f"
+        session_url = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{session_id}?project={self._project_id}"
         plain_message = f"Creating Dataproc Session: {session_url}"
-        html_element = f"""
+        if environment.is_colab_enterprise():
+            html_element = f"""
             <div>
             <p>Creating Dataproc Spark Session<p>
-            <p><a href="{session_url}">Dataproc Session</a></p>
             </div>
-            """
-
+            """
+        else:
+            html_element = f"""
+            <div>
+            <p>Creating Dataproc Spark Session<p>
+            <p><a href="{session_url}">Dataproc Session</a></p>
+            </div>
+            """
         self._output_element_or_message(plain_message, html_element)
 
     def _print_session_created_message(self):
@@ -554,7 +563,7 @@ class DataprocSparkSession(SparkSession):
 
         if session_response is not None:
             print(
-                f"Using existing Dataproc Session (configuration changes may not be applied):
+                f"Using existing Dataproc Session (configuration changes may not be applied): {_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{s8s_session_id}?project={self._project_id}"
             )
             self._display_view_session_details_button(s8s_session_id)
         if session is None:
@@ -711,8 +720,6 @@ class DataprocSparkSession(SparkSession):
             # Merge default configs with existing properties,
             # user configs take precedence
             for k, v in {
-                "spark.datasource.bigquery.viewsEnabled": "true",
-                "spark.datasource.bigquery.writeMethod": "direct",
                 "spark.sql.catalog.spark_catalog": "com.google.cloud.spark.bigquery.BigQuerySparkSessionCatalog",
                 "spark.sql.sources.default": "bigquery",
             }.items():
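This hunk quietly drops two BigQuery defaults that new sessions previously received. If a workload depended on them, they can be set back explicitly; a sketch using the exact property names removed above:

```python
# Re-apply the removed BigQuery defaults per session. Per the comment in the
# diff, user-supplied configs take precedence over library defaults, so
# setting these explicitly is safe either way.
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = (
    DataprocSparkSession.builder
    .config("spark.datasource.bigquery.viewsEnabled", "true")
    .config("spark.datasource.bigquery.writeMethod", "direct")
    .getOrCreate()
)
```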
@@ -734,7 +741,7 @@ class DataprocSparkSession(SparkSession):
 
         # Runtime version to server Python version mapping
         RUNTIME_PYTHON_MAP = {
-            "3.0": (3,
+            "3.0": (3, 12),
         }
 
         client_python = sys.version_info[:2]  # (major, minor)
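The completed mapping pins runtime `3.0` to server-side Python 3.12, which the surrounding code compares against the client interpreter. A standalone sketch of the same comparison (the warning text is illustrative, not the library's):

```python
# Compare the local interpreter against the server Python for a runtime.
import sys

RUNTIME_PYTHON_MAP = {"3.0": (3, 12)}  # runtime version -> server (major, minor)

runtime = "3.0"
client_python = sys.version_info[:2]
server_python = RUNTIME_PYTHON_MAP.get(runtime)
if server_python and client_python != server_python:
    print(
        f"client Python {client_python} differs from "
        f"server Python {server_python} for runtime {runtime}"
    )
```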
@@ -798,7 +805,7 @@ class DataprocSparkSession(SparkSession):
             return
 
         try:
-            session_url = f"
+            session_url = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{session_id}?project={self._project_id}"
             from IPython.core.interactiveshell import InteractiveShell
 
             if not InteractiveShell.initialized():
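The context lines show the pattern used here: rich output is gated on `InteractiveShell.initialized()`. A minimal standalone version of that guard, falling back to plain text outside IPython (the function name is mine, not the library's):

```python
def emit(plain_message: str, html_element: str) -> None:
    """Render HTML when an IPython shell is active, else print plain text."""
    try:
        from IPython.core.interactiveshell import InteractiveShell
        from IPython.display import HTML, display

        if InteractiveShell.initialized():
            display(HTML(html_element))
            return
    except ImportError:
        pass
    print(plain_message)
```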
@@ -981,6 +988,28 @@ class DataprocSparkSession(SparkSession):
             clearProgressHandlers_wrapper_method, self
         )
 
+    @staticmethod
+    @functools.lru_cache(maxsize=1)
+    def get_tqdm_bar():
+        """
+        Return a tqdm implementation that works in the current environment.
+
+        - Uses CLI tqdm for interactive terminals.
+        - Uses the notebook tqdm if available, otherwise falls back to CLI tqdm.
+        """
+        from tqdm import tqdm as cli_tqdm
+
+        if environment.is_interactive_terminal():
+            return cli_tqdm
+
+        try:
+            import ipywidgets
+            from tqdm.notebook import tqdm as notebook_tqdm
+
+            return notebook_tqdm
+        except ImportError:
+            return cli_tqdm
+
     def _register_progress_execution_handler(self):
         from pyspark.sql.connect.shell.progress import StageInfo
 
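Note the `@functools.lru_cache(maxsize=1)` on a zero-argument method: the environment is probed once, and every later progress update reuses the cached tqdm class. The same pattern in isolation, omitting the interactive-terminal branch for brevity:

```python
import functools

@functools.lru_cache(maxsize=1)
def pick_progress_bar():
    # Probe once; subsequent calls return the cached class immediately.
    try:
        import ipywidgets  # noqa: F401 -- presence check only
        from tqdm.notebook import tqdm as notebook_tqdm
        return notebook_tqdm
    except ImportError:
        from tqdm import tqdm as cli_tqdm
        return cli_tqdm

bar_cls = pick_progress_bar()
for _ in bar_cls(range(3)):
    pass
```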
@@ -1005,9 +1034,12 @@ class DataprocSparkSession(SparkSession):
                 total_tasks += stage.num_tasks
                 completed_tasks += stage.num_completed_tasks
 
-
-            if
-
+            # Don't show progress bar till we receive some tasks
+            if total_tasks == 0:
+                return
+
+            # Get correct tqdm (notebook or CLI)
+            tqdm_pbar = self.get_tqdm_bar()
 
             # Use a lock to ensure only one thread can access and modify
             # the shared dictionaries at a time.
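The two additions give the handler a sensible cold start: no bar is drawn until the server reports at least one task, and the bar class is resolved through the cached helper above. A compressed sketch of that flow; the real handler keeps lock-guarded shared state across callbacks, so this single-shot version is illustrative only:

```python
def handle_progress(stages, get_tqdm_bar):
    # StageInfo-style objects expose num_tasks / num_completed_tasks.
    total_tasks = sum(stage.num_tasks for stage in stages)
    completed_tasks = sum(stage.num_completed_tasks for stage in stages)

    # Don't show a progress bar till we receive some tasks.
    if total_tasks == 0:
        return

    tqdm_pbar = get_tqdm_bar()  # cached notebook-or-CLI choice
    with tqdm_pbar(total=total_tasks) as pbar:
        pbar.update(completed_tasks)
```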
@@ -1044,13 +1076,11 @@ class DataprocSparkSession(SparkSession):
     @staticmethod
     def _sql_lazy_transformation(req):
         # Select SQL command
-
-
-
-
-
-
-        return False
+        try:
+            query = req.plan.command.sql_command.input.sql.query
+            return "select" in query.strip().lower().split()
+        except AttributeError:
+            return False
 
     def _repr_html_(self) -> str:
         if not self._active_s8s_session_id:
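The rewritten predicate walks the Spark Connect request proto (`req.plan.command.sql_command.input.sql.query`) and tokenizes the statement rather than matching raw substrings. Why tokenizing matters, shown with the same check extracted into a plain function:

```python
def is_lazy_select(query: str) -> bool:
    # Splitting on whitespace means a word like "selected_items" is not
    # mistaken for the SELECT keyword.
    return "select" in query.strip().lower().split()

assert is_lazy_select("SELECT * FROM sales")
assert is_lazy_select("INSERT INTO t SELECT id FROM src")
assert not is_lazy_select("DROP TABLE selected_items")
```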
@@ -1058,7 +1088,7 @@ class DataprocSparkSession(SparkSession):
             <div>No Active Dataproc Session</div>
             """
 
-        s8s_session = f"
+        s8s_session = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{self._active_s8s_session_id}"
         ui = f"{s8s_session}/sparkApplications/applications"
         return f"""
         <div>
@@ -1085,7 +1115,7 @@ class DataprocSparkSession(SparkSession):
         )
 
         url = (
-            f"
+            f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/"
             f"{self._active_s8s_session_id}/sparkApplications/application;"
            f"associatedSqlOperationId={operation_id}?project={self._project_id}"
        )
@@ -1177,20 +1207,52 @@ class DataprocSparkSession(SparkSession):
     def _get_active_session_file_path():
         return os.getenv("DATAPROC_SPARK_CONNECT_ACTIVE_SESSION_FILE_PATH")
 
-    def stop(self) -> None:
+    def stop(self, terminate: Optional[bool] = None) -> None:
+        """
+        Stop the Spark session and optionally terminate the server-side session.
+
+        Parameters
+        ----------
+        terminate : bool, optional
+            Control server-side termination behavior.
+
+            - None (default): Auto-detect based on session type
+
+              - Managed sessions (auto-generated ID): terminate server
+              - Named sessions (custom ID): client-side cleanup only
+
+            - True: Always terminate the server-side session
+            - False: Never terminate the server-side session (client cleanup only)
+
+        Examples
+        --------
+        Auto-detect termination behavior (existing behavior):
+
+        >>> spark.stop()
+
+        Force terminate a named session:
+
+        >>> spark.stop(terminate=True)
+
+        Prevent termination of a managed session:
+
+        >>> spark.stop(terminate=False)
+        """
         with DataprocSparkSession._lock:
             if DataprocSparkSession._active_s8s_session_id is not None:
-                #
-                if
-                #
-
-
-                        f"Stopping unmanaged session {DataprocSparkSession._active_s8s_session_id} without termination"
+                # Determine if we should terminate the server-side session
+                if terminate is None:
+                    # Auto-detect: managed sessions terminate, named sessions don't
+                    should_terminate = (
+                        not DataprocSparkSession._active_session_uses_custom_id
                    )
                else:
-
+                    should_terminate = terminate
+
+                if should_terminate:
+                    # Terminate the server-side session
                    logger.debug(
-                        f"Terminating
+                        f"Terminating session {DataprocSparkSession._active_s8s_session_id}"
                    )
                    terminate_s8s_session(
                        DataprocSparkSession._project_id,
@@ -1198,6 +1260,11 @@ class DataprocSparkSession(SparkSession):
                         DataprocSparkSession._active_s8s_session_id,
                         self._client_options,
                     )
+                else:
+                    # Client-side cleanup only
+                    logger.debug(
+                        f"Stopping session {DataprocSparkSession._active_s8s_session_id} without termination"
+                    )
 
             self._remove_stopped_session_from_file()
 
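Taken together, the two `stop()` hunks make session teardown explicit: the default preserves termination for managed sessions while leaving named sessions running, and the new flag overrides either way. A short workflow sketch grounded in the docstring above (the session ID is illustrative):

```python
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Named session: stop() defaults to client-side cleanup only, so the
# server-side session survives for other notebooks.
spark = DataprocSparkSession.builder.dataprocSessionId("shared-etl").getOrCreate()
spark.stop()

# Reattach later, then explicitly terminate the server-side session too.
spark = DataprocSparkSession.builder.dataprocSessionId("shared-etl").getOrCreate()
spark.stop(terminate=True)
```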
@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()
|
|
|
20
20
|
|
|
21
21
|
setup(
|
|
22
22
|
name="dataproc-spark-connect",
|
|
23
|
-
version="1.0.
|
|
23
|
+
version="1.0.0rc7",
|
|
24
24
|
description="Dataproc client library for Spark Connect",
|
|
25
25
|
long_description=long_description,
|
|
26
26
|
author="Google LLC",
|
|
@@ -31,7 +31,7 @@ setup(
|
|
|
31
31
|
"google-api-core>=2.19",
|
|
32
32
|
"google-cloud-dataproc>=5.18",
|
|
33
33
|
"packaging>=20.0",
|
|
34
|
-
"pyspark
|
|
34
|
+
"pyspark-client~=4.0.0",
|
|
35
35
|
"tqdm>=4.67",
|
|
36
36
|
"websockets>=14.0",
|
|
37
37
|
],
|
|