dataproc-spark-connect 0.9.0__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataproc_spark_connect-1.0.0/PKG-INFO +200 -0
- dataproc_spark_connect-1.0.0/README.md +177 -0
- dataproc_spark_connect-1.0.0/dataproc_spark_connect.egg-info/PKG-INFO +200 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/dataproc_spark_connect.egg-info/requires.txt +1 -1
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/client/core.py +5 -3
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/environment.py +25 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/exceptions.py +1 -1
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/session.py +531 -86
- dataproc_spark_connect-1.0.0/pyproject.toml +9 -0
- dataproc_spark_connect-1.0.0/setup.cfg +14 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/setup.py +2 -2
- dataproc_spark_connect-0.9.0/PKG-INFO +0 -105
- dataproc_spark_connect-0.9.0/README.md +0 -83
- dataproc_spark_connect-0.9.0/dataproc_spark_connect.egg-info/PKG-INFO +0 -105
- dataproc_spark_connect-0.9.0/pyproject.toml +0 -3
- dataproc_spark_connect-0.9.0/setup.cfg +0 -7
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/LICENSE +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/dataproc_spark_connect.egg-info/SOURCES.txt +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
dataproc_spark_connect-1.0.0/PKG-INFO
@@ -0,0 +1,200 @@
+Metadata-Version: 2.4
+Name: dataproc-spark-connect
+Version: 1.0.0
+Summary: Dataproc client library for Spark Connect
+Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
+Author: Google LLC
+License: Apache 2.0
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: google-api-core>=2.19
+Requires-Dist: google-cloud-dataproc>=5.18
+Requires-Dist: packaging>=20.0
+Requires-Dist: pyspark-client~=4.0.0
+Requires-Dist: tqdm>=4.67
+Requires-Dist: websockets>=14.0
+Dynamic: author
+Dynamic: description
+Dynamic: home-page
+Dynamic: license
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: summary
+
+# Dataproc Spark Connect Client
+
+A wrapper of the Apache [Spark Connect](https://spark.apache.org/spark-connect/)
+client with additional functionalities that allow applications to communicate
+with a remote Dataproc Spark Session using the Spark Connect protocol without
+requiring additional steps.
+
+## Install
+
+```sh
+pip install dataproc_spark_connect
+```
+
+## Uninstall
+
+```sh
+pip uninstall dataproc_spark_connect
+```
+
+## Setup
+
+This client requires permissions to
+manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
+
+If you are running the client outside of Google Cloud, you need to provide
+authentication credentials. Set the `GOOGLE_APPLICATION_CREDENTIALS` environment
+variable to point to
+your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+file.
+
+You can specify the project and region either via environment variables or directly
+in your code using the builder API:
+
+* Environment variables: `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_REGION`
+* Builder API: `.projectId()` and `.location()` methods (recommended)
+
+## Usage
+
+1. Install the latest version of Dataproc Spark Connect:
+
+```sh
+pip install -U dataproc-spark-connect
+```
+
+2. Add the required imports into your PySpark application or notebook and start
+a Spark session using the fluent API:
+
+```python
+from google.cloud.dataproc_spark_connect import DataprocSparkSession
+spark = DataprocSparkSession.builder.getOrCreate()
+```
+
+3. You can configure Spark properties using the `.config()` method:
+
+```python
+from google.cloud.dataproc_spark_connect import DataprocSparkSession
+spark = DataprocSparkSession.builder.config('spark.executor.memory', '4g').config('spark.executor.cores', '2').getOrCreate()
+```
+
+4. For advanced configuration, you can use the `Session` class to customize
+settings like subnetwork or other environment configurations:
+
+```python
+from google.cloud.dataproc_spark_connect import DataprocSparkSession
+from google.cloud.dataproc_v1 import Session
+session_config = Session()
+session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
+session_config.runtime_config.version = '3.0'
+spark = DataprocSparkSession.builder.projectId('my-project').location('us-central1').dataprocSessionConfig(session_config).getOrCreate()
+```
+
+### Reusing Named Sessions Across Notebooks
+
+Named sessions allow you to share a single Spark session across multiple notebooks, improving efficiency by avoiding repeated session startup times and reducing costs.
+
+To create or connect to a named session:
+
+1. Create a session with a custom ID in your first notebook:
+
+```python
+from google.cloud.dataproc_spark_connect import DataprocSparkSession
+session_id = 'my-ml-pipeline-session'
+spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+df = spark.createDataFrame([(1, 'data')], ['id', 'value'])
+df.show()
+```
+
+2. Reuse the same session in another notebook by specifying the same session ID:
+
+```python
+from google.cloud.dataproc_spark_connect import DataprocSparkSession
+session_id = 'my-ml-pipeline-session'
+spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+df = spark.createDataFrame([(2, 'more-data')], ['id', 'value'])
+df.show()
+```
+
+3. Session IDs must be 4-63 characters long, start with a lowercase letter, contain only lowercase letters, numbers, and hyphens, and not end with a hyphen.
+
+4. Named sessions persist until explicitly terminated or reach their configured TTL.
+
+5. A session with a given ID that is in a TERMINATED state cannot be reused. It must be deleted before a new session with the same ID can be created.
+
+### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+**Installation**: To use magic commands, install the required dependencies manually:
+```bash
+pip install dataproc-spark-connect
+pip install IPython sparksql-magic
+```
+
+1. Load the magic extension:
+```python
+%load_ext sparksql_magic
+```
+
+2. Configure default settings (optional):
+```python
+%config SparkSql.limit=20
+```
+
+3. Execute SQL queries:
+```python
+%%sparksql
+SELECT * FROM your_table
+```
+
+4. Advanced usage with options:
+```python
+# Cache results and create a view
+%%sparksql --cache --view result_view df
+SELECT * FROM your_table WHERE condition = true
+```
+
+Available options:
+- `--cache` / `-c`: Cache the DataFrame
+- `--eager` / `-e`: Cache with eager loading
+- `--view VIEW` / `-v VIEW`: Create a temporary view
+- `--limit N` / `-l N`: Override default row display limit
+- `variable_name`: Store result in a variable
+
+See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+**Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+```bash
+pip install dataproc-spark-connect
+```
+
+## Developing
+
+For development instructions see [guide](DEVELOPING.md).
+
+## Contributing
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+### Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+### Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
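
The session ID rules stated in the README above pack several constraints into one sentence. The following check is an illustrative sketch only; the regex and names are mine, not part of the package, and are derived directly from the stated rules.

```python
# Illustrative only -- not part of the package. One way to validate a session ID
# against the README's rules: 4-63 characters, starts with a lowercase letter,
# contains only lowercase letters, digits, and hyphens, and does not end with a hyphen.
import re

SESSION_ID_RE = re.compile(r"^[a-z][a-z0-9-]{2,61}[a-z0-9]$")

assert SESSION_ID_RE.match("my-ml-pipeline-session")
assert not SESSION_ID_RE.match("Bad-Session-")  # uppercase letters and a trailing hyphen
```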

dataproc_spark_connect-1.0.0/README.md
@@ -0,0 +1,177 @@
(177 added lines, identical to the README portion, lines 24-200, of the PKG-INFO shown above.)

dataproc_spark_connect-1.0.0/dataproc_spark_connect.egg-info/PKG-INFO
@@ -0,0 +1,200 @@
(200 added lines, identical to dataproc_spark_connect-1.0.0/PKG-INFO shown above.)

{dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/client/core.py
@@ -15,14 +15,14 @@ import logging
 
 import google
 import grpc
-from pyspark.sql.connect.client import
+from pyspark.sql.connect.client import DefaultChannelBuilder
 
 from . import proxy
 
 logger = logging.getLogger(__name__)
 
 
-class DataprocChannelBuilder(
+class DataprocChannelBuilder(DefaultChannelBuilder):
     """
     This is a helper class that is used to create a GRPC channel based on the given
     connection string per the documentation of Spark Connect.
@@ -88,7 +88,9 @@ class ProxiedChannel(grpc.Channel):
         self._proxy = proxy.DataprocSessionProxy(0, target_host)
         self._proxy.start()
         self._proxied_connect_url = f"sc://localhost:{self._proxy.port}"
-        self._wrapped =
+        self._wrapped = DefaultChannelBuilder(
+            self._proxied_connect_url
+        ).toChannel()
 
     def __enter__(self):
         return self
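
For context on the core.py change above: the new code delegates channel creation to pyspark's `DefaultChannelBuilder`, which turns a Spark Connect URL into a plain gRPC channel that `ProxiedChannel` then wraps. A minimal sketch of that pattern, assuming pyspark-client 4.x and a placeholder URL:

```python
# Minimal sketch of the pattern used in core.py above (assumes pyspark-client 4.x).
from pyspark.sql.connect.client import DefaultChannelBuilder

# Placeholder endpoint; the real code points this at its local Dataproc session proxy.
connect_url = "sc://localhost:15002"

# DefaultChannelBuilder parses the sc:// URL and toChannel() yields a grpc.Channel.
channel = DefaultChannelBuilder(connect_url).toChannel()
```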

{dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/environment.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import os
+import sys
 from typing import Callable, Tuple, List
 
 
@@ -46,6 +47,30 @@ def is_jetbrains_ide() -> bool:
     return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()
 
 
+def is_interactive():
+    try:
+        from IPython import get_ipython
+
+        if get_ipython() is not None:
+            return True
+    except ImportError:
+        pass
+
+    return hasattr(sys, "ps1") or sys.flags.interactive
+
+
+def is_terminal():
+    return sys.stdin.isatty()
+
+
+def is_interactive_terminal():
+    return is_interactive() and is_terminal()
+
+
+def is_dataproc_batch() -> bool:
+    return os.getenv("DATAPROC_WORKLOAD_TYPE") == "batch"
+
+
 def get_client_environment_label() -> str:
     """
     Map current environment to a standardized client label.
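
A short usage sketch for the helpers added above. This example is mine, not from the diff; the import path follows the file location given in the change list.

```python
# Usage sketch only: gate output behaviour on the new environment helpers.
from google.cloud.dataproc_spark_connect import environment

if environment.is_interactive_terminal():
    # An IPython/REPL session attached to a TTY.
    print("interactive terminal: progress output is appropriate")
elif environment.is_dataproc_batch():
    # DATAPROC_WORKLOAD_TYPE == "batch", i.e. running inside a Dataproc batch workload.
    print("batch workload: keep output to plain logs")
```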