dataproc-spark-connect 0.8.3.tar.gz → 1.0.0.tar.gz

This diff shows the contents of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two package versions as published.
Files changed (24)
  1. dataproc_spark_connect-1.0.0/PKG-INFO +200 -0
  2. dataproc_spark_connect-1.0.0/README.md +177 -0
  3. dataproc_spark_connect-1.0.0/dataproc_spark_connect.egg-info/PKG-INFO +200 -0
  4. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/dataproc_spark_connect.egg-info/SOURCES.txt +1 -0
  5. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/dataproc_spark_connect.egg-info/requires.txt +1 -1
  6. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/client/core.py +5 -3
  7. dataproc_spark_connect-1.0.0/google/cloud/dataproc_spark_connect/environment.py +101 -0
  8. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/exceptions.py +1 -1
  9. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/session.py +644 -76
  10. dataproc_spark_connect-1.0.0/pyproject.toml +9 -0
  11. dataproc_spark_connect-1.0.0/setup.cfg +14 -0
  12. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/setup.py +2 -2
  13. dataproc_spark_connect-0.8.3/PKG-INFO +0 -105
  14. dataproc_spark_connect-0.8.3/README.md +0 -83
  15. dataproc_spark_connect-0.8.3/dataproc_spark_connect.egg-info/PKG-INFO +0 -105
  16. dataproc_spark_connect-0.8.3/pyproject.toml +0 -3
  17. dataproc_spark_connect-0.8.3/setup.cfg +0 -7
  18. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/LICENSE +0 -0
  19. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
  20. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
  21. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
  22. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
  23. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
  24. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
dataproc_spark_connect-1.0.0/PKG-INFO
@@ -0,0 +1,200 @@
+ Metadata-Version: 2.4
+ Name: dataproc-spark-connect
+ Version: 1.0.0
+ Summary: Dataproc client library for Spark Connect
+ Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
+ Author: Google LLC
+ License: Apache 2.0
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: google-api-core>=2.19
+ Requires-Dist: google-cloud-dataproc>=5.18
+ Requires-Dist: packaging>=20.0
+ Requires-Dist: pyspark-client~=4.0.0
+ Requires-Dist: tqdm>=4.67
+ Requires-Dist: websockets>=14.0
+ Dynamic: author
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: summary
+
+ # Dataproc Spark Connect Client
+
+ A wrapper of the Apache [Spark Connect](https://spark.apache.org/spark-connect/)
+ client with additional functionalities that allow applications to communicate
+ with a remote Dataproc Spark Session using the Spark Connect protocol without
+ requiring additional steps.
+
+ ## Install
+
+ ```sh
+ pip install dataproc_spark_connect
+ ```
+
+ ## Uninstall
+
+ ```sh
+ pip uninstall dataproc_spark_connect
+ ```
+
+ ## Setup
+
+ This client requires permissions to
+ manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
+
+ If you are running the client outside of Google Cloud, you need to provide
+ authentication credentials. Set the `GOOGLE_APPLICATION_CREDENTIALS` environment
+ variable to point to
+ your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ file.
+
+ You can specify the project and region either via environment variables or directly
+ in your code using the builder API:
+
+ * Environment variables: `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_REGION`
+ * Builder API: `.projectId()` and `.location()` methods (recommended)
+
+ ## Usage
+
+ 1. Install the latest version of Dataproc Spark Connect:
+
+ ```sh
+ pip install -U dataproc-spark-connect
+ ```
+
+ 2. Add the required imports into your PySpark application or notebook and start
+ a Spark session using the fluent API:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.getOrCreate()
+ ```
+
+ 3. You can configure Spark properties using the `.config()` method:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.config('spark.executor.memory', '4g').config('spark.executor.cores', '2').getOrCreate()
+ ```
+
+ 4. For advanced configuration, you can use the `Session` class to customize
+ settings like subnetwork or other environment configurations:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ from google.cloud.dataproc_v1 import Session
+ session_config = Session()
+ session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
+ session_config.runtime_config.version = '3.0'
+ spark = DataprocSparkSession.builder.projectId('my-project').location('us-central1').dataprocSessionConfig(session_config).getOrCreate()
+ ```
+
+ ### Reusing Named Sessions Across Notebooks
+
+ Named sessions allow you to share a single Spark session across multiple notebooks, improving efficiency by avoiding repeated session startup times and reducing costs.
+
+ To create or connect to a named session:
+
+ 1. Create a session with a custom ID in your first notebook:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(1, 'data')], ['id', 'value'])
+ df.show()
+ ```
+
+ 2. Reuse the same session in another notebook by specifying the same session ID:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(2, 'more-data')], ['id', 'value'])
+ df.show()
+ ```
+
+ 3. Session IDs must be 4-63 characters long, start with a lowercase letter, contain only lowercase letters, numbers, and hyphens, and not end with a hyphen.
+
+ 4. Named sessions persist until explicitly terminated or reach their configured TTL.
+
+ 5. A session with a given ID that is in a TERMINATED state cannot be reused. It must be deleted before a new session with the same ID can be created.
+
+ ### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+ The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+ **Installation**: To use magic commands, install the required dependencies manually:
+ ```bash
+ pip install dataproc-spark-connect
+ pip install IPython sparksql-magic
+ ```
+
+ 1. Load the magic extension:
+ ```python
+ %load_ext sparksql_magic
+ ```
+
+ 2. Configure default settings (optional):
+ ```python
+ %config SparkSql.limit=20
+ ```
+
+ 3. Execute SQL queries:
+ ```python
+ %%sparksql
+ SELECT * FROM your_table
+ ```
+
+ 4. Advanced usage with options:
+ ```python
+ # Cache results and create a view
+ %%sparksql --cache --view result_view df
+ SELECT * FROM your_table WHERE condition = true
+ ```
+
+ Available options:
+ - `--cache` / `-c`: Cache the DataFrame
+ - `--eager` / `-e`: Cache with eager loading
+ - `--view VIEW` / `-v VIEW`: Create a temporary view
+ - `--limit N` / `-l N`: Override default row display limit
+ - `variable_name`: Store result in a variable
+
+ See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+ **Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+ ```bash
+ pip install dataproc-spark-connect
+ ```
+
+ ## Developing
+
+ For development instructions see [guide](DEVELOPING.md).
+
+ ## Contributing
+
+ We'd love to accept your patches and contributions to this project. There are
+ just a few small guidelines you need to follow.
+
+ ### Contributor License Agreement
+
+ Contributions to this project must be accompanied by a Contributor License
+ Agreement. You (or your employer) retain the copyright to your contribution;
+ this simply gives us permission to use and redistribute your contributions as
+ part of the project. Head over to <https://cla.developers.google.com> to see
+ your current agreements on file or to sign a new one.
+
+ You generally only need to submit a CLA once, so if you've already submitted one
+ (even if it was for a different project), you probably don't need to do it
+ again.
+
+ ### Code reviews
+
+ All submissions, including submissions by project members, require review. We
+ use GitHub pull requests for this purpose. Consult
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+ information on using pull requests.
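The named-session rules in the README above (IDs of 4-63 characters, starting with a lowercase letter, containing only lowercase letters, digits, and hyphens, and not ending with a hyphen) map directly onto a simple client-side check. A minimal sketch, assuming the documented constraints are the only ones that matter; the service still performs its own validation:

```python
import re

# Pattern mirroring the documented session ID rules (illustrative only).
SESSION_ID_PATTERN = re.compile(r"^[a-z][a-z0-9-]{2,61}[a-z0-9]$")


def is_valid_session_id(session_id: str) -> bool:
    """Return True if the ID satisfies the rules listed in the README."""
    return SESSION_ID_PATTERN.fullmatch(session_id) is not None


assert is_valid_session_id("my-ml-pipeline-session")
assert not is_valid_session_id("My-Session")  # must start with a lowercase letter
assert not is_valid_session_id("abc-")        # must not end with a hyphen
```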
dataproc_spark_connect-1.0.0/README.md
@@ -0,0 +1,177 @@
+ # Dataproc Spark Connect Client
+
+ A wrapper of the Apache [Spark Connect](https://spark.apache.org/spark-connect/)
+ client with additional functionalities that allow applications to communicate
+ with a remote Dataproc Spark Session using the Spark Connect protocol without
+ requiring additional steps.
+
+ ## Install
+
+ ```sh
+ pip install dataproc_spark_connect
+ ```
+
+ ## Uninstall
+
+ ```sh
+ pip uninstall dataproc_spark_connect
+ ```
+
+ ## Setup
+
+ This client requires permissions to
+ manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
+
+ If you are running the client outside of Google Cloud, you need to provide
+ authentication credentials. Set the `GOOGLE_APPLICATION_CREDENTIALS` environment
+ variable to point to
+ your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ file.
+
+ You can specify the project and region either via environment variables or directly
+ in your code using the builder API:
+
+ * Environment variables: `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_REGION`
+ * Builder API: `.projectId()` and `.location()` methods (recommended)
+
+ ## Usage
+
+ 1. Install the latest version of Dataproc Spark Connect:
+
+ ```sh
+ pip install -U dataproc-spark-connect
+ ```
+
+ 2. Add the required imports into your PySpark application or notebook and start
+ a Spark session using the fluent API:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.getOrCreate()
+ ```
+
+ 3. You can configure Spark properties using the `.config()` method:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.config('spark.executor.memory', '4g').config('spark.executor.cores', '2').getOrCreate()
+ ```
+
+ 4. For advanced configuration, you can use the `Session` class to customize
+ settings like subnetwork or other environment configurations:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ from google.cloud.dataproc_v1 import Session
+ session_config = Session()
+ session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
+ session_config.runtime_config.version = '3.0'
+ spark = DataprocSparkSession.builder.projectId('my-project').location('us-central1').dataprocSessionConfig(session_config).getOrCreate()
+ ```
+
+ ### Reusing Named Sessions Across Notebooks
+
+ Named sessions allow you to share a single Spark session across multiple notebooks, improving efficiency by avoiding repeated session startup times and reducing costs.
+
+ To create or connect to a named session:
+
+ 1. Create a session with a custom ID in your first notebook:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(1, 'data')], ['id', 'value'])
+ df.show()
+ ```
+
+ 2. Reuse the same session in another notebook by specifying the same session ID:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(2, 'more-data')], ['id', 'value'])
+ df.show()
+ ```
+
+ 3. Session IDs must be 4-63 characters long, start with a lowercase letter, contain only lowercase letters, numbers, and hyphens, and not end with a hyphen.
+
+ 4. Named sessions persist until explicitly terminated or reach their configured TTL.
+
+ 5. A session with a given ID that is in a TERMINATED state cannot be reused. It must be deleted before a new session with the same ID can be created.
+
+ ### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+ The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+ **Installation**: To use magic commands, install the required dependencies manually:
+ ```bash
+ pip install dataproc-spark-connect
+ pip install IPython sparksql-magic
+ ```
+
+ 1. Load the magic extension:
+ ```python
+ %load_ext sparksql_magic
+ ```
+
+ 2. Configure default settings (optional):
+ ```python
+ %config SparkSql.limit=20
+ ```
+
+ 3. Execute SQL queries:
+ ```python
+ %%sparksql
+ SELECT * FROM your_table
+ ```
+
+ 4. Advanced usage with options:
+ ```python
+ # Cache results and create a view
+ %%sparksql --cache --view result_view df
+ SELECT * FROM your_table WHERE condition = true
+ ```
+
+ Available options:
+ - `--cache` / `-c`: Cache the DataFrame
+ - `--eager` / `-e`: Cache with eager loading
+ - `--view VIEW` / `-v VIEW`: Create a temporary view
+ - `--limit N` / `-l N`: Override default row display limit
+ - `variable_name`: Store result in a variable
+
+ See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+ **Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+ ```bash
+ pip install dataproc-spark-connect
+ ```
+
+ ## Developing
+
+ For development instructions see [guide](DEVELOPING.md).
+
+ ## Contributing
+
+ We'd love to accept your patches and contributions to this project. There are
+ just a few small guidelines you need to follow.
+
+ ### Contributor License Agreement
+
+ Contributions to this project must be accompanied by a Contributor License
+ Agreement. You (or your employer) retain the copyright to your contribution;
+ this simply gives us permission to use and redistribute your contributions as
+ part of the project. Head over to <https://cla.developers.google.com> to see
+ your current agreements on file or to sign a new one.
+
+ You generally only need to submit a CLA once, so if you've already submitted one
+ (even if it was for a different project), you probably don't need to do it
+ again.
+
+ ### Code reviews
+
+ All submissions, including submissions by project members, require review. We
+ use GitHub pull requests for this purpose. Consult
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+ information on using pull requests.
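The Setup section above describes two ways to supply the project and region but shows no code for them. A short sketch of both approaches, where the project ID and region are placeholders rather than values taken from the package:

```python
import os

from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Option 1: environment variables (names taken from the README; values are placeholders).
os.environ["GOOGLE_CLOUD_PROJECT"] = "my-project"
os.environ["GOOGLE_CLOUD_REGION"] = "us-central1"
spark = DataprocSparkSession.builder.getOrCreate()

# Option 2 (recommended by the README): the builder API.
spark = (
    DataprocSparkSession.builder
    .projectId("my-project")
    .location("us-central1")
    .getOrCreate()
)
```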
dataproc_spark_connect-1.0.0/dataproc_spark_connect.egg-info/PKG-INFO
@@ -0,0 +1,200 @@
+ Metadata-Version: 2.4
+ Name: dataproc-spark-connect
+ Version: 1.0.0
+ Summary: Dataproc client library for Spark Connect
+ Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
+ Author: Google LLC
+ License: Apache 2.0
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: google-api-core>=2.19
+ Requires-Dist: google-cloud-dataproc>=5.18
+ Requires-Dist: packaging>=20.0
+ Requires-Dist: pyspark-client~=4.0.0
+ Requires-Dist: tqdm>=4.67
+ Requires-Dist: websockets>=14.0
+ Dynamic: author
+ Dynamic: description
+ Dynamic: home-page
+ Dynamic: license
+ Dynamic: license-file
+ Dynamic: requires-dist
+ Dynamic: summary
+
+ # Dataproc Spark Connect Client
+
+ A wrapper of the Apache [Spark Connect](https://spark.apache.org/spark-connect/)
+ client with additional functionalities that allow applications to communicate
+ with a remote Dataproc Spark Session using the Spark Connect protocol without
+ requiring additional steps.
+
+ ## Install
+
+ ```sh
+ pip install dataproc_spark_connect
+ ```
+
+ ## Uninstall
+
+ ```sh
+ pip uninstall dataproc_spark_connect
+ ```
+
+ ## Setup
+
+ This client requires permissions to
+ manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
+
+ If you are running the client outside of Google Cloud, you need to provide
+ authentication credentials. Set the `GOOGLE_APPLICATION_CREDENTIALS` environment
+ variable to point to
+ your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ file.
+
+ You can specify the project and region either via environment variables or directly
+ in your code using the builder API:
+
+ * Environment variables: `GOOGLE_CLOUD_PROJECT` and `GOOGLE_CLOUD_REGION`
+ * Builder API: `.projectId()` and `.location()` methods (recommended)
+
+ ## Usage
+
+ 1. Install the latest version of Dataproc Spark Connect:
+
+ ```sh
+ pip install -U dataproc-spark-connect
+ ```
+
+ 2. Add the required imports into your PySpark application or notebook and start
+ a Spark session using the fluent API:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.getOrCreate()
+ ```
+
+ 3. You can configure Spark properties using the `.config()` method:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ spark = DataprocSparkSession.builder.config('spark.executor.memory', '4g').config('spark.executor.cores', '2').getOrCreate()
+ ```
+
+ 4. For advanced configuration, you can use the `Session` class to customize
+ settings like subnetwork or other environment configurations:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ from google.cloud.dataproc_v1 import Session
+ session_config = Session()
+ session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
+ session_config.runtime_config.version = '3.0'
+ spark = DataprocSparkSession.builder.projectId('my-project').location('us-central1').dataprocSessionConfig(session_config).getOrCreate()
+ ```
+
+ ### Reusing Named Sessions Across Notebooks
+
+ Named sessions allow you to share a single Spark session across multiple notebooks, improving efficiency by avoiding repeated session startup times and reducing costs.
+
+ To create or connect to a named session:
+
+ 1. Create a session with a custom ID in your first notebook:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(1, 'data')], ['id', 'value'])
+ df.show()
+ ```
+
+ 2. Reuse the same session in another notebook by specifying the same session ID:
+
+ ```python
+ from google.cloud.dataproc_spark_connect import DataprocSparkSession
+ session_id = 'my-ml-pipeline-session'
+ spark = DataprocSparkSession.builder.dataprocSessionId(session_id).getOrCreate()
+ df = spark.createDataFrame([(2, 'more-data')], ['id', 'value'])
+ df.show()
+ ```
+
+ 3. Session IDs must be 4-63 characters long, start with a lowercase letter, contain only lowercase letters, numbers, and hyphens, and not end with a hyphen.
+
+ 4. Named sessions persist until explicitly terminated or reach their configured TTL.
+
+ 5. A session with a given ID that is in a TERMINATED state cannot be reused. It must be deleted before a new session with the same ID can be created.
+
+ ### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+ The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+ **Installation**: To use magic commands, install the required dependencies manually:
+ ```bash
+ pip install dataproc-spark-connect
+ pip install IPython sparksql-magic
+ ```
+
+ 1. Load the magic extension:
+ ```python
+ %load_ext sparksql_magic
+ ```
+
+ 2. Configure default settings (optional):
+ ```python
+ %config SparkSql.limit=20
+ ```
+
+ 3. Execute SQL queries:
+ ```python
+ %%sparksql
+ SELECT * FROM your_table
+ ```
+
+ 4. Advanced usage with options:
+ ```python
+ # Cache results and create a view
+ %%sparksql --cache --view result_view df
+ SELECT * FROM your_table WHERE condition = true
+ ```
+
+ Available options:
+ - `--cache` / `-c`: Cache the DataFrame
+ - `--eager` / `-e`: Cache with eager loading
+ - `--view VIEW` / `-v VIEW`: Create a temporary view
+ - `--limit N` / `-l N`: Override default row display limit
+ - `variable_name`: Store result in a variable
+
+ See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+ **Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+ ```bash
+ pip install dataproc-spark-connect
+ ```
+
+ ## Developing
+
+ For development instructions see [guide](DEVELOPING.md).
+
+ ## Contributing
+
+ We'd love to accept your patches and contributions to this project. There are
+ just a few small guidelines you need to follow.
+
+ ### Contributor License Agreement
+
+ Contributions to this project must be accompanied by a Contributor License
+ Agreement. You (or your employer) retain the copyright to your contribution;
+ this simply gives us permission to use and redistribute your contributions as
+ part of the project. Head over to <https://cla.developers.google.com> to see
+ your current agreements on file or to sign a new one.
+
+ You generally only need to submit a CLA once, so if you've already submitted one
+ (even if it was for a different project), you probably don't need to do it
+ again.
+
+ ### Code reviews
+
+ All submissions, including submissions by project members, require review. We
+ use GitHub pull requests for this purpose. Consult
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+ information on using pull requests.
dataproc_spark_connect.egg-info/SOURCES.txt
@@ -9,6 +9,7 @@ dataproc_spark_connect.egg-info/dependency_links.txt
  dataproc_spark_connect.egg-info/requires.txt
  dataproc_spark_connect.egg-info/top_level.txt
  google/cloud/dataproc_spark_connect/__init__.py
+ google/cloud/dataproc_spark_connect/environment.py
  google/cloud/dataproc_spark_connect/exceptions.py
  google/cloud/dataproc_spark_connect/pypi_artifacts.py
  google/cloud/dataproc_spark_connect/session.py
dataproc_spark_connect.egg-info/requires.txt
@@ -1,6 +1,6 @@
  google-api-core>=2.19
  google-cloud-dataproc>=5.18
  packaging>=20.0
- pyspark[connect]~=3.5.1
+ pyspark-client~=4.0.0
  tqdm>=4.67
  websockets>=14.0
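The requires.txt hunk above swaps the full `pyspark[connect]~=3.5.1` dependency for the slimmer `pyspark-client~=4.0.0` distribution from Spark 4.0. A small, illustrative way to check which of the two distributions an environment resolved to after upgrading; nothing here is part of the package itself:

```python
from importlib.metadata import PackageNotFoundError, version

# dataproc-spark-connect 1.0.0 should pull in pyspark-client; older installs
# may still carry the full pyspark distribution from the 0.8.x dependency.
for dist in ("dataproc-spark-connect", "pyspark-client", "pyspark"):
    try:
        print(f"{dist}: {version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
```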
google/cloud/dataproc_spark_connect/client/core.py
@@ -15,14 +15,14 @@ import logging
  
  import google
  import grpc
- from pyspark.sql.connect.client import ChannelBuilder
+ from pyspark.sql.connect.client import DefaultChannelBuilder
  
  from . import proxy
  
  logger = logging.getLogger(__name__)
  
  
- class DataprocChannelBuilder(ChannelBuilder):
+ class DataprocChannelBuilder(DefaultChannelBuilder):
      """
      This is a helper class that is used to create a GRPC channel based on the given
      connection string per the documentation of Spark Connect.
@@ -88,7 +88,9 @@ class ProxiedChannel(grpc.Channel):
          self._proxy = proxy.DataprocSessionProxy(0, target_host)
          self._proxy.start()
          self._proxied_connect_url = f"sc://localhost:{self._proxy.port}"
-         self._wrapped = ChannelBuilder(self._proxied_connect_url).toChannel()
+         self._wrapped = DefaultChannelBuilder(
+             self._proxied_connect_url
+         ).toChannel()
  
      def __enter__(self):
          return self
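The core.py hunks track a rename in the Spark Connect client API used by Spark 4.0: `ChannelBuilder` becomes `DefaultChannelBuilder`, while the `toChannel()` call and the `sc://host:port` connection string are unchanged. A minimal standalone sketch of the renamed builder, assuming a Spark Connect endpoint is reachable on a local port (15002 is only an illustrative choice):

```python
from pyspark.sql.connect.client import DefaultChannelBuilder

# Same pattern the wrapper uses internally: turn a Spark Connect URL into a
# gRPC channel. Channel creation is lazy, so nothing connects until it is used.
connect_url = "sc://localhost:15002"  # placeholder endpoint
channel = DefaultChannelBuilder(connect_url).toChannel()
print(type(channel))  # a grpc.Channel
channel.close()
```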