dataproc-spark-connect 0.2.1__py2.py3-none-any.whl → 0.7.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,98 @@
+ Metadata-Version: 2.1
+ Name: dataproc-spark-connect
+ Version: 0.7.0
+ Summary: Dataproc client library for Spark Connect
+ Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
+ Author: Google LLC
+ License: Apache 2.0
+ License-File: LICENSE
+ Requires-Dist: google-api-core>=2.19
+ Requires-Dist: google-cloud-dataproc>=5.18
+ Requires-Dist: packaging>=20.0
+ Requires-Dist: pyspark[connect]>=3.5
+ Requires-Dist: tqdm>=4.67
+ Requires-Dist: websockets>=15.0
+
+ # Dataproc Spark Connect Client
+
+ A wrapper of the Apache [Spark Connect](https://spark.apache.org/spark-connect/)
+ client with additional functionality that allows applications to communicate
+ with a remote Dataproc Spark Session using the Spark Connect protocol without
+ requiring additional setup steps.
+
+ ## Install
+
+ ```sh
+ pip install dataproc_spark_connect
+ ```
+
+ ## Uninstall
+
+ ```sh
+ pip uninstall dataproc_spark_connect
+ ```
+
+ ## Setup
+
+ This client requires permissions to
+ manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
+ If you are running the client outside of Google Cloud, you must set the
+ following environment variables:
+
+ * `GOOGLE_CLOUD_PROJECT` - The Google Cloud project you use to run Spark
+   workloads.
+ * `GOOGLE_CLOUD_REGION` - The Compute
+   Engine [region](https://cloud.google.com/compute/docs/regions-zones#available)
+   where you run the Spark workload.
+ * `GOOGLE_APPLICATION_CREDENTIALS` -
+   Your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc).
+
+ ## Usage
+
+ 1. Install the latest versions of the Dataproc Python client and Dataproc Spark
+    Connect modules:
+
+    ```sh
+    pip install google_cloud_dataproc dataproc_spark_connect --force-reinstall
+    ```
+
+ 2. Add the required imports to your PySpark application or notebook and start
+    a Spark session with the following code instead of using
+    environment variables:
+
+    ```python
+    from google.cloud.dataproc_spark_connect import DataprocSparkSession
+    from google.cloud.dataproc_v1 import Session
+    session_config = Session()
+    session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
+    session_config.runtime_config.version = '2.2'
+    spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
+    ```
+
+ ## Developing
+
+ For development instructions, see the [development guide](DEVELOPING.md).
+
+ ## Contributing
+
+ We'd love to accept your patches and contributions to this project. There are
+ just a few small guidelines you need to follow.
+
+ ### Contributor License Agreement
+
+ Contributions to this project must be accompanied by a Contributor License
+ Agreement. You (or your employer) retain the copyright to your contribution;
+ this simply gives us permission to use and redistribute your contributions as
+ part of the project. Head over to <https://cla.developers.google.com> to see
+ your current agreements on file or to sign a new one.
+
+ You generally only need to submit a CLA once, so if you've already submitted one
+ (even if it was for a different project), you probably don't need to do it
+ again.
+
+ ### Code reviews
+
+ All submissions, including submissions by project members, require review. We
+ use GitHub pull requests for this purpose. Consult
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+ information on using pull requests.
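
As an editorial aside on the Usage snippet above (not part of the packaged README): the object returned by `getOrCreate()` behaves like a regular PySpark `SparkSession`, so a minimal smoke test might look like the following sketch, assuming `spark` was created as shown in step 2.

```python
# Editorial illustration, assuming `spark` was created by the README snippet above.
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
df.show()                                     # executes on the remote Dataproc session
print(spark.sql("SELECT 1 AS ok").collect())  # [Row(ok=1)]
spark.stop()                                  # releases the Spark Connect session
```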
@@ -0,0 +1,12 @@
+ google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
+ google/cloud/dataproc_spark_connect/exceptions.py,sha256=ilGyHD5M_yBQ3IC58-Y5miRGIQVJsLaNKvEGcHuk_BE,969
+ google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
+ google/cloud/dataproc_spark_connect/session.py,sha256=98Zrn0Vyl2ajcF5hltdSp8LgYTOzDa-eqeYxxmZVKds,26398
+ google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
+ google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
+ google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
+ dataproc_spark_connect-0.7.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ dataproc_spark_connect-0.7.0.dist-info/METADATA,sha256=fFJLyzjo3CKLx1d18U4i1csJEPgWxoAjok4qFghtOyE,3328
+ dataproc_spark_connect-0.7.0.dist-info/WHEEL,sha256=OpXWERl2xLPRHTvd2ZXo_iluPEQd8uSbYkJ53NAER_Y,109
+ dataproc_spark_connect-0.7.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-0.7.0.dist-info/RECORD,,
@@ -11,13 +11,19 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- from .session import DataprocSparkSession
+ import importlib.metadata
  import warnings

- warnings.warn(
-     "The package 'dataproc-spark-connect' has been renamed to 'google-spark-connect'. "
-     "'dataproc-spark-connect' will no longer be updated. "
-     "For help using 'google-spark-connect', "
-     "see https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python/blob/main/README.md. ",
-     DeprecationWarning,
- )
+ from .session import DataprocSparkSession
+
+ old_package_name = "google-spark-connect"
+ current_package_name = "dataproc-spark-connect"
+ try:
+     importlib.metadata.distribution(old_package_name)
+     warnings.warn(
+         f"Package '{old_package_name}' is already installed in your environment. "
+         f"This might cause conflicts with '{current_package_name}'. "
+         f"Consider uninstalling '{old_package_name}' and only install '{current_package_name}'."
+     )
+ except:
+     pass
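
A note on the conflict check above: `importlib.metadata.distribution()` raises `importlib.metadata.PackageNotFoundError` when the queried distribution is absent, so a narrower variant of the same check (an editorial sketch, not the code shipped in the wheel) could catch only that error instead of using a bare `except`:

```python
# Editorial sketch of the same conflict check with a narrower exception clause;
# not the code shipped in the wheel.
import importlib.metadata
import warnings

OLD_PACKAGE = "google-spark-connect"
CURRENT_PACKAGE = "dataproc-spark-connect"

try:
    importlib.metadata.distribution(OLD_PACKAGE)
except importlib.metadata.PackageNotFoundError:
    pass  # old package not installed, nothing to warn about
else:
    warnings.warn(
        f"'{OLD_PACKAGE}' is installed alongside '{CURRENT_PACKAGE}' and may conflict."
    )
```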
@@ -11,12 +11,16 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import logging
+
  import google
  import grpc
  from pyspark.sql.connect.client import ChannelBuilder

  from . import proxy

+ logger = logging.getLogger(__name__)
+

  class DataprocChannelBuilder(ChannelBuilder):
      """
@@ -36,6 +40,10 @@ class DataprocChannelBuilder(ChannelBuilder):
      True
      """

+     def __init__(self, url, is_active_callback=None):
+         self._is_active_callback = is_active_callback
+         super().__init__(url)
+
      def toChannel(self) -> grpc.Channel:
          """
          Applies the parameters of the connection string and creates a new
@@ -51,7 +59,7 @@ class DataprocChannelBuilder(ChannelBuilder):
          return self._proxied_channel()

      def _proxied_channel(self) -> grpc.Channel:
-         return ProxiedChannel(self.host)
+         return ProxiedChannel(self.host, self._is_active_callback)

      def _direct_channel(self) -> grpc.Channel:
          destination = f"{self.host}:{self.port}"
@@ -75,7 +83,8 @@ class DataprocChannelBuilder(ChannelBuilder):

  class ProxiedChannel(grpc.Channel):

-     def __init__(self, target_host):
+     def __init__(self, target_host, is_active_callback):
+         self._is_active_callback = is_active_callback
          self._proxy = proxy.DataprocSessionProxy(0, target_host)
          self._proxy.start()
          self._proxied_connect_url = f"sc://localhost:{self._proxy.port}"
@@ -94,20 +103,37 @@ class ProxiedChannel(grpc.Channel):
          self._proxy.stop()
          return ret

+     def _wrap_method(self, wrapped_method):
+         if self._is_active_callback is None:
+             return wrapped_method
+
+         def checked_method(*margs, **mkwargs):
+             if (
+                 self._is_active_callback is not None
+                 and not self._is_active_callback()
+             ):
+                 logger.warning(f"Session is no longer active")
+                 raise RuntimeError(
+                     "Session not active. Please create a new session"
+                 )
+             return wrapped_method(*margs, **mkwargs)
+
+         return checked_method
+
      def stream_stream(self, *args, **kwargs):
-         return self._wrapped.stream_stream(*args, **kwargs)
+         return self._wrap_method(self._wrapped.stream_stream(*args, **kwargs))

      def stream_unary(self, *args, **kwargs):
-         return self._wrapped.stream_unary(*args, **kwargs)
+         return self._wrap_method(self._wrapped.stream_unary(*args, **kwargs))

      def subscribe(self, *args, **kwargs):
-         return self._wrapped.subscribe(*args, **kwargs)
+         return self._wrap_method(self._wrapped.subscribe(*args, **kwargs))

      def unary_stream(self, *args, **kwargs):
-         return self._wrapped.unary_stream(*args, **kwargs)
+         return self._wrap_method(self._wrapped.unary_stream(*args, **kwargs))

      def unary_unary(self, *args, **kwargs):
-         return self._wrapped.unary_unary(*args, **kwargs)
+         return self._wrap_method(self._wrapped.unary_unary(*args, **kwargs))

      def unsubscribe(self, *args, **kwargs):
-         return self._wrapped.unsubscribe(*args, **kwargs)
+         return self._wrap_method(self._wrapped.unsubscribe(*args, **kwargs))
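
The hunks above thread an `is_active_callback` from `DataprocChannelBuilder` through `ProxiedChannel` so that every gRPC multi-callable is wrapped and fails fast with a clear error once the session is no longer active, rather than issuing RPCs over a dead channel. A minimal standalone sketch of that guard pattern follows; it is an editorial illustration only, and `guard_with`, `active`, and `send` are hypothetical names, not package APIs.

```python
# Editorial sketch of the liveness-guard pattern used by _wrap_method above;
# the names here are hypothetical, not part of the package.
from typing import Callable


def guard_with(is_active: Callable[[], bool], wrapped: Callable) -> Callable:
    """Return a callable that raises once is_active() reports False."""

    def checked(*args, **kwargs):
        if not is_active():
            raise RuntimeError("Session not active. Please create a new session")
        return wrapped(*args, **kwargs)

    return checked


active = True
send = guard_with(lambda: active, lambda payload: f"sent {payload}")
print(send("ping"))  # "sent ping"
active = False
# send("ping") would now raise RuntimeError instead of using a dead channel.
```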
@@ -18,7 +18,6 @@ import contextlib
  import logging
  import socket
  import threading
- import time

  import websockets.sync.client as websocketclient

@@ -81,6 +80,7 @@ def connect_tcp_bridge(hostname):
      return websocketclient.connect(
          f"wss://{hostname}/{path}",
          additional_headers={"Authorization": f"Bearer {creds.token}"},
+         open_timeout=30,
      )


@@ -94,6 +94,7 @@ def forward_bytes(name, from_sock, to_sock):
      This method is intended to be run in a separate thread of execution.

      Args:
+         name: forwarding thread name
          from_sock: A socket-like object to stream bytes from.
          to_sock: A socket-like object to stream bytes to.
      """
@@ -101,8 +102,11 @@ def forward_bytes(name, from_sock, to_sock):
          try:
              bs = from_sock.recv(1024)
              if not bs:
+                 to_sock.close()
                  return
-             while bs:
+             attempt = 0
+             while bs and (attempt < 10):
+                 attempt += 1
                  try:
                      to_sock.send(bs)
                      bs = None
@@ -110,6 +114,8 @@ def forward_bytes(name, from_sock, to_sock):
                      # On timeouts during a send, we retry just the send
                      # to make sure we don't lose any bytes.
                      pass
+             if bs:
+                 raise Exception(f"Failed to forward bytes for {name}")
          except TimeoutError:
              # On timeouts during a receive, we retry the entire flow.
              pass
@@ -125,7 +131,7 @@ def connect_sockets(conn_number, from_sock, to_sock):
      This method continuously streams bytes in both directions between the
      given `from_sock` and `to_sock` socket-like objects.

-     The caller is responsible for creating and closing the supplied socekts.
+     The caller is responsible for creating and closing the supplied sockets.
      """
      forward_name = f"{conn_number}-forward"
      t1 = threading.Thread(
@@ -157,12 +163,17 @@ def forward_connection(conn_number, conn, addr, target_host):
      Both the supplied incoming connection (`conn`) and the created outgoing
      connection are automatically closed when this method terminates.

-     This method should be run inside of a daemon thread so that it will not
+     This method should be run inside a daemon thread so that it will not
      block program termination.
      """
      with conn:
          with connect_tcp_bridge(target_host) as websocket_conn:
              backend_socket = bridged_socket(websocket_conn)
+             # Set a timeout on how long we will allow send/recv calls to block
+             #
+             # The code that reads and writes to this connection will retry
+             # on timeouts, so this is a safe change.
+             conn.settimeout(10)
              connect_sockets(conn_number, conn, backend_socket)


@@ -210,14 +221,6 @@ class DataprocSessionProxy(object):
          s.release()
          while not self._killed:
              conn, addr = frontend_socket.accept()
-             # Set a timeout on how long we will allow send/recv calls to block
-             #
-             # The code that reads and writes to this connection will retry
-             # on timeouts, so this is a safe change.
-             #
-             # The chosen timeout is a very short one because it allows us
-             # to more quickly detect when a connection has been closed.
-             conn.settimeout(1)
              logger.debug(f"Accepted a connection from {addr}...")
              self._conn_number += 1
              threading.Thread(
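
The proxy hunks above bound the number of send retries, close the peer socket on EOF, and move the 10-second socket timeout next to the bridge connection. The sketch below exercises the same retry-on-timeout copy loop against an in-process socket pair; it is an editorial illustration, and `forward_once` and its limits are made up for the example rather than taken from the package.

```python
# Editorial sketch of the retry-on-timeout copy loop from forward_bytes;
# forward_once and its limits are illustrative, not package APIs.
import socket


def forward_once(from_sock, to_sock, max_send_attempts=10):
    """Copy one chunk from from_sock to to_sock, retrying timed-out sends."""
    chunk = from_sock.recv(1024)
    if not chunk:
        to_sock.close()  # propagate EOF to the other side
        return False
    for _ in range(max_send_attempts):
        try:
            to_sock.send(chunk)
            return True
        except TimeoutError:
            continue  # retry only the send so no bytes are dropped
    raise RuntimeError("failed to forward bytes")


a, b = socket.socketpair()
c, d = socket.socketpair()
a.sendall(b"hello")
forward_once(b, c)       # copies the chunk from the a/b pair to the c/d pair
print(d.recv(1024))      # b"hello"
```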
@@ -0,0 +1,27 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ class DataprocSparkConnectException(Exception):
+     """A custom exception class to only print the error messages.
+     This would be used for exceptions where the stack trace
+     doesn't provide any additional information.
+     """
+
+     def __init__(self, message):
+         self.message = message
+         super().__init__(message)
+
+     def _render_traceback_(self):
+         return self.message
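
`_render_traceback_` is the hook IPython consults when displaying an exception, which is how this class keeps notebook output to just the message. A hypothetical usage sketch follows; the error text is made up for illustration.

```python
# Hypothetical usage of DataprocSparkConnectException; the message is illustrative.
from google.cloud.dataproc_spark_connect.exceptions import (
    DataprocSparkConnectException,
)

try:
    raise DataprocSparkConnectException("Session creation failed: quota exceeded")
except DataprocSparkConnectException as e:
    print(e.message)  # stored on the instance and passed to Exception.__init__
```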
@@ -0,0 +1,48 @@
+ import json
+ import logging
+ import os
+ import tempfile
+
+ from packaging.requirements import Requirement
+
+ logger = logging.getLogger(__name__)
+
+
+ class PyPiArtifacts:
+     """
+     This is a helper class to serialize the PYPI package installation request with a "magic" file name
+     that Spark Connect server understands
+     """
+
+     @staticmethod
+     def __try_parsing_package(packages: set[str]) -> list[Requirement]:
+         reqs = [Requirement(p) for p in packages]
+         if 0 in [len(req.specifier) for req in reqs]:
+             logger.info("It is recommended to pin the version of the package")
+         return reqs
+
+     def __init__(self, packages: set[str]):
+         self.requirements = PyPiArtifacts.__try_parsing_package(packages)
+
+     def write_packages_config(self, s8s_session_uuid: str) -> str:
+         """
+         Can't use the same file-name as Spark throws exception that file already exists
+         Keep the filename/format in sync with server
+         """
+         dependencies = {
+             "version": "0.5",
+             "packageType": "PYPI",
+             "packages": [str(req) for req in self.requirements],
+         }
+
+         file_path = os.path.join(
+             tempfile.gettempdir(),
+             s8s_session_uuid,
+             "add-artifacts-1729-" + self.__str__() + ".json",
+         )
+
+         os.makedirs(os.path.dirname(file_path), exist_ok=True)
+         with open(file_path, "w") as json_file:
+             json.dump(dependencies, json_file, indent=4)
+         logger.debug("Dumping dependencies request in file: " + file_path)
+         return file_path
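
To make the flow above concrete: the helper is constructed with a set of requirement strings and writes the JSON install request under a per-session temp directory. A hedged usage sketch, where the session UUID is only a placeholder:

```python
# Editorial sketch; the session UUID below is a placeholder, not a real session.
from google.cloud.dataproc_spark_connect.pypi_artifacts import PyPiArtifacts

artifacts = PyPiArtifacts({"pandas==2.2.2", "pyarrow"})  # unpinned specs only log a hint
path = artifacts.write_packages_config("00000000-0000-0000-0000-000000000000")
print(path)  # .../add-artifacts-1729-<object repr>.json containing the PYPI package list
```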