dataproc-spark-connect 0.1.0__py2.py3-none-any.whl → 0.2.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
+ Metadata-Version: 2.1
+ Name: dataproc-spark-connect
+ Version: 0.2.0
+ Summary: Dataproc client library for Spark Connect
+ Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
+ Author: Google LLC
+ License: Apache 2.0
+ License-File: LICENSE
+ Requires-Dist: google-api-core>=2.19.1
+ Requires-Dist: google-cloud-dataproc>=5.15.1
+ Requires-Dist: wheel
+ Requires-Dist: websockets
+ Requires-Dist: pyspark>=3.5
+ Requires-Dist: pandas
+ Requires-Dist: pyarrow
+
+ # Dataproc Spark Connect Client
+
+ A wrapper of the Apache [Spark Connect](https://spark.apache.org/spark-connect/) client with
+ additional functionality that allows applications to communicate with a remote Dataproc
+ Spark cluster using the Spark Connect protocol without requiring additional steps.
+
+ ## Install
+
+ .. code-block:: console
+
+     pip install dataproc_spark_connect
+
+ ## Uninstall
+
+ .. code-block:: console
+
+     pip uninstall dataproc_spark_connect
+
+
+ ## Setup
+ This client requires permissions to manage [Dataproc sessions and session templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
+ If you are running the client outside of Google Cloud, you must set the following environment variables (an example follows the list):
+
+ * GOOGLE_CLOUD_PROJECT - The Google Cloud project you use to run Spark workloads
+ * GOOGLE_CLOUD_REGION - The Compute Engine [region](https://cloud.google.com/compute/docs/regions-zones#available) where you run the Spark workload
+ * GOOGLE_APPLICATION_CREDENTIALS - Your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc)
+ * DATAPROC_SPARK_CONNECT_SESSION_DEFAULT_CONFIG (Optional) - The config location, such as `tests/integration/resources/session.textproto`
+
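When experimenting outside Google Cloud from a Python notebook, these variables can also be set in-process before the session is created. This is a minimal sketch, not part of the packaged README; the project, region, and key path are placeholders:

.. code-block:: python

    import os

    # Placeholder values; substitute your own project, region, and credentials file.
    os.environ["GOOGLE_CLOUD_PROJECT"] = "my-project"
    os.environ["GOOGLE_CLOUD_REGION"] = "us-central1"
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service-account.json"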
+ ## Usage
+
+ 1. Install the latest versions of the Dataproc Python client and Dataproc Spark Connect modules:
+
+    .. code-block:: console
+
+        pip install google_cloud_dataproc --force-reinstall
+        pip install dataproc_spark_connect --force-reinstall
+
+ 2. Add the required import into your PySpark application or notebook:
+
+    .. code-block:: python
+
+        from google.cloud.dataproc_spark_connect import DataprocSparkSession
+
+ 3. There are two ways to create a Spark session (a quick check follows these examples):
+
+    1. Start a Spark session using the properties defined in `DATAPROC_SPARK_CONNECT_SESSION_DEFAULT_CONFIG`:
+
+       .. code-block:: python
+
+           spark = DataprocSparkSession.builder.getOrCreate()
+
+    2. Start a Spark session with the following code instead of using a config file:
+
+       .. code-block:: python
+
+           from google.cloud.dataproc_v1 import SparkConnectConfig
+           from google.cloud.dataproc_v1 import Session
+           dataproc_config = Session()
+           dataproc_config.spark_connect_session = SparkConnectConfig()
+           dataproc_config.environment_config.execution_config.subnetwork_uri = "<subnet>"
+           dataproc_config.runtime_config.version = '3.0'
+           spark = DataprocSparkSession.builder.dataprocConfig(dataproc_config).getOrCreate()
+
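Either approach returns an object that behaves like a regular Spark Connect `SparkSession`. As a quick sanity check (this snippet is not part of the packaged README), a trivial DataFrame round-trip confirms the remote session is reachable:

.. code-block:: python

    # Assumes `spark` was created by one of the two approaches above.
    df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
    df.show()
    spark.stop()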
+ ## Billing
+ Because this client runs the Spark workload on Dataproc, your project is billed according to [Dataproc Serverless pricing](https://cloud.google.com/dataproc-serverless/pricing).
+ This applies even if you are running the client from a non-GCE instance.
+
+ ## Contributing
+ ### Building and Deploying the SDK
+
+ 1. Install the requirements in a virtual environment:
+
+    .. code-block:: console
+
+        pip install -r requirements.txt
+
+ 2. Build the code:
+
+    .. code-block:: console
+
+        python setup.py sdist bdist_wheel
+
+
+ 3. Copy the generated `.whl` file to Cloud Storage, using the version specified in the `setup.py` file:
+
+    .. code-block:: console
+
+        VERSION=<version>
+        gsutil cp dist/dataproc_spark_connect-${VERSION}-py2.py3-none-any.whl gs://<your_bucket_name>
+
+ 4. Download the new SDK on Vertex, then uninstall the old version and install the new one:
+
+    .. code-block:: console
+
+        %%bash
+        export VERSION=<version>
+        gsutil cp gs://<your_bucket_name>/dataproc_spark_connect-${VERSION}-py2.py3-none-any.whl .
+        yes | pip uninstall dataproc_spark_connect
+        pip install dataproc_spark_connect-${VERSION}-py2.py3-none-any.whl
@@ -0,0 +1,10 @@
+ google/cloud/dataproc_spark_connect/__init__.py,sha256=pybAofW6rmWI-4C8VYm1q0NOZD_sBvFQz43jUBSQW30,616
+ google/cloud/dataproc_spark_connect/session.py,sha256=A42Wo87VSunG0D3sB-biWyNvU33WhI92mmrJbXI1oNo,23017
+ google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
+ google/cloud/dataproc_spark_connect/client/core.py,sha256=7Wy6QwkcWxlHBdo4NsktJEknggPpGkx9F5CS5IpQ7iM,3630
+ google/cloud/dataproc_spark_connect/client/proxy.py,sha256=ScrbaGsEvqi8wp4ngfD-T9K9mFHXBkVMZkTSr7mdNBs,8926
+ dataproc_spark_connect-0.2.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ dataproc_spark_connect-0.2.0.dist-info/METADATA,sha256=UivMTIfzkp6fzGHG4hiXPUAsRP9P7VBQMKJdEcjmowk,4200
+ dataproc_spark_connect-0.2.0.dist-info/WHEEL,sha256=OpXWERl2xLPRHTvd2ZXo_iluPEQd8uSbYkJ53NAER_Y,109
+ dataproc_spark_connect-0.2.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (72.0.0)
+ Generator: setuptools (75.3.0)
  Root-Is-Purelib: true
  Tag: py2-none-any
  Tag: py3-none-any
@@ -43,13 +43,22 @@ class bridged_socket(object):
          self._conn = websocket_conn

      def recv(self, buff_size):
-         msg = self._conn.recv()
+         # N.B. The websockets [recv method](https://websockets.readthedocs.io/en/stable/reference/sync/client.html#websockets.sync.client.ClientConnection.recv)
+         # does not support the buff_size parameter, but it does add a `timeout` keyword parameter not supported by normal
+         # socket objects.
+         #
+         # We set that timeout to 60 seconds to prevent any scenarios where we wind up stuck waiting for a message from a websocket connection
+         # that never comes.
+         msg = self._conn.recv(timeout=60)
          return bytes.fromhex(msg)

      def send(self, msg_bytes):
          msg = bytes.hex(msg_bytes)
          self._conn.send(msg)

+     def close(self):
+         return self._conn.close()
+

  def connect_tcp_bridge(hostname):
      """Create a socket-like connection to the given hostname using websocket.
@@ -93,12 +102,51 @@ def forward_bytes(name, from_sock, to_sock):
              bs = from_sock.recv(1024)
              if not bs:
                  return
-             to_sock.send(bs)
+             while bs:
+                 try:
+                     to_sock.send(bs)
+                     bs = None
+                 except TimeoutError:
+                     # On timeouts during a send, we retry just the send
+                     # to make sure we don't lose any bytes.
+                     pass
+         except TimeoutError:
+             # On timeouts during a receive, we retry the entire flow.
+             pass
          except Exception as ex:
              logger.debug(f"[{name}] Exception forwarding bytes: {ex}")
+             to_sock.close()
              return


+ def connect_sockets(conn_number, from_sock, to_sock):
+     """Create a connection between the two given sockets.
+
+     This method continuously streams bytes in both directions between the
+     given `from_sock` and `to_sock` socket-like objects.
+
+     The caller is responsible for creating and closing the supplied sockets.
+     """
+     forward_name = f"{conn_number}-forward"
+     t1 = threading.Thread(
+         name=forward_name,
+         target=forward_bytes,
+         args=[forward_name, from_sock, to_sock],
+         daemon=True,
+     )
+     t1.start()
+     backward_name = f"{conn_number}-backward"
+     t2 = threading.Thread(
+         name=backward_name,
+         target=forward_bytes,
+         args=[backward_name, to_sock, from_sock],
+         daemon=True,
+     )
+     t2.start()
+     t1.join()
+     t2.join()
+
+
  def forward_connection(conn_number, conn, addr, target_host):
      """Create a connection to the target and forward `conn` to it.

@@ -115,24 +163,7 @@ def forward_connection(conn_number, conn, addr, target_host):
      with conn:
          with connect_tcp_bridge(target_host) as websocket_conn:
              backend_socket = bridged_socket(websocket_conn)
-             forward_name = f"{conn_number}-forward"
-             t1 = threading.Thread(
-                 name=forward_name,
-                 target=forward_bytes,
-                 args=[forward_name, conn, backend_socket],
-                 daemon=True,
-             )
-             t1.start()
-             backward_name = f"{conn_number}-backward"
-             t2 = threading.Thread(
-                 name=backward_name,
-                 target=forward_bytes,
-                 args=[backward_name, backend_socket, conn],
-                 daemon=True,
-             )
-             t2.start()
-             t1.join()
-             t2.join()
+             connect_sockets(conn_number, conn, backend_socket)


  class DataprocSessionProxy(object):
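With this change, `forward_connection` delegates the two forwarding threads to `connect_sockets`. For intuition, the following self-contained sketch (not from the package) shows the same pattern with in-process socket pairs standing in for the accepted frontend connection and the websocket-backed socket:

.. code-block:: python

    import socket
    import threading

    def pump(src, dst):
        # Copy bytes from src to dst until the peer closes, mirroring forward_bytes.
        while True:
            data = src.recv(1024)
            if not data:
                return
            dst.sendall(data)

    client_a, client_b = socket.socketpair()    # stands in for the accepted frontend conn
    backend_a, backend_b = socket.socketpair()  # stands in for the bridged websocket socket

    # One pump per direction, exactly as connect_sockets starts one thread per direction.
    threading.Thread(target=pump, args=(client_b, backend_a), daemon=True).start()
    threading.Thread(target=pump, args=(backend_a, client_b), daemon=True).start()

    client_a.sendall(b"ping")
    print(backend_b.recv(1024))  # b'ping'
    backend_b.sendall(b"pong")
    print(client_a.recv(1024))   # b'pong'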
@@ -179,6 +210,14 @@ class DataprocSessionProxy(object):
          s.release()
          while not self._killed:
              conn, addr = frontend_socket.accept()
+             # Set a timeout on how long we will allow send/recv calls to block
+             #
+             # The code that reads and writes to this connection will retry
+             # on timeouts, so this is a safe change.
+             #
+             # The chosen timeout is a very short one because it allows us
+             # to more quickly detect when a connection has been closed.
+             conn.settimeout(1)
              logger.debug(f"Accepted a connection from {addr}...")
              self._conn_number += 1
              threading.Thread(
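The 1-second timeout works because, once `settimeout` is in effect, blocking socket calls raise `TimeoutError`, which the retry logic in `forward_bytes` then absorbs. A minimal illustration (not from the package; on Python 3.10+ `socket.timeout` is an alias of `TimeoutError`):

.. code-block:: python

    import socket

    a, b = socket.socketpair()
    a.settimeout(1)  # same idea as conn.settimeout(1) above

    try:
        a.recv(1024)  # nothing has been sent, so this blocks for at most ~1 second
    except TimeoutError:
        print("recv timed out; the forwarding loop would simply retry")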
@@ -196,13 +196,13 @@ class DataprocSparkSession(SparkSession):
          session_id = self.generate_dataproc_session_id()

          session_request.session_id = session_id
-         dataproc_config.name = f"projects/{self._project_id}/regions/{self._region}/sessions/{session_id}"
+         dataproc_config.name = f"projects/{self._project_id}/locations/{self._region}/sessions/{session_id}"
          logger.debug(
              f"Configurations used to create serverless session:\n {dataproc_config}"
          )
          session_request.session = dataproc_config
          session_request.parent = (
-             f"projects/{self._project_id}/regions/{self._region}"
+             f"projects/{self._project_id}/locations/{self._region}"
          )

          logger.debug("Creating serverless session")
@@ -1,11 +0,0 @@
- Metadata-Version: 2.1
- Name: dataproc-spark-connect
- Version: 0.1.0
- Summary: Dataproc client library for Spark Connect
- License-File: LICENSE
- Requires-Dist: wheel
- Requires-Dist: websockets
- Requires-Dist: pyspark >=3.5
- Requires-Dist: pandas
- Requires-Dist: pyarrow
-
@@ -1,10 +0,0 @@
- google/cloud/dataproc_spark_connect/__init__.py,sha256=pybAofW6rmWI-4C8VYm1q0NOZD_sBvFQz43jUBSQW30,616
- google/cloud/dataproc_spark_connect/session.py,sha256=XCOym1Llp_vXCgMQ0EMsORoImY1BSKfJrZpooGCurNc,23013
- google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
- google/cloud/dataproc_spark_connect/client/core.py,sha256=7Wy6QwkcWxlHBdo4NsktJEknggPpGkx9F5CS5IpQ7iM,3630
- google/cloud/dataproc_spark_connect/client/proxy.py,sha256=o5ppDkvUUQLy4uDMjW6roTMgRFKlVfQm5faC5b60rRk,7263
- dataproc_spark_connect-0.1.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- dataproc_spark_connect-0.1.0.dist-info/METADATA,sha256=Ke0zfRLuybTquQJgm-Ve3t8tVMT26l6q5i3R8wUEwq4,261
- dataproc_spark_connect-0.1.0.dist-info/WHEEL,sha256=ED2S3aolPA63OeAfpmbtiQd0NXmZX4SrTHXvXF6JNGc,109
- dataproc_spark_connect-0.1.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
- dataproc_spark_connect-0.1.0.dist-info/RECORD,,