dataproc-spark-connect 0.2.1__py2.py3-none-any.whl → 0.7.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,98 @@
+ Metadata-Version: 2.1
+ Name: dataproc-spark-connect
+ Version: 0.7.0
+ Summary: Dataproc client library for Spark Connect
+ Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
+ Author: Google LLC
+ License: Apache 2.0
+ License-File: LICENSE
+ Requires-Dist: google-api-core>=2.19
+ Requires-Dist: google-cloud-dataproc>=5.18
+ Requires-Dist: packaging>=20.0
+ Requires-Dist: pyspark[connect]>=3.5
+ Requires-Dist: tqdm>=4.67
+ Requires-Dist: websockets>=15.0
+
+ # Dataproc Spark Connect Client
+
+ A wrapper of the Apache [Spark Connect](https://spark.apache.org/spark-connect/)
+ client with additional functionality that allows applications to communicate
+ with a remote Dataproc Spark Session using the Spark Connect protocol without
+ requiring additional setup steps.
+
+ ## Install
+
+ ```sh
+ pip install dataproc_spark_connect
+ ```
+
+ ## Uninstall
+
+ ```sh
+ pip uninstall dataproc_spark_connect
+ ```
+
+ ## Setup
+
+ This client requires permissions to
+ manage [Dataproc Sessions and Session Templates](https://cloud.google.com/dataproc-serverless/docs/concepts/iam).
+ If you are running the client outside of Google Cloud, you must set the
+ following environment variables:
+
+ * `GOOGLE_CLOUD_PROJECT` - The Google Cloud project you use to run Spark
+   workloads.
+ * `GOOGLE_CLOUD_REGION` - The Compute
+   Engine [region](https://cloud.google.com/compute/docs/regions-zones#available)
+   where you run the Spark workload.
+ * `GOOGLE_APPLICATION_CREDENTIALS` -
+   Your [Application Credentials](https://cloud.google.com/docs/authentication/provide-credentials-adc).
+
+ ## Usage
+
+ 1. Install the latest versions of the Dataproc Python client and Dataproc Spark
+    Connect modules:
+
+    ```sh
+    pip install google_cloud_dataproc dataproc_spark_connect --force-reinstall
+    ```
+
+ 2. Add the required imports to your PySpark application or notebook and start
+    a Spark session with the following code instead of using
+    environment variables:
+
+    ```python
+    from google.cloud.dataproc_spark_connect import DataprocSparkSession
+    from google.cloud.dataproc_v1 import Session
+    session_config = Session()
+    session_config.environment_config.execution_config.subnetwork_uri = '<subnet>'
+    session_config.runtime_config.version = '2.2'
+    spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
+    ```
+
+ ## Developing
+
+ For development instructions, see the [development guide](DEVELOPING.md).
+
+ ## Contributing
+
+ We'd love to accept your patches and contributions to this project. There are
+ just a few small guidelines you need to follow.
+
+ ### Contributor License Agreement
+
+ Contributions to this project must be accompanied by a Contributor License
+ Agreement. You (or your employer) retain the copyright to your contribution;
+ this simply gives us permission to use and redistribute your contributions as
+ part of the project. Head over to <https://cla.developers.google.com> to see
+ your current agreements on file or to sign a new one.
+
+ You generally only need to submit a CLA once, so if you've already submitted one
+ (even if it was for a different project), you probably don't need to do it
+ again.
+
+ ### Code reviews
+
+ All submissions, including submissions by project members, require review. We
+ use GitHub pull requests for this purpose. Consult
+ [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+ information on using pull requests.
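
As an editorial aside on the Usage snippet above (not part of the packaged README): the object returned by `getOrCreate()` behaves like a regular PySpark `SparkSession`, so a minimal smoke test might look like the following sketch, assuming `spark` was created as shown in step 2.

```python
# Editorial illustration, assuming `spark` was created by the README snippet above.
df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])
df.show()                                     # executes on the remote Dataproc session
print(spark.sql("SELECT 1 AS ok").collect())  # [Row(ok=1)]
spark.stop()                                  # releases the Spark Connect session
```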
@@ -0,0 +1,12 @@
+ google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
+ google/cloud/dataproc_spark_connect/exceptions.py,sha256=ilGyHD5M_yBQ3IC58-Y5miRGIQVJsLaNKvEGcHuk_BE,969
+ google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
+ google/cloud/dataproc_spark_connect/session.py,sha256=98Zrn0Vyl2ajcF5hltdSp8LgYTOzDa-eqeYxxmZVKds,26398
+ google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
+ google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
+ google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
+ dataproc_spark_connect-0.7.0.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ dataproc_spark_connect-0.7.0.dist-info/METADATA,sha256=fFJLyzjo3CKLx1d18U4i1csJEPgWxoAjok4qFghtOyE,3328
+ dataproc_spark_connect-0.7.0.dist-info/WHEEL,sha256=OpXWERl2xLPRHTvd2ZXo_iluPEQd8uSbYkJ53NAER_Y,109
+ dataproc_spark_connect-0.7.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-0.7.0.dist-info/RECORD,,
@@ -11,13 +11,19 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- from .session import DataprocSparkSession
+ import importlib.metadata
  import warnings

- warnings.warn(
-     "The package 'dataproc-spark-connect' has been renamed to 'google-spark-connect'. "
-     "'dataproc-spark-connect' will no longer be updated. "
-     "For help using 'google-spark-connect', "
-     "see https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python/blob/main/README.md. ",
-     DeprecationWarning,
- )
+ from .session import DataprocSparkSession
+
+ old_package_name = "google-spark-connect"
+ current_package_name = "dataproc-spark-connect"
+ try:
+     importlib.metadata.distribution(old_package_name)
+     warnings.warn(
+         f"Package '{old_package_name}' is already installed in your environment. "
+         f"This might cause conflicts with '{current_package_name}'. "
+         f"Consider uninstalling '{old_package_name}' and only install '{current_package_name}'."
+     )
+ except:
+     pass
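
A note on the conflict check above: `importlib.metadata.distribution()` raises `importlib.metadata.PackageNotFoundError` when the queried distribution is absent, so a narrower variant of the same check (an editorial sketch, not the code shipped in the wheel) could catch only that error instead of using a bare `except`:

```python
# Editorial sketch of the same conflict check with a narrower exception clause;
# not the code shipped in the wheel.
import importlib.metadata
import warnings

OLD_PACKAGE = "google-spark-connect"
CURRENT_PACKAGE = "dataproc-spark-connect"

try:
    importlib.metadata.distribution(OLD_PACKAGE)
except importlib.metadata.PackageNotFoundError:
    pass  # old package not installed, nothing to warn about
else:
    warnings.warn(
        f"'{OLD_PACKAGE}' is installed alongside '{CURRENT_PACKAGE}' and may conflict."
    )
```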
@@ -11,12 +11,16 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
+ import logging
+
  import google
  import grpc
  from pyspark.sql.connect.client import ChannelBuilder

  from . import proxy

+ logger = logging.getLogger(__name__)
+

  class DataprocChannelBuilder(ChannelBuilder):
      """
@@ -36,6 +40,10 @@ class DataprocChannelBuilder(ChannelBuilder):
      True
      """

+     def __init__(self, url, is_active_callback=None):
+         self._is_active_callback = is_active_callback
+         super().__init__(url)
+
      def toChannel(self) -> grpc.Channel:
          """
          Applies the parameters of the connection string and creates a new
@@ -51,7 +59,7 @@ class DataprocChannelBuilder(ChannelBuilder):
          return self._proxied_channel()

      def _proxied_channel(self) -> grpc.Channel:
-         return ProxiedChannel(self.host)
+         return ProxiedChannel(self.host, self._is_active_callback)

      def _direct_channel(self) -> grpc.Channel:
          destination = f"{self.host}:{self.port}"
@@ -75,7 +83,8 @@ class DataprocChannelBuilder(ChannelBuilder):

  class ProxiedChannel(grpc.Channel):

-     def __init__(self, target_host):
+     def __init__(self, target_host, is_active_callback):
+         self._is_active_callback = is_active_callback
          self._proxy = proxy.DataprocSessionProxy(0, target_host)
          self._proxy.start()
          self._proxied_connect_url = f"sc://localhost:{self._proxy.port}"
@@ -94,20 +103,37 @@ class ProxiedChannel(grpc.Channel):
          self._proxy.stop()
          return ret

+     def _wrap_method(self, wrapped_method):
+         if self._is_active_callback is None:
+             return wrapped_method
+
+         def checked_method(*margs, **mkwargs):
+             if (
+                 self._is_active_callback is not None
+                 and not self._is_active_callback()
+             ):
+                 logger.warning(f"Session is no longer active")
+                 raise RuntimeError(
+                     "Session not active. Please create a new session"
+                 )
+             return wrapped_method(*margs, **mkwargs)
+
+         return checked_method
+
      def stream_stream(self, *args, **kwargs):
-         return self._wrapped.stream_stream(*args, **kwargs)
+         return self._wrap_method(self._wrapped.stream_stream(*args, **kwargs))

      def stream_unary(self, *args, **kwargs):
-         return self._wrapped.stream_unary(*args, **kwargs)
+         return self._wrap_method(self._wrapped.stream_unary(*args, **kwargs))

      def subscribe(self, *args, **kwargs):
-         return self._wrapped.subscribe(*args, **kwargs)
+         return self._wrap_method(self._wrapped.subscribe(*args, **kwargs))

      def unary_stream(self, *args, **kwargs):
-         return self._wrapped.unary_stream(*args, **kwargs)
+         return self._wrap_method(self._wrapped.unary_stream(*args, **kwargs))

      def unary_unary(self, *args, **kwargs):
-         return self._wrapped.unary_unary(*args, **kwargs)
+         return self._wrap_method(self._wrapped.unary_unary(*args, **kwargs))

      def unsubscribe(self, *args, **kwargs):
-         return self._wrapped.unsubscribe(*args, **kwargs)
+         return self._wrap_method(self._wrapped.unsubscribe(*args, **kwargs))
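
The hunks above thread an `is_active_callback` from `DataprocChannelBuilder` through `ProxiedChannel` so that every gRPC multi-callable is wrapped and fails fast with a clear error once the session is no longer active, rather than issuing RPCs over a dead channel. A minimal standalone sketch of that guard pattern follows; it is an editorial illustration only, and `guard_with`, `active`, and `send` are hypothetical names, not package APIs.

```python
# Editorial sketch of the liveness-guard pattern used by _wrap_method above;
# the names here are hypothetical, not part of the package.
from typing import Callable


def guard_with(is_active: Callable[[], bool], wrapped: Callable) -> Callable:
    """Return a callable that raises once is_active() reports False."""

    def checked(*args, **kwargs):
        if not is_active():
            raise RuntimeError("Session not active. Please create a new session")
        return wrapped(*args, **kwargs)

    return checked


active = True
send = guard_with(lambda: active, lambda payload: f"sent {payload}")
print(send("ping"))  # "sent ping"
active = False
# send("ping") would now raise RuntimeError instead of using a dead channel.
```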
@@ -18,7 +18,6 @@ import contextlib
  import logging
  import socket
  import threading
- import time

  import websockets.sync.client as websocketclient

@@ -81,6 +80,7 @@ def connect_tcp_bridge(hostname):
      return websocketclient.connect(
          f"wss://{hostname}/{path}",
          additional_headers={"Authorization": f"Bearer {creds.token}"},
+         open_timeout=30,
      )


@@ -94,6 +94,7 @@ def forward_bytes(name, from_sock, to_sock):
      This method is intended to be run in a separate thread of execution.

      Args:
+         name: forwarding thread name
          from_sock: A socket-like object to stream bytes from.
          to_sock: A socket-like object to stream bytes to.
      """
@@ -101,8 +102,11 @@ def forward_bytes(name, from_sock, to_sock):
          try:
              bs = from_sock.recv(1024)
              if not bs:
+                 to_sock.close()
                  return
-             while bs:
+             attempt = 0
+             while bs and (attempt < 10):
+                 attempt += 1
                  try:
                      to_sock.send(bs)
                      bs = None
@@ -110,6 +114,8 @@ def forward_bytes(name, from_sock, to_sock):
                      # On timeouts during a send, we retry just the send
                      # to make sure we don't lose any bytes.
                      pass
+             if bs:
+                 raise Exception(f"Failed to forward bytes for {name}")
          except TimeoutError:
              # On timeouts during a receive, we retry the entire flow.
              pass
@@ -125,7 +131,7 @@ def connect_sockets(conn_number, from_sock, to_sock):
      This method continuously streams bytes in both directions between the
      given `from_sock` and `to_sock` socket-like objects.

-     The caller is responsible for creating and closing the supplied socekts.
+     The caller is responsible for creating and closing the supplied sockets.
      """
      forward_name = f"{conn_number}-forward"
      t1 = threading.Thread(
@@ -157,12 +163,17 @@ def forward_connection(conn_number, conn, addr, target_host):
      Both the supplied incoming connection (`conn`) and the created outgoing
      connection are automatically closed when this method terminates.

-     This method should be run inside of a daemon thread so that it will not
+     This method should be run inside a daemon thread so that it will not
      block program termination.
      """
      with conn:
          with connect_tcp_bridge(target_host) as websocket_conn:
              backend_socket = bridged_socket(websocket_conn)
+             # Set a timeout on how long we will allow send/recv calls to block
+             #
+             # The code that reads and writes to this connection will retry
+             # on timeouts, so this is a safe change.
+             conn.settimeout(10)
              connect_sockets(conn_number, conn, backend_socket)


@@ -210,14 +221,6 @@ class DataprocSessionProxy(object):
          s.release()
          while not self._killed:
              conn, addr = frontend_socket.accept()
-             # Set a timeout on how long we will allow send/recv calls to block
-             #
-             # The code that reads and writes to this connection will retry
-             # on timeouts, so this is a safe change.
-             #
-             # The chosen timeout is a very short one because it allows us
-             # to more quickly detect when a connection has been closed.
-             conn.settimeout(1)
              logger.debug(f"Accepted a connection from {addr}...")
              self._conn_number += 1
              threading.Thread(
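
The proxy hunks above bound the number of send retries, close the peer socket on EOF, and move the 10-second socket timeout next to the bridge connection. The sketch below exercises the same retry-on-timeout copy loop against an in-process socket pair; it is an editorial illustration, and `forward_once` and its limits are made up for the example rather than taken from the package.

```python
# Editorial sketch of the retry-on-timeout copy loop from forward_bytes;
# forward_once and its limits are illustrative, not package APIs.
import socket


def forward_once(from_sock, to_sock, max_send_attempts=10):
    """Copy one chunk from from_sock to to_sock, retrying timed-out sends."""
    chunk = from_sock.recv(1024)
    if not chunk:
        to_sock.close()  # propagate EOF to the other side
        return False
    for _ in range(max_send_attempts):
        try:
            to_sock.send(chunk)
            return True
        except TimeoutError:
            continue  # retry only the send so no bytes are dropped
    raise RuntimeError("failed to forward bytes")


a, b = socket.socketpair()
c, d = socket.socketpair()
a.sendall(b"hello")
forward_once(b, c)       # copies the chunk from the a/b pair to the c/d pair
print(d.recv(1024))      # b"hello"
```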
@@ -0,0 +1,27 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+
+ class DataprocSparkConnectException(Exception):
+     """A custom exception class to only print the error messages.
+     This would be used for exceptions where the stack trace
+     doesn't provide any additional information.
+     """
+
+     def __init__(self, message):
+         self.message = message
+         super().__init__(message)
+
+     def _render_traceback_(self):
+         return self.message
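
`_render_traceback_` is the hook IPython consults when displaying an exception, which is how this class keeps notebook output to just the message. A hypothetical usage sketch follows; the error text is made up for illustration.

```python
# Hypothetical usage of DataprocSparkConnectException; the message is illustrative.
from google.cloud.dataproc_spark_connect.exceptions import (
    DataprocSparkConnectException,
)

try:
    raise DataprocSparkConnectException("Session creation failed: quota exceeded")
except DataprocSparkConnectException as e:
    print(e.message)  # stored on the instance and passed to Exception.__init__
```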
@@ -0,0 +1,48 @@
+ import json
+ import logging
+ import os
+ import tempfile
+
+ from packaging.requirements import Requirement
+
+ logger = logging.getLogger(__name__)
+
+
+ class PyPiArtifacts:
+     """
+     This is a helper class to serialize the PYPI package installation request with a "magic" file name
+     that Spark Connect server understands
+     """
+
+     @staticmethod
+     def __try_parsing_package(packages: set[str]) -> list[Requirement]:
+         reqs = [Requirement(p) for p in packages]
+         if 0 in [len(req.specifier) for req in reqs]:
+             logger.info("It is recommended to pin the version of the package")
+         return reqs
+
+     def __init__(self, packages: set[str]):
+         self.requirements = PyPiArtifacts.__try_parsing_package(packages)
+
+     def write_packages_config(self, s8s_session_uuid: str) -> str:
+         """
+         Can't use the same file-name as Spark throws exception that file already exists
+         Keep the filename/format in sync with server
+         """
+         dependencies = {
+             "version": "0.5",
+             "packageType": "PYPI",
+             "packages": [str(req) for req in self.requirements],
+         }
+
+         file_path = os.path.join(
+             tempfile.gettempdir(),
+             s8s_session_uuid,
+             "add-artifacts-1729-" + self.__str__() + ".json",
+         )
+
+         os.makedirs(os.path.dirname(file_path), exist_ok=True)
+         with open(file_path, "w") as json_file:
+             json.dump(dependencies, json_file, indent=4)
+         logger.debug("Dumping dependencies request in file: " + file_path)
+         return file_path
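
To make the flow above concrete: the helper is constructed with a set of requirement strings and writes the JSON install request under a per-session temp directory. A hedged usage sketch, where the session UUID is only a placeholder:

```python
# Editorial sketch; the session UUID below is a placeholder, not a real session.
from google.cloud.dataproc_spark_connect.pypi_artifacts import PyPiArtifacts

artifacts = PyPiArtifacts({"pandas==2.2.2", "pyarrow"})  # unpinned specs only log a hint
path = artifacts.write_packages_config("00000000-0000-0000-0000-000000000000")
print(path)  # .../add-artifacts-1729-<object repr>.json containing the PYPI package list
```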