dataproc-spark-connect 0.9.0-py2.py3-none-any.whl → 1.0.0rc2-py2.py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
--- dataproc_spark_connect-0.9.0.dist-info/METADATA
+++ dataproc_spark_connect-1.0.0rc2.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 0.9.0
+ Version: 1.0.0rc2
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
@@ -9,7 +9,7 @@ License-File: LICENSE
  Requires-Dist: google-api-core>=2.19
  Requires-Dist: google-cloud-dataproc>=5.18
  Requires-Dist: packaging>=20.0
- Requires-Dist: pyspark[connect]~=3.5.1
+ Requires-Dist: pyspark[connect]~=4.0.0
  Requires-Dist: tqdm>=4.67
  Requires-Dist: websockets>=14.0
  Dynamic: author
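
Note on the dependency bump: the PySpark pin moves from the 3.5 line to the 4.0 line using a compatible-release specifier. A minimal sketch of what ~=4.0.0 admits, using the packaging library (already a declared dependency):

    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet("~=4.0.0")   # equivalent to >=4.0.0, ==4.0.*
    print(spec.contains("4.0.1"))    # True: patch releases of 4.0 are allowed
    print(spec.contains("4.1.0"))    # False: the next minor line is excluded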
--- /dev/null
+++ dataproc_spark_connect-1.0.0rc2.dist-info/RECORD
@@ -0,0 +1,13 @@
+ dataproc_spark_connect-1.0.0rc2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
+ google/cloud/dataproc_spark_connect/environment.py,sha256=l1wWiHMHtBQ9YonE-kHTpaZlN9vLE4fyJSTn7RZP6kA,2503
+ google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
+ google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
+ google/cloud/dataproc_spark_connect/session.py,sha256=FdJI_F9k6EfIvlgC1-f-Qb_Uwg9SmkIyWhpNZlqGQhw,40405
+ google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
+ google/cloud/dataproc_spark_connect/client/core.py,sha256=GRc4OCTBvIvdagjxOPoDO22vLtt8xDSerdREMRDeUBY,4659
+ google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
+ dataproc_spark_connect-1.0.0rc2.dist-info/METADATA,sha256=o2vfu5NRn2Pb0N7cavrBm2OLwP_LXQBVrclNjEtb9Do,3468
+ dataproc_spark_connect-1.0.0rc2.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+ dataproc_spark_connect-1.0.0rc2.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-1.0.0rc2.dist-info/RECORD,,
--- google/cloud/dataproc_spark_connect/client/core.py
+++ google/cloud/dataproc_spark_connect/client/core.py
@@ -15,14 +15,14 @@ import logging
 
  import google
  import grpc
- from pyspark.sql.connect.client import ChannelBuilder
+ from pyspark.sql.connect.client import DefaultChannelBuilder
 
  from . import proxy
 
  logger = logging.getLogger(__name__)
 
 
- class DataprocChannelBuilder(ChannelBuilder):
+ class DataprocChannelBuilder(DefaultChannelBuilder):
  """
  This is a helper class that is used to create a GRPC channel based on the given
  connection string per the documentation of Spark Connect.
@@ -88,7 +88,9 @@ class ProxiedChannel(grpc.Channel):
  self._proxy = proxy.DataprocSessionProxy(0, target_host)
  self._proxy.start()
  self._proxied_connect_url = f"sc://localhost:{self._proxy.port}"
- self._wrapped = ChannelBuilder(self._proxied_connect_url).toChannel()
+ self._wrapped = DefaultChannelBuilder(
+ self._proxied_connect_url
+ ).toChannel()
 
  def __enter__(self):
  return self
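
Both core.py hunks replace ChannelBuilder with DefaultChannelBuilder, which appears to track the PySpark 4.0 client API where DefaultChannelBuilder is the concrete builder that parses an sc:// connection string. A minimal sketch of the call pattern the new code relies on (the endpoint is a stand-in):

    # Assumes PySpark 4.x; same DefaultChannelBuilder(...).toChannel() pattern
    # used by ProxiedChannel above. "sc://localhost:15002" is a placeholder URL.
    from pyspark.sql.connect.client import DefaultChannelBuilder

    channel = DefaultChannelBuilder("sc://localhost:15002").toChannel()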
--- google/cloud/dataproc_spark_connect/environment.py
+++ google/cloud/dataproc_spark_connect/environment.py
@@ -13,6 +13,7 @@
  # limitations under the License.
 
  import os
+ import sys
  from typing import Callable, Tuple, List
 
 
@@ -46,6 +47,18 @@ def is_jetbrains_ide() -> bool:
  return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()
 
 
+ def is_interactive():
+ return hasattr(sys, "ps1")
+
+
+ def is_terminal():
+ return sys.stdin.isatty()
+
+
+ def is_interactive_terminal():
+ return is_interactive() and is_terminal()
+
+
  def get_client_environment_label() -> str:
  """
  Map current environment to a standardized client label.
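
The three new helpers classify how the client process is running; the session code later uses them to decide whether to render a progress bar and which tqdm flavor to use. A small usage sketch, assuming the module's import path from this package:

    from google.cloud.dataproc_spark_connect import environment

    if environment.is_interactive_terminal():
        print("interactive REPL on a TTY: CLI tqdm bar")
    elif environment.is_interactive():
        print("interactive without a TTY (e.g. notebook kernel): notebook tqdm bar")
    else:
        print("non-interactive run: no progress bar is rendered")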
--- google/cloud/dataproc_spark_connect/session.py
+++ google/cloud/dataproc_spark_connect/session.py
@@ -24,8 +24,10 @@ import threading
  import time
  import uuid
  import tqdm
+ from tqdm import tqdm as cli_tqdm
+ from tqdm.notebook import tqdm as notebook_tqdm
  from types import MethodType
- from typing import Any, cast, ClassVar, Dict, Optional, Union
+ from typing import Any, cast, ClassVar, Dict, Iterable, Optional, Union
 
  from google.api_core import retry
  from google.api_core.client_options import ClientOptions
@@ -103,13 +105,14 @@ class DataprocSparkSession(SparkSession):
  ... ) # doctest: +SKIP
  """
 
- _DEFAULT_RUNTIME_VERSION = "2.3"
+ _DEFAULT_RUNTIME_VERSION = "3.0"
 
  _active_s8s_session_uuid: ClassVar[Optional[str]] = None
  _project_id = None
  _region = None
  _client_options = None
  _active_s8s_session_id: ClassVar[Optional[str]] = None
+ _execution_progress_bar = dict()
 
  class Builder(SparkSession.Builder):
 
@@ -158,19 +161,6 @@ class DataprocSparkSession(SparkSession):
  self.dataproc_config.environment_config.execution_config.service_account = (
  account
  )
- # Automatically set auth type to SERVICE_ACCOUNT when service account is provided
- # This overrides any env var setting to simplify user experience
- self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
- AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
- )
- return self
-
- def authType(
- self, auth_type: "AuthenticationConfig.AuthenticationType"
- ):
- self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
- auth_type
- )
  return self
 
  def subnetwork(self, subnet: str):
@@ -181,10 +171,7 @@
 
  def ttl(self, duration: datetime.timedelta):
  """Set the time-to-live (TTL) for the session using a timedelta object."""
- self.dataproc_config.environment_config.execution_config.ttl = {
- "seconds": int(duration.total_seconds())
- }
- return self
+ return self.ttlSeconds(int(duration.total_seconds()))
 
  def ttlSeconds(self, seconds: int):
  """Set the time-to-live (TTL) for the session in seconds."""
@@ -195,10 +182,7 @@
 
  def idleTtl(self, duration: datetime.timedelta):
  """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
- self.dataproc_config.environment_config.execution_config.idle_ttl = {
- "seconds": int(duration.total_seconds())
- }
- return self
+ return self.idleTtlSeconds(int(duration.total_seconds()))
 
  def idleTtlSeconds(self, seconds: int):
  """Set the idle time-to-live (idle TTL) for the session in seconds."""
@@ -266,6 +250,9 @@
  assert self._channel_builder is not None
  session = DataprocSparkSession(connection=self._channel_builder)
 
+ # Register handler for Cell Execution Progress bar
+ session._register_progress_execution_handler()
+
  DataprocSparkSession._set_default_and_active_session(session)
  return session
 
@@ -568,27 +555,24 @@
  default_datasource = os.getenv(
  "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
  )
- if (
- default_datasource
- and dataproc_config.runtime_config.version == "2.3"
- ):
- if default_datasource == "bigquery":
- bq_datasource_properties = {
+ match default_datasource:
+ case "bigquery":
+ # Merge default configs with existing properties,
+ # user configs take precedence
+ for k, v in {
  "spark.datasource.bigquery.viewsEnabled": "true",
  "spark.datasource.bigquery.writeMethod": "direct",
  "spark.sql.catalog.spark_catalog": "com.google.cloud.spark.bigquery.BigQuerySparkSessionCatalog",
- "spark.sql.legacy.createHiveTableByDefault": "false",
  "spark.sql.sources.default": "bigquery",
- }
- # Merge default configs with existing properties, user configs take precedence
- for k, v in bq_datasource_properties.items():
+ }.items():
  if k not in dataproc_config.runtime_config.properties:
  dataproc_config.runtime_config.properties[k] = v
- else:
- logger.warning(
- f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
- f" {default_datasource}. Supported value is 'bigquery'."
- )
+ case _:
+ if default_datasource:
+ logger.warning(
+ f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
+ f" {default_datasource}. Supported value is 'bigquery'."
+ )
  return dataproc_config
 
  def _check_python_version_compatibility(self, runtime_version):
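
The datasource defaulting no longer gates on runtime version 2.3, drops the spark.sql.legacy.createHiveTableByDefault override, and is rewritten with match/case (Python 3.10+); 'bigquery' remains the only supported value. A sketch of opting in (import path assumed; properties already present in the runtime config are left untouched by the merge above):

    import os

    # Read while the Dataproc session config is assembled, i.e. before getOrCreate().
    os.environ["DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"] = "bigquery"

    from google.cloud.dataproc_spark_connect import DataprocSparkSession

    spark = DataprocSparkSession.builder.getOrCreate()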
@@ -598,9 +582,7 @@ class DataprocSparkSession(SparkSession):
 
  # Runtime version to server Python version mapping
  RUNTIME_PYTHON_MAP = {
- "1.2": (3, 12),
- "2.2": (3, 12),
- "2.3": (3, 11),
+ "3.0": (3, 11),
  }
 
  client_python = sys.version_info[:2] # (major, minor)
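
The compatibility map now lists only runtime 3.0, which pairs with server-side Python 3.11. A tiny client-side check in the same spirit (hypothetical; it mirrors the mapping rather than calling the private method):

    import sys

    RUNTIME_PYTHON_MAP = {"3.0": (3, 11)}  # mirrors the mapping above
    if sys.version_info[:2] != RUNTIME_PYTHON_MAP["3.0"]:
        print("warning: client Python differs from the runtime 3.0 server Python (3.11)")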
@@ -706,6 +688,78 @@ class DataprocSparkSession(SparkSession):
  execute_and_fetch_as_iterator_wrapped_method, self.client
  )
 
+ # Patching clearProgressHandlers method to not remove Dataproc Progress Handler
+ clearProgressHandlers_base_method = self.clearProgressHandlers
+
+ def clearProgressHandlers_wrapper_method(_, *args, **kwargs):
+ clearProgressHandlers_base_method(*args, **kwargs)
+
+ self._register_progress_execution_handler()
+
+ self.clearProgressHandlers = MethodType(
+ clearProgressHandlers_wrapper_method, self
+ )
+
+ def _register_progress_execution_handler(self):
+ from pyspark.sql.connect.shell.progress import StageInfo
+
+ def handler(
+ stages: Optional[Iterable[StageInfo]],
+ inflight_tasks: int,
+ operation_id: Optional[str],
+ done: bool,
+ ):
+ if operation_id is None:
+ return
+
+ # Don't build / render progress bar for non-interactive (despite
+ # Ipython or non-IPython)
+ if not environment.is_interactive():
+ return
+
+ total_tasks = 0
+ completed_tasks = 0
+
+ for stage in stages or []:
+ total_tasks += stage.num_tasks
+ completed_tasks += stage.num_completed_tasks
+
+ tqdm_pbar = notebook_tqdm
+ if environment.is_interactive_terminal():
+ tqdm_pbar = cli_tqdm
+
+ # Use a lock to ensure only one thread can access and modify
+ # the shared dictionaries at a time.
+ with self._lock:
+ if operation_id in self._execution_progress_bar:
+ pbar = self._execution_progress_bar[operation_id]
+ if pbar.total != total_tasks:
+ pbar.reset(
+ total=total_tasks
+ ) # This force resets the progress bar % too on next refresh
+ else:
+ pbar = tqdm_pbar(
+ total=total_tasks,
+ leave=True,
+ dynamic_ncols=True,
+ bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} Tasks",
+ )
+ self._execution_progress_bar[operation_id] = pbar
+
+ # To handle skipped or failed tasks.
+ # StageInfo proto doesn't have skipped and failed tasks information to process.
+ if done and completed_tasks < total_tasks:
+ completed_tasks = total_tasks
+
+ pbar.n = completed_tasks
+ pbar.refresh()
+
+ if done:
+ pbar.close()
+ self._execution_progress_bar.pop(operation_id, None)
+
+ self.registerProgressHandler(handler)
+
  @staticmethod
  def _sql_lazy_transformation(req):
  # Select SQL command
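
The new handler tallies StageInfo task counts per operation and drives either a notebook or a CLI tqdm bar, while clearProgressHandlers is patched so the Dataproc bar is re-registered after a clear. For reference, a custom handler with the same call signature can be registered alongside it (sketch; spark stands for an active DataprocSparkSession):

    def log_progress(stages, inflight_tasks, operation_id, done):
        total = sum(s.num_tasks for s in stages or [])
        completed = sum(s.num_completed_tasks for s in stages or [])
        print(f"[{operation_id}] {completed}/{total} tasks, done={done}")

    spark.registerProgressHandler(log_progress)
    spark.range(1_000_000).count()   # handler fires as progress updates arrive
    spark.clearProgressHandlers()    # patched above: the Dataproc bar is re-registered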
@@ -813,7 +867,7 @@ class DataprocSparkSession(SparkSession):
  This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws
  an exception.
  Regarding pypi: Popular packages are already pre-installed in s8s runtime.
- https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.2#python_libraries
+ https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.3#python_libraries
  If there are conflicts/package doesn't exist, it throws an exception.
  """
  if sum([pypi, file, pyfile, archive]) > 1:
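
The docstring's runtime reference moves from the 2.2 to the 2.3 release notes. Assuming this docstring belongs to the package's addArtifacts override (the pypi/file/pyfile/archive parameters and the sum(...) guard suggest it), only one artifact kind may be passed per call; a hypothetical usage sketch against an active session:

    # Hypothetical calls; the package and file names are placeholders.
    spark.addArtifacts("spacy==3.7.2", pypi=True)   # install a PyPI package server-side
    spark.addArtifacts("helpers.py", pyfile=True)   # or ship a local Python file instead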
--- dataproc_spark_connect-0.9.0.dist-info/RECORD
+++ /dev/null
@@ -1,13 +0,0 @@
- dataproc_spark_connect-0.9.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
- google/cloud/dataproc_spark_connect/environment.py,sha256=UICy9XyqAxL-cryVWx7GZPRAxoir5LKk0dtqqY_l--c,2307
- google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
- google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
- google/cloud/dataproc_spark_connect/session.py,sha256=ELj5hDhofK1967eE5YaG_LP5B80KWFQWJn5gxi9yYt0,38577
- google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
- google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
- google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
- dataproc_spark_connect-0.9.0.dist-info/METADATA,sha256=1z8Ag1P_Lh9db0Rk9nGFoOu6sdeRs0UlrgtOqN_OhIQ,3465
- dataproc_spark_connect-0.9.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
- dataproc_spark_connect-0.9.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
- dataproc_spark_connect-0.9.0.dist-info/RECORD,,