dataproc-spark-connect 0.8.2__py2.py3-none-any.whl → 0.9.0__py2.py3-none-any.whl

This diff compares the contents of package versions that were publicly released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
--- dataproc_spark_connect-0.8.2.dist-info/METADATA
+++ dataproc_spark_connect-0.9.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 0.8.2
+ Version: 0.9.0
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
--- dataproc_spark_connect-0.8.2.dist-info/RECORD
+++ dataproc_spark_connect-0.9.0.dist-info/RECORD
@@ -1,12 +1,13 @@
- dataproc_spark_connect-0.8.2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ dataproc_spark_connect-0.9.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
  google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
+ google/cloud/dataproc_spark_connect/environment.py,sha256=UICy9XyqAxL-cryVWx7GZPRAxoir5LKk0dtqqY_l--c,2307
  google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
  google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
- google/cloud/dataproc_spark_connect/session.py,sha256=Sr9ISKIJ6U5dJ13FzKQ8UC_pGeFXbchc7X3d9U5Hj48,32144
+ google/cloud/dataproc_spark_connect/session.py,sha256=ELj5hDhofK1967eE5YaG_LP5B80KWFQWJn5gxi9yYt0,38577
  google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
  google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
  google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
- dataproc_spark_connect-0.8.2.dist-info/METADATA,sha256=2PCMrKtuuab4232elYKFHiTdaJcqiM4N38ceD_AhS-E,3465
- dataproc_spark_connect-0.8.2.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
- dataproc_spark_connect-0.8.2.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
- dataproc_spark_connect-0.8.2.dist-info/RECORD,,
+ dataproc_spark_connect-0.9.0.dist-info/METADATA,sha256=1z8Ag1P_Lh9db0Rk9nGFoOu6sdeRs0UlrgtOqN_OhIQ,3465
+ dataproc_spark_connect-0.9.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+ dataproc_spark_connect-0.9.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-0.9.0.dist-info/RECORD,,
--- /dev/null
+++ google/cloud/dataproc_spark_connect/environment.py
@@ -0,0 +1,76 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ from typing import Callable, Tuple, List
+
+
+ def is_vscode() -> bool:
+     """True if running inside VS Code at all."""
+     return os.getenv("VSCODE_PID") is not None
+
+
+ def is_jupyter() -> bool:
+     """True if running in a Jupyter environment."""
+     return os.getenv("JPY_PARENT_PID") is not None
+
+
+ def is_colab_enterprise() -> bool:
+     """True if running in Colab Enterprise (Vertex AI)."""
+     return os.getenv("VERTEX_PRODUCT") == "COLAB_ENTERPRISE"
+
+
+ def is_colab() -> bool:
+     """True if running in Google Colab."""
+     return os.getenv("COLAB_RELEASE_TAG") is not None
+
+
+ def is_workbench() -> bool:
+     """True if running in AI Workbench (managed Jupyter)."""
+     return os.getenv("VERTEX_PRODUCT") == "WORKBENCH_INSTANCE"
+
+
+ def is_jetbrains_ide() -> bool:
+     """True if running inside any JetBrains IDE."""
+     return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()
+
+
+ def get_client_environment_label() -> str:
+     """
+     Map current environment to a standardized client label.
+
+     Priority order:
+     1. Colab Enterprise ("colab-enterprise")
+     2. Colab ("colab")
+     3. Workbench ("workbench-jupyter")
+     4. VS Code ("vscode")
+     5. JetBrains IDE ("jetbrains")
+     6. Jupyter ("jupyter")
+     7. Unknown ("unknown")
+     """
+     checks: List[Tuple[Callable[[], bool], str]] = [
+         (is_colab_enterprise, "colab-enterprise"),
+         (is_colab, "colab"),
+         (is_workbench, "workbench-jupyter"),
+         (is_vscode, "vscode"),
+         (is_jetbrains_ide, "jetbrains"),
+         (is_jupyter, "jupyter"),
+     ]
+     for detector, label in checks:
+         try:
+             if detector():
+                 return label
+         except Exception:
+             pass
+     return "unknown"
--- google/cloud/dataproc_spark_connect/session.py
+++ google/cloud/dataproc_spark_connect/session.py
@@ -22,9 +22,10 @@ import re
  import string
  import threading
  import time
- from typing import Any, cast, ClassVar, Dict, Optional, Union
  import uuid
  import tqdm
+ from types import MethodType
+ from typing import Any, cast, ClassVar, Dict, Optional, Union

  from google.api_core import retry
  from google.api_core.client_options import ClientOptions
@@ -48,6 +49,7 @@ from google.cloud.dataproc_v1 import (
      TerminateSessionRequest,
  )
  from google.cloud.dataproc_v1.types import sessions
+ from google.cloud.dataproc_spark_connect import environment
  from pyspark.sql.connect.session import SparkSession
  from pyspark.sql.utils import to_str

@@ -55,6 +57,12 @@ from pyspark.sql.utils import to_str
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # System labels that should not be overridden by user
+ SYSTEM_LABELS = {
+     "dataproc-session-client",
+     "goog-colab-notebook-id",
+ }
+

  def _is_valid_label_value(value: str) -> bool:
      """
@@ -131,11 +139,95 @@ class DataprocSparkSession(SparkSession):
          return self

      def dataprocSessionConfig(self, dataproc_config: Session):
+         self._dataproc_config = dataproc_config
+         for k, v in dataproc_config.runtime_config.properties.items():
+             self._options[cast(str, k)] = to_str(v)
+         return self
+
+     @property
+     def dataproc_config(self):
          with self._lock:
-             self._dataproc_config = dataproc_config
-             for k, v in dataproc_config.runtime_config.properties.items():
-                 self._options[cast(str, k)] = to_str(v)
-             return self
+             self._dataproc_config = self._dataproc_config or Session()
+             return self._dataproc_config
+
+     def runtimeVersion(self, version: str):
+         self.dataproc_config.runtime_config.version = version
+         return self
+
+     def serviceAccount(self, account: str):
+         self.dataproc_config.environment_config.execution_config.service_account = (
+             account
+         )
+         # Automatically set auth type to SERVICE_ACCOUNT when service account is provided
+         # This overrides any env var setting to simplify user experience
+         self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
+             AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
+         )
+         return self
+
+     def authType(
+         self, auth_type: "AuthenticationConfig.AuthenticationType"
+     ):
+         self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
+             auth_type
+         )
+         return self
+
+     def subnetwork(self, subnet: str):
+         self.dataproc_config.environment_config.execution_config.subnetwork_uri = (
+             subnet
+         )
+         return self
+
+     def ttl(self, duration: datetime.timedelta):
+         """Set the time-to-live (TTL) for the session using a timedelta object."""
+         self.dataproc_config.environment_config.execution_config.ttl = {
+             "seconds": int(duration.total_seconds())
+         }
+         return self
+
+     def ttlSeconds(self, seconds: int):
+         """Set the time-to-live (TTL) for the session in seconds."""
+         self.dataproc_config.environment_config.execution_config.ttl = {
+             "seconds": seconds
+         }
+         return self
+
+     def idleTtl(self, duration: datetime.timedelta):
+         """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
+         self.dataproc_config.environment_config.execution_config.idle_ttl = {
+             "seconds": int(duration.total_seconds())
+         }
+         return self
+
+     def idleTtlSeconds(self, seconds: int):
+         """Set the idle time-to-live (idle TTL) for the session in seconds."""
+         self.dataproc_config.environment_config.execution_config.idle_ttl = {
+             "seconds": seconds
+         }
+         return self
+
+     def sessionTemplate(self, template: str):
+         self.dataproc_config.session_template = template
+         return self
+
+     def label(self, key: str, value: str):
+         """Add a single label to the session."""
+         return self.labels({key: value})
+
+     def labels(self, labels: Dict[str, str]):
+         # Filter out system labels and warn user
+         filtered_labels = {}
+         for key, value in labels.items():
+             if key in SYSTEM_LABELS:
+                 logger.warning(
+                     f"Label '{key}' is a system label and cannot be overridden by user. Ignoring."
+                 )
+             else:
+                 filtered_labels[key] = value
+
+         self.dataproc_config.labels.update(filtered_labels)
+         return self

      def remote(self, url: Optional[str] = None) -> "SparkSession.Builder":
          if url:
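Taken together, these additions give the builder a fluent surface for settings that previously required constructing a full Session proto and passing it to dataprocSessionConfig. A hedged usage sketch follows; the project, subnetwork, and service-account values are placeholders, it assumes project and region are supplied through the library's usual configuration, and getOrCreate() is the standard SparkSession.Builder entry point rather than anything added in this release:

```python
import datetime

from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Illustrative sketch only: all identifiers below are placeholders.
spark = (
    DataprocSparkSession.builder
    .runtimeVersion("2.3")
    .serviceAccount("sa@my-project.iam.gserviceaccount.com")  # also switches auth type to SERVICE_ACCOUNT
    .subnetwork("projects/my-project/regions/us-central1/subnetworks/default")
    .ttl(datetime.timedelta(hours=2))
    .idleTtlSeconds(900)
    .label("team", "data-eng")                        # user label, passed through
    .labels({"dataproc-session-client": "spoofed"})   # system label, dropped with a warning
    .getOrCreate()
)
```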
@@ -257,8 +349,7 @@ class DataprocSparkSession(SparkSession):
                  client_options=self._client_options
              ).create_session(session_request)
              self._display_session_link_on_creation(session_id)
-             # TODO: Add the 'View Session Details' button once the UI changes are done.
-             # self._display_view_session_details_button(session_id)
+             self._display_view_session_details_button(session_id)
              create_session_pbar_thread.start()
              session_response: Session = operation.result(
                  polling=retry.Retry(
@@ -376,8 +467,7 @@ class DataprocSparkSession(SparkSession):
              print(
                  f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
              )
-             # TODO: Add the 'View Session Details' button once the UI changes are done.
-             # self._display_view_session_details_button(s8s_session_id)
+             self._display_view_session_details_button(s8s_session_id)
          if session is None:
              session = self.__create_spark_connect_session_from_s8s(
                  session_response, session_name
@@ -400,11 +490,10 @@ class DataprocSparkSession(SparkSession):
          return session

      def _get_dataproc_config(self):
-         dataproc_config = Session()
-         if self._dataproc_config:
-             dataproc_config = self._dataproc_config
-         for k, v in self._options.items():
-             dataproc_config.runtime_config.properties[k] = v
+         # Use the property to ensure we always have a config
+         dataproc_config = self.dataproc_config
+         for k, v in self._options.items():
+             dataproc_config.runtime_config.properties[k] = v
          dataproc_config.spark_connect_session = (
              sessions.SparkConnectConfig()
          )
@@ -412,6 +501,11 @@ class DataprocSparkSession(SparkSession):
              dataproc_config.runtime_config.version = (
                  DataprocSparkSession._DEFAULT_RUNTIME_VERSION
              )
+
+         # Check for Python version mismatch with runtime for UDF compatibility
+         self._check_python_version_compatibility(
+             dataproc_config.runtime_config.version
+         )
          if (
              not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
              and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
@@ -451,6 +545,10 @@ class DataprocSparkSession(SparkSession):
                      os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
                  )
              }
+         client_environment = environment.get_client_environment_label()
+         dataproc_config.labels["dataproc-session-client"] = (
+             client_environment
+         )
          if "COLAB_NOTEBOOK_ID" in os.environ:
              colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
              # Extract the last part of the path, which is the ID
@@ -465,7 +563,7 @@ class DataprocSparkSession(SparkSession):
                      f"Only lowercase letters, numbers, and dashes are allowed. "
                      f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
                      f"Maximum length is 63 characters. "
-                     f"Skipping notebook ID label."
+                     f"Ignoring notebook ID label."
                  )
          default_datasource = os.getenv(
              "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
@@ -493,6 +591,32 @@ class DataprocSparkSession(SparkSession):
          )
          return dataproc_config

+     def _check_python_version_compatibility(self, runtime_version):
+         """Check if client Python version matches server Python version for UDF compatibility."""
+         import sys
+         import warnings
+
+         # Runtime version to server Python version mapping
+         RUNTIME_PYTHON_MAP = {
+             "1.2": (3, 12),
+             "2.2": (3, 12),
+             "2.3": (3, 11),
+         }
+
+         client_python = sys.version_info[:2]  # (major, minor)
+
+         if runtime_version in RUNTIME_PYTHON_MAP:
+             server_python = RUNTIME_PYTHON_MAP[runtime_version]
+
+             if client_python != server_python:
+                 warnings.warn(
+                     f"Python version mismatch detected: Client is using Python {client_python[0]}.{client_python[1]}, "
+                     f"but Dataproc runtime {runtime_version} uses Python {server_python[0]}.{server_python[1]}. "
+                     f"This mismatch may cause issues with Python UDF (User Defined Function) compatibility. "
+                     f"Consider using Python {server_python[0]}.{server_python[1]} for optimal UDF execution.",
+                     stacklevel=3,
+                 )
+
      def _display_view_session_details_button(self, session_id):
          try:
              session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
@@ -541,19 +665,57 @@ class DataprocSparkSession(SparkSession):

          super().__init__(connection, user_id)

-         base_method = self.client._execute_plan_request_with_metadata
+         execute_plan_request_base_method = (
+             self.client._execute_plan_request_with_metadata
+         )
+         execute_base_method = self.client._execute
+         execute_and_fetch_as_iterator_base_method = (
+             self.client._execute_and_fetch_as_iterator
+         )

-         def wrapped_method(*args, **kwargs):
-             req = base_method(*args, **kwargs)
+         def execute_plan_request_wrapped_method(*args, **kwargs):
+             req = execute_plan_request_base_method(*args, **kwargs)
              if not req.operation_id:
                  req.operation_id = str(uuid.uuid4())
                  logger.debug(
                      f"No operation_id found. Setting operation_id: {req.operation_id}"
                  )
-             self._display_operation_link(req.operation_id)
              return req

-         self.client._execute_plan_request_with_metadata = wrapped_method
+         self.client._execute_plan_request_with_metadata = (
+             execute_plan_request_wrapped_method
+         )
+
+         def execute_wrapped_method(client_self, req, *args, **kwargs):
+             if not self._sql_lazy_transformation(req):
+                 self._display_operation_link(req.operation_id)
+             execute_base_method(req, *args, **kwargs)
+
+         self.client._execute = MethodType(execute_wrapped_method, self.client)
+
+         def execute_and_fetch_as_iterator_wrapped_method(
+             client_self, req, *args, **kwargs
+         ):
+             if not self._sql_lazy_transformation(req):
+                 self._display_operation_link(req.operation_id)
+             return execute_and_fetch_as_iterator_base_method(
+                 req, *args, **kwargs
+             )
+
+         self.client._execute_and_fetch_as_iterator = MethodType(
+             execute_and_fetch_as_iterator_wrapped_method, self.client
+         )
+
+     @staticmethod
+     def _sql_lazy_transformation(req):
+         # Select SQL command
+         if req.plan and req.plan.command and req.plan.command.sql_command:
+             return (
+                 "select"
+                 in req.plan.command.sql_command.sql.strip().lower().split()
+             )
+
+         return False

      def _repr_html_(self) -> str:
          if not self._active_s8s_session_id:
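The net effect of the rewrapped client methods is that the operation link is only rendered for commands that do real work; _sql_lazy_transformation suppresses it when the SQL text contains a bare select token. Here is a hedged illustration of just that string heuristic, with the request plumbing omitted:

```python
def looks_lazy(sql: str) -> bool:
    # Mirrors the token test in _sql_lazy_transformation: any whitespace-separated
    # token equal to "select" marks the SQL command as a lazy transformation.
    return "select" in sql.strip().lower().split()

print(looks_lazy("SELECT * FROM sales"))           # True  -> operation link suppressed
print(looks_lazy("select id from t where x > 1"))  # True
print(looks_lazy("CREATE TABLE t AS SELECT 1"))    # True  ("select" is still a token)
print(looks_lazy("SHOW TABLES"))                   # False -> operation link displayed
```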
@@ -596,7 +758,7 @@ class DataprocSparkSession(SparkSession):
              return
          html_element = f"""
          <div>
-             <p><a href="{url}">Spark UI</a> (Operation: {operation_id})</p>
+             <p><a href="{url}">Spark Query</a> (Operation: {operation_id})</p>
          </div>
          """
          display(HTML(html_element))