dataproc-spark-connect 0.8.3-py2.py3-none-any.whl → 1.0.0-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,7 @@

  import atexit
  import datetime
+ import functools
  import json
  import logging
  import os
@@ -24,8 +25,9 @@ import threading
  import time
  import uuid
  import tqdm
+ from packaging import version
  from types import MethodType
- from typing import Any, cast, ClassVar, Dict, Optional, Union
+ from typing import Any, cast, ClassVar, Dict, Iterable, Optional, Union

  from google.api_core import retry
  from google.api_core.client_options import ClientOptions
@@ -43,12 +45,14 @@ from google.cloud.dataproc_spark_connect.pypi_artifacts import PyPiArtifacts
  from google.cloud.dataproc_v1 import (
  AuthenticationConfig,
  CreateSessionRequest,
+ DeleteSessionRequest,
  GetSessionRequest,
  Session,
  SessionControllerClient,
  TerminateSessionRequest,
  )
  from google.cloud.dataproc_v1.types import sessions
+ from google.cloud.dataproc_spark_connect import environment
  from pyspark.sql.connect.session import SparkSession
  from pyspark.sql.utils import to_str

@@ -56,6 +60,16 @@ from pyspark.sql.utils import to_str
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # System labels that should not be overridden by user
+ SYSTEM_LABELS = {
+ "dataproc-session-client",
+ "goog-colab-notebook-id",
+ }
+
+ _DATAPROC_SESSIONS_BASE_URL = (
+ "https://console.cloud.google.com/dataproc/interactive"
+ )
+

  def _is_valid_label_value(value: str) -> bool:
  """
@@ -77,6 +91,22 @@ def _is_valid_label_value(value: str) -> bool:
  return bool(re.match(pattern, value))


+ def _is_valid_session_id(session_id: str) -> bool:
+ """
+ Validates if a string complies with Google Cloud session ID format.
+ - Must be 4-63 characters
+ - Only lowercase letters, numbers, and dashes are allowed
+ - Must start with a lowercase letter
+ - Cannot end with a dash
+ """
+ if not session_id:
+ return False
+
+ # The pattern is sufficient for validation and already enforces length constraints.
+ pattern = r"^[a-z][a-z0-9-]{2,61}[a-z0-9]$"
+ return bool(re.match(pattern, session_id))
+
+
  class DataprocSparkSession(SparkSession):
  """The entry point to programming Spark with the Dataset and DataFrame API.

@@ -96,13 +126,16 @@ class DataprocSparkSession(SparkSession):
  ... ) # doctest: +SKIP
  """

- _DEFAULT_RUNTIME_VERSION = "2.3"
+ _DEFAULT_RUNTIME_VERSION = "3.0"
+ _MIN_RUNTIME_VERSION = "3.0"

  _active_s8s_session_uuid: ClassVar[Optional[str]] = None
  _project_id = None
  _region = None
  _client_options = None
  _active_s8s_session_id: ClassVar[Optional[str]] = None
+ _active_session_uses_custom_id: ClassVar[bool] = False
+ _execution_progress_bar = dict()

  class Builder(SparkSession.Builder):

@@ -110,6 +143,7 @@ class DataprocSparkSession(SparkSession):
  self._options: Dict[str, Any] = {}
  self._channel_builder: Optional[DataprocChannelBuilder] = None
  self._dataproc_config: Optional[Session] = None
+ self._custom_session_id: Optional[str] = None
  self._project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
  self._region = os.getenv("GOOGLE_CLOUD_REGION")
  self._client_options = ClientOptions(
@@ -118,6 +152,18 @@ class DataprocSparkSession(SparkSession):
  f"{self._region}-dataproc.googleapis.com",
  )
  )
+ self._session_controller_client: Optional[
+ SessionControllerClient
+ ] = None
+
+ @property
+ def session_controller_client(self) -> SessionControllerClient:
+ """Get or create a SessionControllerClient instance."""
+ if self._session_controller_client is None:
+ self._session_controller_client = SessionControllerClient(
+ client_options=self._client_options
+ )
+ return self._session_controller_client

  def projectId(self, project_id):
  self._project_id = project_id
@@ -131,12 +177,106 @@ class DataprocSparkSession(SparkSession):
  )
  return self

+ def dataprocSessionId(self, session_id: str):
+ """
+ Set a custom session ID for creating or reusing sessions.
+
+ The session ID must:
+ - Be 4-63 characters long
+ - Start with a lowercase letter
+ - Contain only lowercase letters, numbers, and hyphens
+ - Not end with a hyphen
+
+ Args:
+ session_id: The custom session ID to use
+
+ Returns:
+ This Builder instance for method chaining
+
+ Raises:
+ ValueError: If the session ID format is invalid
+ """
+ if not _is_valid_session_id(session_id):
+ raise ValueError(
+ f"Invalid session ID: '{session_id}'. "
+ "Session ID must be 4-63 characters, start with a lowercase letter, "
+ "contain only lowercase letters, numbers, and hyphens, "
+ "and not end with a hyphen."
+ )
+ self._custom_session_id = session_id
+ return self
+
  def dataprocSessionConfig(self, dataproc_config: Session):
+ self._dataproc_config = dataproc_config
+ for k, v in dataproc_config.runtime_config.properties.items():
+ self._options[cast(str, k)] = to_str(v)
+ return self
+
+ @property
+ def dataproc_config(self):
  with self._lock:
- self._dataproc_config = dataproc_config
- for k, v in dataproc_config.runtime_config.properties.items():
- self._options[cast(str, k)] = to_str(v)
- return self
+ self._dataproc_config = self._dataproc_config or Session()
+ return self._dataproc_config
+
+ def runtimeVersion(self, version: str):
+ self.dataproc_config.runtime_config.version = version
+ return self
+
+ def serviceAccount(self, account: str):
+ self.dataproc_config.environment_config.execution_config.service_account = (
+ account
+ )
+ return self
+
+ def subnetwork(self, subnet: str):
+ self.dataproc_config.environment_config.execution_config.subnetwork_uri = (
+ subnet
+ )
+ return self
+
+ def ttl(self, duration: datetime.timedelta):
+ """Set the time-to-live (TTL) for the session using a timedelta object."""
+ return self.ttlSeconds(int(duration.total_seconds()))
+
+ def ttlSeconds(self, seconds: int):
+ """Set the time-to-live (TTL) for the session in seconds."""
+ self.dataproc_config.environment_config.execution_config.ttl = {
+ "seconds": seconds
+ }
+ return self
+
+ def idleTtl(self, duration: datetime.timedelta):
+ """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
+ return self.idleTtlSeconds(int(duration.total_seconds()))
+
+ def idleTtlSeconds(self, seconds: int):
+ """Set the idle time-to-live (idle TTL) for the session in seconds."""
+ self.dataproc_config.environment_config.execution_config.idle_ttl = {
+ "seconds": seconds
+ }
+ return self
+
+ def sessionTemplate(self, template: str):
+ self.dataproc_config.session_template = template
+ return self
+
+ def label(self, key: str, value: str):
+ """Add a single label to the session."""
+ return self.labels({key: value})
+
+ def labels(self, labels: Dict[str, str]):
+ # Filter out system labels and warn user
+ filtered_labels = {}
+ for key, value in labels.items():
+ if key in SYSTEM_LABELS:
+ logger.warning(
+ f"Label '{key}' is a system label and cannot be overridden by user. Ignoring."
+ )
+ else:
+ filtered_labels[key] = value
+
+ self.dataproc_config.labels.update(filtered_labels)
+ return self

  def remote(self, url: Optional[str] = None) -> "SparkSession.Builder":
  if url:
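
Note (not part of the diff): the hunk above adds a fluent configuration surface to DataprocSparkSession.Builder. A rough, illustrative sketch of how these new methods might chain together follows; the import path, project ID, service account, and label values are placeholders and assumptions rather than values taken from the package, and the region is assumed to come from the GOOGLE_CLOUD_REGION environment variable read in Builder.__init__:

    import datetime
    # Assumed public import path for the client class.
    from google.cloud.dataproc_spark_connect import DataprocSparkSession

    spark = (
        DataprocSparkSession.builder
        .projectId("my-project")                        # placeholder project ID
        .dataprocSessionId("team-shared-session")       # custom, reusable session ID (validated)
        .runtimeVersion("3.0")                          # minimum supported runtime in 1.0.0
        .serviceAccount("sa@my-project.iam.gserviceaccount.com")  # placeholder service account
        .ttl(datetime.timedelta(hours=4))               # total session time-to-live
        .idleTtl(datetime.timedelta(minutes=30))        # idle time-to-live
        .label("team", "data-eng")                      # user label; system labels are ignored
        .getOrCreate()
    )
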
@@ -175,7 +315,11 @@ class DataprocSparkSession(SparkSession):
  assert self._channel_builder is not None
  session = DataprocSparkSession(connection=self._channel_builder)

+ # Register handler for Cell Execution Progress bar
+ session._register_progress_execution_handler()
+
  DataprocSparkSession._set_default_and_active_session(session)
+
  return session

  def __create(self) -> "DataprocSparkSession":
@@ -190,7 +334,16 @@ class DataprocSparkSession(SparkSession):

  dataproc_config: Session = self._get_dataproc_config()

- session_id = self.generate_dataproc_session_id()
+ # Check runtime version compatibility before creating session
+ self._check_runtime_compatibility(dataproc_config)
+
+ # Use custom session ID if provided, otherwise generate one
+ session_id = (
+ self._custom_session_id
+ if self._custom_session_id
+ else self.generate_dataproc_session_id()
+ )
+
  dataproc_config.name = f"projects/{self._project_id}/locations/{self._region}/sessions/{session_id}"
  logger.debug(
  f"Dataproc Session configuration:\n{dataproc_config}"
@@ -205,6 +358,10 @@ class DataprocSparkSession(SparkSession):

  logger.debug("Creating Dataproc Session")
  DataprocSparkSession._active_s8s_session_id = session_id
+ # Track whether this session uses a custom ID (unmanaged) or auto-generated ID (managed)
+ DataprocSparkSession._active_session_uses_custom_id = (
+ self._custom_session_id is not None
+ )
  s8s_creation_start_time = time.time()

  stop_create_session_pbar_event = threading.Event()
@@ -258,8 +415,7 @@ class DataprocSparkSession(SparkSession):
  client_options=self._client_options
  ).create_session(session_request)
  self._display_session_link_on_creation(session_id)
- # TODO: Add the 'View Session Details' button once the UI changes are done.
- # self._display_view_session_details_button(session_id)
+ self._display_view_session_details_button(session_id)
  create_session_pbar_thread.start()
  session_response: Session = operation.result(
  polling=retry.Retry(
@@ -296,6 +452,7 @@ class DataprocSparkSession(SparkSession):
  if create_session_pbar_thread.is_alive():
  create_session_pbar_thread.join()
  DataprocSparkSession._active_s8s_session_id = None
+ DataprocSparkSession._active_session_uses_custom_id = False
  raise DataprocSparkConnectException(
  f"Error while creating Dataproc Session: {e.message}"
  )
@@ -304,6 +461,7 @@ class DataprocSparkSession(SparkSession):
  if create_session_pbar_thread.is_alive():
  create_session_pbar_thread.join()
  DataprocSparkSession._active_s8s_session_id = None
+ DataprocSparkSession._active_session_uses_custom_id = False
  raise RuntimeError(
  f"Error while creating Dataproc Session"
  ) from e
@@ -317,16 +475,43 @@ class DataprocSparkSession(SparkSession):
  session_response, dataproc_config.name
  )

+ def _wait_for_session_available(
+ self, session_name: str, timeout: int = 300
+ ) -> Session:
+ start_time = time.time()
+ while time.time() - start_time < timeout:
+ try:
+ session = self.session_controller_client.get_session(
+ name=session_name
+ )
+ if "Spark Connect Server" in session.runtime_info.endpoints:
+ return session
+ time.sleep(5)
+ except Exception as e:
+ logger.warning(
+ f"Error while polling for Spark Connect endpoint: {e}"
+ )
+ time.sleep(5)
+ raise RuntimeError(
+ f"Spark Connect endpoint not available for session {session_name} after {timeout} seconds."
+ )
+
  def _display_session_link_on_creation(self, session_id):
- session_url = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
+ session_url = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{session_id}?project={self._project_id}"
  plain_message = f"Creating Dataproc Session: {session_url}"
- html_element = f"""
+ if environment.is_colab_enterprise():
+ html_element = f"""
  <div>
  <p>Creating Dataproc Spark Session<p>
- <p><a href="{session_url}">Dataproc Session</a></p>
  </div>
- """
-
+ """
+ else:
+ html_element = f"""
+ <div>
+ <p>Creating Dataproc Spark Session<p>
+ <p><a href="{session_url}">Dataproc Session</a></p>
+ </div>
+ """
  self._output_element_or_message(plain_message, html_element)

  def _print_session_created_message(self):
@@ -345,16 +530,19 @@ class DataprocSparkSession(SparkSession):
  :param html_element: HTML element to display for interactive IPython
  environment
  """
+ # Don't print any output (Rich or Plain) for non-interactive
+ if not environment.is_interactive():
+ return
+
+ if environment.is_interactive_terminal():
+ print(plain_message)
+ return
+
  try:
  from IPython.display import display, HTML
- from IPython.core.interactiveshell import InteractiveShell

- if not InteractiveShell.initialized():
- raise DataprocSparkConnectException(
- "Not in an Interactive IPython Environment"
- )
  display(HTML(html_element))
- except (ImportError, DataprocSparkConnectException):
+ except ImportError:
  print(plain_message)

  def _get_exiting_active_session(
@@ -375,11 +563,13 @@ class DataprocSparkSession(SparkSession):

  if session_response is not None:
  print(
- f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
+ f"Using existing Dataproc Session (configuration changes may not be applied): {_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{s8s_session_id}?project={self._project_id}"
  )
- # TODO: Add the 'View Session Details' button once the UI changes are done.
- # self._display_view_session_details_button(s8s_session_id)
+ self._display_view_session_details_button(s8s_session_id)
  if session is None:
+ session_response = self._wait_for_session_available(
+ session_name
+ )
  session = self.__create_spark_connect_session_from_s8s(
  session_response, session_name
  )
@@ -395,17 +585,59 @@ class DataprocSparkSession(SparkSession):

  def getOrCreate(self) -> "DataprocSparkSession":
  with DataprocSparkSession._lock:
+ if environment.is_dataproc_batch():
+ # For Dataproc batch workloads, connect to the already initialized local SparkSession
+ from pyspark.sql import SparkSession as PySparkSQLSession
+
+ session = PySparkSQLSession.builder.getOrCreate()
+ return session # type: ignore
+
+ if self._project_id is None:
+ raise DataprocSparkConnectException(
+ f"Error while creating Dataproc Session: project ID is not set"
+ )
+
+ if self._region is None:
+ raise DataprocSparkConnectException(
+ f"Error while creating Dataproc Session: location is not set"
+ )
+
+ # Handle custom session ID by setting it early and letting existing logic handle it
+ if self._custom_session_id:
+ self._handle_custom_session_id()
+
  session = self._get_exiting_active_session()
  if session is None:
  session = self.__create()
+
+ # Register this session as the instantiated SparkSession for compatibility
+ # with tools and libraries that expect SparkSession._instantiatedSession
+ from pyspark.sql import SparkSession as PySparkSQLSession
+
+ PySparkSQLSession._instantiatedSession = session
+
  return session

+ def _handle_custom_session_id(self):
+ """Handle custom session ID by checking if it exists and setting _active_s8s_session_id."""
+ session_response = self._get_session_by_id(self._custom_session_id)
+ if session_response is not None:
+ # Found an active session with the custom ID, set it as the active session
+ DataprocSparkSession._active_s8s_session_id = (
+ self._custom_session_id
+ )
+ # Mark that this session uses a custom ID
+ DataprocSparkSession._active_session_uses_custom_id = True
+ else:
+ # No existing session found, clear any existing active session ID
+ # so we'll create a new one with the custom ID
+ DataprocSparkSession._active_s8s_session_id = None
+
  def _get_dataproc_config(self):
- dataproc_config = Session()
- if self._dataproc_config:
- dataproc_config = self._dataproc_config
- for k, v in self._options.items():
- dataproc_config.runtime_config.properties[k] = v
+ # Use the property to ensure we always have a config
+ dataproc_config = self.dataproc_config
+ for k, v in self._options.items():
+ dataproc_config.runtime_config.properties[k] = v
  dataproc_config.spark_connect_session = (
  sessions.SparkConnectConfig()
  )
@@ -413,20 +645,38 @@ class DataprocSparkSession(SparkSession):
  dataproc_config.runtime_config.version = (
  DataprocSparkSession._DEFAULT_RUNTIME_VERSION
  )
+
+ # Check for Python version mismatch with runtime for UDF compatibility
+ self._check_python_version_compatibility(
+ dataproc_config.runtime_config.version
+ )
+
+ # Use local variable to improve readability of deeply nested attribute access
+ exec_config = dataproc_config.environment_config.execution_config
+
+ # Set service account from environment if not already set
  if (
- not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
- and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
- ):
- dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = AuthenticationConfig.AuthenticationType[
- os.getenv("DATAPROC_SPARK_CONNECT_AUTH_TYPE")
- ]
- if (
- not dataproc_config.environment_config.execution_config.service_account
+ not exec_config.service_account
  and "DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT" in os.environ
  ):
- dataproc_config.environment_config.execution_config.service_account = os.getenv(
+ exec_config.service_account = os.getenv(
  "DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT"
  )
+
+ # Auto-set authentication type to SERVICE_ACCOUNT when service account is provided
+ if exec_config.service_account:
+ # When service account is provided, explicitly set auth type to SERVICE_ACCOUNT
+ exec_config.authentication_config.user_workload_authentication_type = (
+ AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
+ )
+ elif (
+ not exec_config.authentication_config.user_workload_authentication_type
+ and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
+ ):
+ # Only set auth type from environment if no service account is present
+ exec_config.authentication_config.user_workload_authentication_type = AuthenticationConfig.AuthenticationType[
+ os.getenv("DATAPROC_SPARK_CONNECT_AUTH_TYPE")
+ ]
  if (
  not dataproc_config.environment_config.execution_config.subnetwork_uri
  and "DATAPROC_SPARK_CONNECT_SUBNET" in os.environ
@@ -452,6 +702,10 @@ class DataprocSparkSession(SparkSession):
  os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
  )
  }
+ client_environment = environment.get_client_environment_label()
+ dataproc_config.labels["dataproc-session-client"] = (
+ client_environment
+ )
  if "COLAB_NOTEBOOK_ID" in os.environ:
  colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
  # Extract the last part of the path, which is the ID
@@ -466,37 +720,102 @@ class DataprocSparkSession(SparkSession):
  f"Only lowercase letters, numbers, and dashes are allowed. "
  f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
  f"Maximum length is 63 characters. "
- f"Skipping notebook ID label."
+ f"Ignoring notebook ID label."
  )
  default_datasource = os.getenv(
  "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
  )
- if (
- default_datasource
- and dataproc_config.runtime_config.version == "2.3"
- ):
- if default_datasource == "bigquery":
- bq_datasource_properties = {
- "spark.datasource.bigquery.viewsEnabled": "true",
- "spark.datasource.bigquery.writeMethod": "direct",
+ match default_datasource:
+ case "bigquery":
+ # Merge default configs with existing properties,
+ # user configs take precedence
+ for k, v in {
  "spark.sql.catalog.spark_catalog": "com.google.cloud.spark.bigquery.BigQuerySparkSessionCatalog",
- "spark.sql.legacy.createHiveTableByDefault": "false",
  "spark.sql.sources.default": "bigquery",
- }
- # Merge default configs with existing properties, user configs take precedence
- for k, v in bq_datasource_properties.items():
+ }.items():
  if k not in dataproc_config.runtime_config.properties:
  dataproc_config.runtime_config.properties[k] = v
- else:
- logger.warning(
- f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
- f" {default_datasource}. Supported value is 'bigquery'."
- )
+ case _:
+ if default_datasource:
+ logger.warning(
+ f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
+ f" {default_datasource}. Supported value is 'bigquery'."
+ )
+
  return dataproc_config

+ def _check_python_version_compatibility(self, runtime_version):
+ """Check if client Python version matches server Python version for UDF compatibility."""
+ import sys
+ import warnings
+
+ # Runtime version to server Python version mapping
+ RUNTIME_PYTHON_MAP = {
+ "3.0": (3, 12),
+ }
+
+ client_python = sys.version_info[:2] # (major, minor)
+
+ if runtime_version in RUNTIME_PYTHON_MAP:
+ server_python = RUNTIME_PYTHON_MAP[runtime_version]
+
+ if client_python != server_python:
+ warnings.warn(
+ f"Python version mismatch detected: Client is using Python {client_python[0]}.{client_python[1]}, "
+ f"but Dataproc runtime {runtime_version} uses Python {server_python[0]}.{server_python[1]}. "
+ f"This mismatch may cause issues with Python UDF (User Defined Function) compatibility. "
+ f"Consider using Python {server_python[0]}.{server_python[1]} for optimal UDF execution.",
+ stacklevel=3,
+ )
+
+ def _check_runtime_compatibility(self, dataproc_config):
+ """Check if runtime version 3.0 client is compatible with older runtime versions.
+
+ Runtime version 3.0 clients do not support older runtime versions (pre-3.0).
+ There is no backward or forward compatibility between different runtime versions.
+
+ Args:
+ dataproc_config: The Session configuration containing runtime version
+
+ Raises:
+ DataprocSparkConnectException: If server is using pre-3.0 runtime version
+ """
+ runtime_version = dataproc_config.runtime_config.version
+
+ if not runtime_version:
+ return
+
+ logger.debug(f"Detected server runtime version: {runtime_version}")
+
+ # Parse runtime version to check if it's below minimum supported version
+ try:
+ server_version = version.parse(runtime_version)
+ min_version = version.parse(
+ DataprocSparkSession._MIN_RUNTIME_VERSION
+ )
+
+ if server_version < min_version:
+ raise DataprocSparkConnectException(
+ f"Specified {runtime_version} Dataproc Runtime version is not supported, "
+ f"use {DataprocSparkSession._MIN_RUNTIME_VERSION} version or higher."
+ )
+ except version.InvalidVersion:
+ # If we can't parse the version, log a warning but continue
+ logger.warning(
+ f"Could not parse runtime version: {runtime_version}"
+ )
+
  def _display_view_session_details_button(self, session_id):
+ # Display button is only supported in colab enterprise
+ if not environment.is_colab_enterprise():
+ return
+
+ # Skip button display for colab enterprise IPython terminals
+ if environment.is_interactive_terminal():
+ return
+
  try:
- session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
+ session_url = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{session_id}?project={self._project_id}"
  from IPython.core.interactiveshell import InteractiveShell

  if not InteractiveShell.initialized():
@@ -510,6 +829,90 @@ class DataprocSparkSession(SparkSession):
  except ImportError as e:
  logger.debug(f"Import error: {e}")

+ def _get_session_by_id(self, session_id: str) -> Optional[Session]:
+ """
+ Get existing session by ID.
+
+ Returns:
+ Session if ACTIVE/CREATING, None if not found or not usable
+ """
+ session_name = f"projects/{self._project_id}/locations/{self._region}/sessions/{session_id}"
+
+ try:
+ get_request = GetSessionRequest(name=session_name)
+ session = self.session_controller_client.get_session(
+ get_request
+ )
+
+ logger.debug(
+ f"Found existing session {session_id} in state: {session.state}"
+ )
+
+ if session.state in [
+ Session.State.ACTIVE,
+ Session.State.CREATING,
+ ]:
+ # Reuse the active session
+ logger.info(f"Reusing existing session: {session_id}")
+ return session
+ else:
+ # Session exists but is not usable (terminated/failed/terminating)
+ logger.info(
+ f"Session {session_id} in {session.state.name} state, cannot reuse"
+ )
+ return None
+
+ except NotFound:
+ # Session doesn't exist, can create new one
+ logger.debug(
+ f"Session {session_id} not found, can create new one"
+ )
+ return None
+ except Exception as e:
+ logger.error(f"Error checking session {session_id}: {e}")
+ return None
+
+ def _delete_session(self, session_name: str):
+ """Delete a session to free up the session ID for reuse."""
+ try:
+ delete_request = DeleteSessionRequest(name=session_name)
+ self.session_controller_client.delete_session(delete_request)
+ logger.debug(f"Deleted session: {session_name}")
+ except NotFound:
+ logger.debug(f"Session already deleted: {session_name}")
+
+ def _wait_for_termination(self, session_name: str, timeout: int = 180):
+ """Wait for a session to finish terminating."""
+ start_time = time.time()
+
+ while time.time() - start_time < timeout:
+ try:
+ get_request = GetSessionRequest(name=session_name)
+ session = self.session_controller_client.get_session(
+ get_request
+ )
+
+ if session.state in [
+ Session.State.TERMINATED,
+ Session.State.FAILED,
+ ]:
+ return
+ elif session.state != Session.State.TERMINATING:
+ # Session is in unexpected state
+ logger.warning(
+ f"Session {session_name} in unexpected state while waiting for termination: {session.state}"
+ )
+ return
+
+ time.sleep(2)
+ except NotFound:
+ # Session was deleted
+ return
+
+ logger.warning(
+ f"Timeout waiting for session {session_name} to terminate"
+ )
+
  @staticmethod
  def generate_dataproc_session_id():
  timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
@@ -583,16 +986,111 @@ class DataprocSparkSession(SparkSession):
  execute_and_fetch_as_iterator_wrapped_method, self.client
  )

+ # Patching clearProgressHandlers method to not remove Dataproc Progress Handler
+ clearProgressHandlers_base_method = self.clearProgressHandlers
+
+ def clearProgressHandlers_wrapper_method(_, *args, **kwargs):
+ clearProgressHandlers_base_method(*args, **kwargs)
+
+ self._register_progress_execution_handler()
+
+ self.clearProgressHandlers = MethodType(
+ clearProgressHandlers_wrapper_method, self
+ )
+
+ @staticmethod
+ @functools.lru_cache(maxsize=1)
+ def get_tqdm_bar():
+ """
+ Return a tqdm implementation that works in the current environment.
+
+ - Uses CLI tqdm for interactive terminals.
+ - Uses the notebook tqdm if available, otherwise falls back to CLI tqdm.
+ """
+ from tqdm import tqdm as cli_tqdm
+
+ if environment.is_interactive_terminal():
+ return cli_tqdm
+
+ try:
+ import ipywidgets
+ from tqdm.notebook import tqdm as notebook_tqdm
+
+ return notebook_tqdm
+ except ImportError:
+ return cli_tqdm
+
+ def _register_progress_execution_handler(self):
+ from pyspark.sql.connect.shell.progress import StageInfo
+
+ def handler(
+ stages: Optional[Iterable[StageInfo]],
+ inflight_tasks: int,
+ operation_id: Optional[str],
+ done: bool,
+ ):
+ if operation_id is None:
+ return
+
+ # Don't build / render progress bar for non-interactive (despite
+ # Ipython or non-IPython)
+ if not environment.is_interactive():
+ return
+
+ total_tasks = 0
+ completed_tasks = 0
+
+ for stage in stages or []:
+ total_tasks += stage.num_tasks
+ completed_tasks += stage.num_completed_tasks
+
+ # Don't show progress bar till we receive some tasks
+ if total_tasks == 0:
+ return
+
+ # Get correct tqdm (notebook or CLI)
+ tqdm_pbar = self.get_tqdm_bar()
+
+ # Use a lock to ensure only one thread can access and modify
+ # the shared dictionaries at a time.
+ with self._lock:
+ if operation_id in self._execution_progress_bar:
+ pbar = self._execution_progress_bar[operation_id]
+ if pbar.total != total_tasks:
+ pbar.reset(
+ total=total_tasks
+ ) # This force resets the progress bar % too on next refresh
+ else:
+ pbar = tqdm_pbar(
+ total=total_tasks,
+ leave=True,
+ dynamic_ncols=True,
+ bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} Tasks",
+ )
+ self._execution_progress_bar[operation_id] = pbar
+
+ # To handle skipped or failed tasks.
+ # StageInfo proto doesn't have skipped and failed tasks information to process.
+ if done and completed_tasks < total_tasks:
+ completed_tasks = total_tasks
+
+ pbar.n = completed_tasks
+ pbar.refresh()
+
+ if done:
+ pbar.close()
+ self._execution_progress_bar.pop(operation_id, None)
+
+ self.registerProgressHandler(handler)
+
  @staticmethod
  def _sql_lazy_transformation(req):
  # Select SQL command
- if req.plan and req.plan.command and req.plan.command.sql_command:
- return (
- "select"
- in req.plan.command.sql_command.sql.strip().lower().split()
- )
-
- return False
+ try:
+ query = req.plan.command.sql_command.input.sql.query
+ return "select" in query.strip().lower().split()
+ except AttributeError:
+ return False

  def _repr_html_(self) -> str:
  if not self._active_s8s_session_id:
@@ -600,7 +1098,7 @@ class DataprocSparkSession(SparkSession):
  <div>No Active Dataproc Session</div>
  """

- s8s_session = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{self._active_s8s_session_id}"
+ s8s_session = f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/{self._active_s8s_session_id}"
  ui = f"{s8s_session}/sparkApplications/applications"
  return f"""
  <div>
@@ -612,6 +1110,11 @@ class DataprocSparkSession(SparkSession):
  """

  def _display_operation_link(self, operation_id: str):
+ # Don't print per-operation Spark UI link for non-interactive (despite
+ # Ipython or non-IPython)
+ if not environment.is_interactive():
+ return
+
  assert all(
  [
  operation_id is not None,
@@ -622,17 +1125,18 @@ class DataprocSparkSession(SparkSession):
  )

  url = (
- f"https://console.cloud.google.com/dataproc/interactive/{self._region}/"
+ f"{_DATAPROC_SESSIONS_BASE_URL}/{self._region}/"
  f"{self._active_s8s_session_id}/sparkApplications/application;"
  f"associatedSqlOperationId={operation_id}?project={self._project_id}"
  )

+ if environment.is_interactive_terminal():
+ print(f"Spark Query: {url}")
+ return
+
  try:
  from IPython.display import display, HTML
- from IPython.core.interactiveshell import InteractiveShell

- if not InteractiveShell.initialized():
- return
  html_element = f"""
  <div>
  <p><a href="{url}">Spark Query</a> (Operation: {operation_id})</p>
@@ -690,7 +1194,7 @@ class DataprocSparkSession(SparkSession):
  This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws
  an exception.
  Regarding pypi: Popular packages are already pre-installed in s8s runtime.
- https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.2#python_libraries
+ https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.3#python_libraries
  If there are conflicts/package doesn't exist, it throws an exception.
  """
  if sum([pypi, file, pyfile, archive]) > 1:
@@ -713,19 +1217,83 @@ class DataprocSparkSession(SparkSession):
  def _get_active_session_file_path():
  return os.getenv("DATAPROC_SPARK_CONNECT_ACTIVE_SESSION_FILE_PATH")

- def stop(self) -> None:
+ def stop(self, terminate: Optional[bool] = None) -> None:
+ """
+ Stop the Spark session and optionally terminate the server-side session.
+
+ Parameters
+ ----------
+ terminate : bool, optional
+ Control server-side termination behavior.
+
+ - None (default): Auto-detect based on session type
+
+ - Managed sessions (auto-generated ID): terminate server
+ - Named sessions (custom ID): client-side cleanup only
+
+ - True: Always terminate the server-side session
+ - False: Never terminate the server-side session (client cleanup only)
+
+ Examples
+ --------
+ Auto-detect termination behavior (existing behavior):
+
+ >>> spark.stop()
+
+ Force terminate a named session:
+
+ >>> spark.stop(terminate=True)
+
+ Prevent termination of a managed session:
+
+ >>> spark.stop(terminate=False)
+ """
  with DataprocSparkSession._lock:
  if DataprocSparkSession._active_s8s_session_id is not None:
- terminate_s8s_session(
- DataprocSparkSession._project_id,
- DataprocSparkSession._region,
- DataprocSparkSession._active_s8s_session_id,
- self._client_options,
- )
+ # Determine if we should terminate the server-side session
+ if terminate is None:
+ # Auto-detect: managed sessions terminate, named sessions don't
+ should_terminate = (
+ not DataprocSparkSession._active_session_uses_custom_id
+ )
+ else:
+ should_terminate = terminate
+
+ if should_terminate:
+ # Terminate the server-side session
+ logger.debug(
+ f"Terminating session {DataprocSparkSession._active_s8s_session_id}"
+ )
+ terminate_s8s_session(
+ DataprocSparkSession._project_id,
+ DataprocSparkSession._region,
+ DataprocSparkSession._active_s8s_session_id,
+ self._client_options,
+ )
+ else:
+ # Client-side cleanup only
+ logger.debug(
+ f"Stopping session {DataprocSparkSession._active_s8s_session_id} without termination"
+ )

  self._remove_stopped_session_from_file()
+
+ # Clean up SparkSession._instantiatedSession if it points to this session
+ try:
+ from pyspark.sql import SparkSession as PySparkSQLSession
+
+ if PySparkSQLSession._instantiatedSession is self:
+ PySparkSQLSession._instantiatedSession = None
+ logger.debug(
+ "Cleared SparkSession._instantiatedSession reference"
+ )
+ except (ImportError, AttributeError):
+ # PySpark not available or _instantiatedSession doesn't exist
+ pass
+
  DataprocSparkSession._active_s8s_session_uuid = None
  DataprocSparkSession._active_s8s_session_id = None
+ DataprocSparkSession._active_session_uses_custom_id = False
  DataprocSparkSession._project_id = None
  DataprocSparkSession._region = None
  DataprocSparkSession._client_options = None
  DataprocSparkSession._client_options = None