dataproc-spark-connect 0.8.3__tar.gz → 1.0.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/PKG-INFO +2 -2
  2. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/PKG-INFO +2 -2
  3. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/SOURCES.txt +1 -0
  4. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/requires.txt +1 -1
  5. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/client/core.py +5 -3
  6. dataproc_spark_connect-1.0.0rc1/google/cloud/dataproc_spark_connect/environment.py +76 -0
  7. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/session.py +130 -31
  8. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/setup.py +2 -2
  9. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/LICENSE +0 -0
  10. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/README.md +0 -0
  11. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
  12. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
  13. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
  14. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
  15. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
  16. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/exceptions.py +0 -0
  17. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
  18. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/pyproject.toml +0 -0
  19. {dataproc_spark_connect-0.8.3 → dataproc_spark_connect-1.0.0rc1}/setup.cfg +0 -0
--- dataproc_spark_connect-0.8.3/PKG-INFO
+++ dataproc_spark_connect-1.0.0rc1/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 0.8.3
+ Version: 1.0.0rc1
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
@@ -9,7 +9,7 @@ License-File: LICENSE
  Requires-Dist: google-api-core>=2.19
  Requires-Dist: google-cloud-dataproc>=5.18
  Requires-Dist: packaging>=20.0
- Requires-Dist: pyspark[connect]~=3.5.1
+ Requires-Dist: pyspark[connect]~=4.0.0
  Requires-Dist: tqdm>=4.67
  Requires-Dist: websockets>=14.0
  Dynamic: author
--- dataproc_spark_connect-0.8.3/dataproc_spark_connect.egg-info/PKG-INFO
+++ dataproc_spark_connect-1.0.0rc1/dataproc_spark_connect.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 0.8.3
+ Version: 1.0.0rc1
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
@@ -9,7 +9,7 @@ License-File: LICENSE
  Requires-Dist: google-api-core>=2.19
  Requires-Dist: google-cloud-dataproc>=5.18
  Requires-Dist: packaging>=20.0
- Requires-Dist: pyspark[connect]~=3.5.1
+ Requires-Dist: pyspark[connect]~=4.0.0
  Requires-Dist: tqdm>=4.67
  Requires-Dist: websockets>=14.0
  Dynamic: author
--- dataproc_spark_connect-0.8.3/dataproc_spark_connect.egg-info/SOURCES.txt
+++ dataproc_spark_connect-1.0.0rc1/dataproc_spark_connect.egg-info/SOURCES.txt
@@ -9,6 +9,7 @@ dataproc_spark_connect.egg-info/dependency_links.txt
  dataproc_spark_connect.egg-info/requires.txt
  dataproc_spark_connect.egg-info/top_level.txt
  google/cloud/dataproc_spark_connect/__init__.py
+ google/cloud/dataproc_spark_connect/environment.py
  google/cloud/dataproc_spark_connect/exceptions.py
  google/cloud/dataproc_spark_connect/pypi_artifacts.py
  google/cloud/dataproc_spark_connect/session.py
--- dataproc_spark_connect-0.8.3/dataproc_spark_connect.egg-info/requires.txt
+++ dataproc_spark_connect-1.0.0rc1/dataproc_spark_connect.egg-info/requires.txt
@@ -1,6 +1,6 @@
  google-api-core>=2.19
  google-cloud-dataproc>=5.18
  packaging>=20.0
- pyspark[connect]~=3.5.1
+ pyspark[connect]~=4.0.0
  tqdm>=4.67
  websockets>=14.0
--- dataproc_spark_connect-0.8.3/google/cloud/dataproc_spark_connect/client/core.py
+++ dataproc_spark_connect-1.0.0rc1/google/cloud/dataproc_spark_connect/client/core.py
@@ -15,14 +15,14 @@ import logging

  import google
  import grpc
- from pyspark.sql.connect.client import ChannelBuilder
+ from pyspark.sql.connect.client import DefaultChannelBuilder

  from . import proxy

  logger = logging.getLogger(__name__)


- class DataprocChannelBuilder(ChannelBuilder):
+ class DataprocChannelBuilder(DefaultChannelBuilder):
      """
      This is a helper class that is used to create a GRPC channel based on the given
      connection string per the documentation of Spark Connect.
@@ -88,7 +88,9 @@ class ProxiedChannel(grpc.Channel):
          self._proxy = proxy.DataprocSessionProxy(0, target_host)
          self._proxy.start()
          self._proxied_connect_url = f"sc://localhost:{self._proxy.port}"
-         self._wrapped = ChannelBuilder(self._proxied_connect_url).toChannel()
+         self._wrapped = DefaultChannelBuilder(
+             self._proxied_connect_url
+         ).toChannel()

      def __enter__(self):
          return self
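
These changes track the renamed Spark Connect channel builder that ships with the pyspark[connect]~=4.0.0 pin above: code that previously instantiated ChannelBuilder now goes through DefaultChannelBuilder. A minimal sketch of the renamed builder on its own, mirroring the ProxiedChannel change (the sc:// endpoint is a placeholder, not something this package hard-codes):

    # Illustrative only: build a gRPC channel with the PySpark 4.x builder name.
    from pyspark.sql.connect.client import DefaultChannelBuilder

    channel = DefaultChannelBuilder("sc://localhost:15002").toChannel()
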
--- /dev/null
+++ dataproc_spark_connect-1.0.0rc1/google/cloud/dataproc_spark_connect/environment.py
@@ -0,0 +1,76 @@
+ # Copyright 2025 Google LLC
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ from typing import Callable, Tuple, List
+
+
+ def is_vscode() -> bool:
+     """True if running inside VS Code at all."""
+     return os.getenv("VSCODE_PID") is not None
+
+
+ def is_jupyter() -> bool:
+     """True if running in a Jupyter environment."""
+     return os.getenv("JPY_PARENT_PID") is not None
+
+
+ def is_colab_enterprise() -> bool:
+     """True if running in Colab Enterprise (Vertex AI)."""
+     return os.getenv("VERTEX_PRODUCT") == "COLAB_ENTERPRISE"
+
+
+ def is_colab() -> bool:
+     """True if running in Google Colab."""
+     return os.getenv("COLAB_RELEASE_TAG") is not None
+
+
+ def is_workbench() -> bool:
+     """True if running in AI Workbench (managed Jupyter)."""
+     return os.getenv("VERTEX_PRODUCT") == "WORKBENCH_INSTANCE"
+
+
+ def is_jetbrains_ide() -> bool:
+     """True if running inside any JetBrains IDE."""
+     return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()
+
+
+ def get_client_environment_label() -> str:
+     """
+     Map current environment to a standardized client label.
+
+     Priority order:
+     1. Colab Enterprise ("colab-enterprise")
+     2. Colab ("colab")
+     3. Workbench ("workbench-jupyter")
+     4. VS Code ("vscode")
+     5. JetBrains IDE ("jetbrains")
+     6. Jupyter ("jupyter")
+     7. Unknown ("unknown")
+     """
+     checks: List[Tuple[Callable[[], bool], str]] = [
+         (is_colab_enterprise, "colab-enterprise"),
+         (is_colab, "colab"),
+         (is_workbench, "workbench-jupyter"),
+         (is_vscode, "vscode"),
+         (is_jetbrains_ide, "jetbrains"),
+         (is_jupyter, "jupyter"),
+     ]
+     for detector, label in checks:
+         try:
+             if detector():
+                 return label
+         except Exception:
+             pass
+     return "unknown"
--- dataproc_spark_connect-0.8.3/google/cloud/dataproc_spark_connect/session.py
+++ dataproc_spark_connect-1.0.0rc1/google/cloud/dataproc_spark_connect/session.py
@@ -49,6 +49,7 @@ from google.cloud.dataproc_v1 import (
      TerminateSessionRequest,
  )
  from google.cloud.dataproc_v1.types import sessions
+ from google.cloud.dataproc_spark_connect import environment
  from pyspark.sql.connect.session import SparkSession
  from pyspark.sql.utils import to_str

@@ -56,6 +57,12 @@ from pyspark.sql.utils import to_str
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # System labels that should not be overridden by user
+ SYSTEM_LABELS = {
+     "dataproc-session-client",
+     "goog-colab-notebook-id",
+ }
+

  def _is_valid_label_value(value: str) -> bool:
      """
@@ -96,7 +103,7 @@ class DataprocSparkSession(SparkSession):
      ... ) # doctest: +SKIP
      """

-     _DEFAULT_RUNTIME_VERSION = "2.3"
+     _DEFAULT_RUNTIME_VERSION = "3.0"

      _active_s8s_session_uuid: ClassVar[Optional[str]] = None
      _project_id = None
@@ -132,11 +139,76 @@
          return self

      def dataprocSessionConfig(self, dataproc_config: Session):
+         self._dataproc_config = dataproc_config
+         for k, v in dataproc_config.runtime_config.properties.items():
+             self._options[cast(str, k)] = to_str(v)
+         return self
+
+     @property
+     def dataproc_config(self):
          with self._lock:
-             self._dataproc_config = dataproc_config
-             for k, v in dataproc_config.runtime_config.properties.items():
-                 self._options[cast(str, k)] = to_str(v)
-             return self
+             self._dataproc_config = self._dataproc_config or Session()
+             return self._dataproc_config
+
+     def runtimeVersion(self, version: str):
+         self.dataproc_config.runtime_config.version = version
+         return self
+
+     def serviceAccount(self, account: str):
+         self.dataproc_config.environment_config.execution_config.service_account = (
+             account
+         )
+         return self
+
+     def subnetwork(self, subnet: str):
+         self.dataproc_config.environment_config.execution_config.subnetwork_uri = (
+             subnet
+         )
+         return self
+
+     def ttl(self, duration: datetime.timedelta):
+         """Set the time-to-live (TTL) for the session using a timedelta object."""
+         return self.ttlSeconds(int(duration.total_seconds()))
+
+     def ttlSeconds(self, seconds: int):
+         """Set the time-to-live (TTL) for the session in seconds."""
+         self.dataproc_config.environment_config.execution_config.ttl = {
+             "seconds": seconds
+         }
+         return self
+
+     def idleTtl(self, duration: datetime.timedelta):
+         """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
+         return self.idleTtlSeconds(int(duration.total_seconds()))
+
+     def idleTtlSeconds(self, seconds: int):
+         """Set the idle time-to-live (idle TTL) for the session in seconds."""
+         self.dataproc_config.environment_config.execution_config.idle_ttl = {
+             "seconds": seconds
+         }
+         return self
+
+     def sessionTemplate(self, template: str):
+         self.dataproc_config.session_template = template
+         return self
+
+     def label(self, key: str, value: str):
+         """Add a single label to the session."""
+         return self.labels({key: value})
+
+     def labels(self, labels: Dict[str, str]):
+         # Filter out system labels and warn user
+         filtered_labels = {}
+         for key, value in labels.items():
+             if key in SYSTEM_LABELS:
+                 logger.warning(
+                     f"Label '{key}' is a system label and cannot be overridden by user. Ignoring."
+                 )
+             else:
+                 filtered_labels[key] = value
+
+         self.dataproc_config.labels.update(filtered_labels)
+         return self

      def remote(self, url: Optional[str] = None) -> "SparkSession.Builder":
          if url:
@@ -258,8 +330,7 @@
              client_options=self._client_options
          ).create_session(session_request)
          self._display_session_link_on_creation(session_id)
-         # TODO: Add the 'View Session Details' button once the UI changes are done.
-         # self._display_view_session_details_button(session_id)
+         self._display_view_session_details_button(session_id)
          create_session_pbar_thread.start()
          session_response: Session = operation.result(
              polling=retry.Retry(
@@ -377,8 +448,7 @@
              print(
                  f"Using existing Dataproc Session (configuration changes may not be applied): https://console.cloud.google.com/dataproc/interactive/{self._region}/{s8s_session_id}?project={self._project_id}"
              )
-             # TODO: Add the 'View Session Details' button once the UI changes are done.
-             # self._display_view_session_details_button(s8s_session_id)
+             self._display_view_session_details_button(s8s_session_id)
              if session is None:
                  session = self.__create_spark_connect_session_from_s8s(
                      session_response, session_name
@@ -401,11 +471,10 @@
              return session

      def _get_dataproc_config(self):
-         dataproc_config = Session()
-         if self._dataproc_config:
-             dataproc_config = self._dataproc_config
-         for k, v in self._options.items():
-             dataproc_config.runtime_config.properties[k] = v
+         # Use the property to ensure we always have a config
+         dataproc_config = self.dataproc_config
+         for k, v in self._options.items():
+             dataproc_config.runtime_config.properties[k] = v
          dataproc_config.spark_connect_session = (
              sessions.SparkConnectConfig()
          )
@@ -413,6 +482,11 @@
              dataproc_config.runtime_config.version = (
                  DataprocSparkSession._DEFAULT_RUNTIME_VERSION
              )
+
+         # Check for Python version mismatch with runtime for UDF compatibility
+         self._check_python_version_compatibility(
+             dataproc_config.runtime_config.version
+         )
          if (
              not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
              and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
@@ -452,6 +526,10 @@
                  os.getenv("DATAPROC_SPARK_CONNECT_IDLE_TTL_SECONDS")
              )
          }
+         client_environment = environment.get_client_environment_label()
+         dataproc_config.labels["dataproc-session-client"] = (
+             client_environment
+         )
          if "COLAB_NOTEBOOK_ID" in os.environ:
              colab_notebook_name = os.environ["COLAB_NOTEBOOK_ID"]
              # Extract the last part of the path, which is the ID
@@ -466,34 +544,55 @@
                      f"Only lowercase letters, numbers, and dashes are allowed. "
                      f"The value must start with lowercase letter or number and end with a lowercase letter or number. "
                      f"Maximum length is 63 characters. "
-                     f"Skipping notebook ID label."
+                     f"Ignoring notebook ID label."
                  )
          default_datasource = os.getenv(
              "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
          )
-         if (
-             default_datasource
-             and dataproc_config.runtime_config.version == "2.3"
-         ):
-             if default_datasource == "bigquery":
-                 bq_datasource_properties = {
+         match default_datasource:
+             case "bigquery":
+                 # Merge default configs with existing properties,
+                 # user configs take precedence
+                 for k, v in {
                      "spark.datasource.bigquery.viewsEnabled": "true",
                      "spark.datasource.bigquery.writeMethod": "direct",
                      "spark.sql.catalog.spark_catalog": "com.google.cloud.spark.bigquery.BigQuerySparkSessionCatalog",
-                     "spark.sql.legacy.createHiveTableByDefault": "false",
                      "spark.sql.sources.default": "bigquery",
-                 }
-                 # Merge default configs with existing properties, user configs take precedence
-                 for k, v in bq_datasource_properties.items():
+                 }.items():
                      if k not in dataproc_config.runtime_config.properties:
                          dataproc_config.runtime_config.properties[k] = v
-             else:
-                 logger.warning(
-                     f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
-                     f" {default_datasource}. Supported value is 'bigquery'."
-                 )
+             case _:
+                 if default_datasource:
+                     logger.warning(
+                         f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
+                         f" {default_datasource}. Supported value is 'bigquery'."
+                     )
          return dataproc_config

+     def _check_python_version_compatibility(self, runtime_version):
+         """Check if client Python version matches server Python version for UDF compatibility."""
+         import sys
+         import warnings
+
+         # Runtime version to server Python version mapping
+         RUNTIME_PYTHON_MAP = {
+             "3.0": (3, 11),
+         }
+
+         client_python = sys.version_info[:2]  # (major, minor)
+
+         if runtime_version in RUNTIME_PYTHON_MAP:
+             server_python = RUNTIME_PYTHON_MAP[runtime_version]
+
+             if client_python != server_python:
+                 warnings.warn(
+                     f"Python version mismatch detected: Client is using Python {client_python[0]}.{client_python[1]}, "
+                     f"but Dataproc runtime {runtime_version} uses Python {server_python[0]}.{server_python[1]}. "
+                     f"This mismatch may cause issues with Python UDF (User Defined Function) compatibility. "
+                     f"Consider using Python {server_python[0]}.{server_python[1]} for optimal UDF execution.",
+                     stacklevel=3,
+                 )
+
      def _display_view_session_details_button(self, session_id):
          try:
              session_url = f"https://console.cloud.google.com/dataproc/interactive/sessions/{session_id}/locations/{self._region}?project={self._project_id}"
@@ -690,7 +789,7 @@
          This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws
          an exception.
          Regarding pypi: Popular packages are already pre-installed in s8s runtime.
-         https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.2#python_libraries
+         https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.3#python_libraries
          If there are conflicts/package doesn't exist, it throws an exception.
          """
          if sum([pypi, file, pyfile, archive]) > 1:
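
Taken together, the session.py additions replace hand-assembled Session protos with a fluent builder API. A hypothetical end-to-end sketch (the DataprocSparkSession.builder entry point, getOrCreate(), and all argument values are assumptions for illustration; only the method names come from this diff):

    # Hypothetical usage of the new builder methods; values are placeholders.
    import datetime
    from google.cloud.dataproc_spark_connect import DataprocSparkSession

    spark = (
        DataprocSparkSession.builder
        .runtimeVersion("3.0")
        .serviceAccount("sa-name@example-project.iam.gserviceaccount.com")
        .ttl(datetime.timedelta(hours=2))
        .idleTtlSeconds(900)
        .label("team", "analytics")
        .getOrCreate()
    )

Per the labels() change above, any key listed in SYSTEM_LABELS (for example dataproc-session-client) is dropped with a warning rather than applied to the session.
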
@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()
20
20
 
21
21
  setup(
22
22
  name="dataproc-spark-connect",
23
- version="0.8.3",
23
+ version="1.0.0rc1",
24
24
  description="Dataproc client library for Spark Connect",
25
25
  long_description=long_description,
26
26
  author="Google LLC",
@@ -31,7 +31,7 @@ setup(
31
31
  "google-api-core>=2.19",
32
32
  "google-cloud-dataproc>=5.18",
33
33
  "packaging>=20.0",
34
- "pyspark[connect]~=3.5.1",
34
+ "pyspark[connect]~=4.0.0",
35
35
  "tqdm>=4.67",
36
36
  "websockets>=14.0",
37
37
  ],