dataproc-spark-connect 1.0.0rc4-py2.py3-none-any.whl → 1.0.0rc6-py2.py3-none-any.whl
- {dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/METADATA +48 -1
- {dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/RECORD +8 -8
- google/cloud/dataproc_spark_connect/environment.py +4 -0
- google/cloud/dataproc_spark_connect/exceptions.py +1 -1
- google/cloud/dataproc_spark_connect/session.py +75 -9
- {dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/WHEEL +0 -0
- {dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/licenses/LICENSE +0 -0
- {dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/top_level.txt +0 -0
{dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 1.0.0rc4
+Version: 1.0.0rc6
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -76,6 +76,53 @@ environment variables:
 spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
 ```
 
+### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+**Installation**: To use magic commands, install the required dependencies manually:
+```bash
+pip install dataproc-spark-connect
+pip install IPython sparksql-magic
+```
+
+1. Load the magic extension:
+```python
+%load_ext sparksql_magic
+```
+
+2. Configure default settings (optional):
+```python
+%config SparkSql.limit=20
+```
+
+3. Execute SQL queries:
+```python
+%%sparksql
+SELECT * FROM your_table
+```
+
+4. Advanced usage with options:
+```python
+# Cache results and create a view
+%%sparksql --cache --view result_view df
+SELECT * FROM your_table WHERE condition = true
+```
+
+Available options:
+- `--cache` / `-c`: Cache the DataFrame
+- `--eager` / `-e`: Cache with eager loading
+- `--view VIEW` / `-v VIEW`: Create a temporary view
+- `--limit N` / `-l N`: Override default row display limit
+- `variable_name`: Store result in a variable
+
+See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+**Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+```bash
+pip install dataproc-spark-connect
+```
+
 ## Developing
 
 For development instructions see [guide](DEVELOPING.md).
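Taken together, the README additions above amount to the following notebook flow. This is a minimal sketch, not part of the package docs; `demo_table` is a placeholder name, and the magic lines are shown as comments so the block stays valid Python outside a notebook.

```python
# Cell 1: create a Dataproc Spark Connect session and a table to query.
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()
spark.range(5).createOrReplaceTempView("demo_table")

# Cell 2: load the magic and set a default display limit.
# %load_ext sparksql_magic
# %config SparkSql.limit=20

# Cell 3: query through the magic; the result is cached, stored in `df`,
# and exposed as the temporary view `result_view`.
# %%sparksql --cache --view result_view df
# SELECT id FROM demo_table WHERE id > 2
```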
{dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/RECORD RENAMED
@@ -1,13 +1,13 @@
-dataproc_spark_connect-1.0.
+dataproc_spark_connect-1.0.0rc6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
-google/cloud/dataproc_spark_connect/environment.py,sha256=
-google/cloud/dataproc_spark_connect/exceptions.py,sha256=
+google/cloud/dataproc_spark_connect/environment.py,sha256=o5WRKI1vyIaxZ8S2UhtDer6pdi4CXYRzI9Xdpq5hVkQ,2771
+google/cloud/dataproc_spark_connect/exceptions.py,sha256=iwaHgNabcaxqquOpktGkOWKHMf8hgdPQJUgRnIbTXVs,970
 google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
-google/cloud/dataproc_spark_connect/session.py,sha256=
+google/cloud/dataproc_spark_connect/session.py,sha256=F_ryWRpwGC7Ul1ABJImZIBC3O6iVUTReUi7xz5uqlEo,53802
 google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
 google/cloud/dataproc_spark_connect/client/core.py,sha256=GRc4OCTBvIvdagjxOPoDO22vLtt8xDSerdREMRDeUBY,4659
 google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
-dataproc_spark_connect-1.0.
-dataproc_spark_connect-1.0.
-dataproc_spark_connect-1.0.
-dataproc_spark_connect-1.0.
+dataproc_spark_connect-1.0.0rc6.dist-info/METADATA,sha256=nwxT-Fe5CPPsF6rKUwXz7hN4LdEd4U78lfndqi1_FRg,4841
+dataproc_spark_connect-1.0.0rc6.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+dataproc_spark_connect-1.0.0rc6.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+dataproc_spark_connect-1.0.0rc6.dist-info/RECORD,,
google/cloud/dataproc_spark_connect/environment.py CHANGED
@@ -67,6 +67,10 @@ def is_interactive_terminal():
     return is_interactive() and is_terminal()
 
 
+def is_dataproc_batch() -> bool:
+    return os.getenv("DATAPROC_WORKLOAD_TYPE") == "batch"
+
+
 def get_client_environment_label() -> str:
     """
     Map current environment to a standardized client label.
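For context, a minimal sketch of what the new helper checks. `DATAPROC_WORKLOAD_TYPE` is set by the Dataproc runtime itself; setting it by hand here is only to illustrate the comparison.

```python
import os

# Simulate the environment a Dataproc batch workload runs in.
os.environ["DATAPROC_WORKLOAD_TYPE"] = "batch"

from google.cloud.dataproc_spark_connect import environment

# True only when the variable is exactly "batch"...
assert environment.is_dataproc_batch()

# ...and False for any other value (or when it is unset).
os.environ["DATAPROC_WORKLOAD_TYPE"] = "interactive"
assert not environment.is_dataproc_batch()
```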
google/cloud/dataproc_spark_connect/session.py CHANGED
@@ -472,6 +472,27 @@ class DataprocSparkSession(SparkSession):
             session_response, dataproc_config.name
         )
 
+    def _wait_for_session_available(
+        self, session_name: str, timeout: int = 300
+    ) -> Session:
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            try:
+                session = self.session_controller_client.get_session(
+                    name=session_name
+                )
+                if "Spark Connect Server" in session.runtime_info.endpoints:
+                    return session
+                time.sleep(5)
+            except Exception as e:
+                logger.warning(
+                    f"Error while polling for Spark Connect endpoint: {e}"
+                )
+                time.sleep(5)
+        raise RuntimeError(
+            f"Spark Connect endpoint not available for session {session_name} after {timeout} seconds."
+        )
+
     def _display_session_link_on_creation(self, session_id):
         session_url = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
         plain_message = f"Creating Dataproc Session: {session_url}"
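The contract here: `get_session` is polled every 5 seconds (errors are logged and retried) until the session's `runtime_info.endpoints` map contains a "Spark Connect Server" entry, and a `RuntimeError` is raised once the timeout budget is spent. The generic shape of that pattern, as a standalone sketch with the retry-on-error handling elided:

```python
import time

def wait_for_endpoint(get_session, session_name, timeout=300, interval=5):
    """Poll until the Spark Connect endpoint is published, then return.

    `get_session` stands in for the session controller client call; this is
    an illustrative reduction, not the package's actual method.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        session = get_session(name=session_name)
        if "Spark Connect Server" in session.runtime_info.endpoints:
            return session
        time.sleep(interval)  # endpoint not ready yet; try again shortly
    raise RuntimeError(
        f"No Spark Connect endpoint for {session_name} after {timeout} seconds"
    )
```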
@@ -537,6 +558,9 @@ class DataprocSparkSession(SparkSession):
         )
         self._display_view_session_details_button(s8s_session_id)
         if session is None:
+            session_response = self._wait_for_session_available(
+                session_name
+            )
             session = self.__create_spark_connect_session_from_s8s(
                 session_response, session_name
             )
@@ -552,6 +576,13 @@ class DataprocSparkSession(SparkSession):
 
     def getOrCreate(self) -> "DataprocSparkSession":
         with DataprocSparkSession._lock:
+            if environment.is_dataproc_batch():
+                # For Dataproc batch workloads, connect to the already initialized local SparkSession
+                from pyspark.sql import SparkSession as PySparkSQLSession
+
+                session = PySparkSQLSession.builder.getOrCreate()
+                return session  # type: ignore
+
             # Handle custom session ID by setting it early and letting existing logic handle it
             if self._custom_session_id:
                 self._handle_custom_session_id()
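The effect of that branch, sketched: inside a batch workload the builder short-circuits to the batch's own local SparkSession instead of provisioning a serverless Spark Connect session, so the same code runs in both contexts.

```python
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Inside a Dataproc batch workload (where DATAPROC_WORKLOAD_TYPE=batch),
# this now returns the workload's already-initialized local SparkSession.
spark = DataprocSparkSession.builder.getOrCreate()

# Code written against DataprocSparkSession therefore also runs unchanged
# when submitted as a batch job.
spark.sql("SELECT 1 AS ok").show()
```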
@@ -559,6 +590,13 @@ class DataprocSparkSession(SparkSession):
             session = self._get_exiting_active_session()
             if session is None:
                 session = self.__create()
+
+            # Register this session as the instantiated SparkSession for compatibility
+            # with tools and libraries that expect SparkSession._instantiatedSession
+            from pyspark.sql import SparkSession as PySparkSQLSession
+
+            PySparkSQLSession._instantiatedSession = session
+
             return session
 
     def _handle_custom_session_id(self):
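Why that registration matters, as a sketch: libraries that look up the "current" session through the plain PySpark API now observe the Dataproc session. `_instantiatedSession` is private PySpark state, referenced here only because the change itself uses it.

```python
from pyspark.sql import SparkSession
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()

# Tools that never import DataprocSparkSession can still find the session
# through the standard (private) PySpark slot.
assert SparkSession._instantiatedSession is spark
```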
@@ -593,20 +631,33 @@ class DataprocSparkSession(SparkSession):
             self._check_python_version_compatibility(
                 dataproc_config.runtime_config.version
             )
+
+            # Use local variable to improve readability of deeply nested attribute access
+            exec_config = dataproc_config.environment_config.execution_config
+
+            # Set service account from environment if not already set
             if (
-                not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
-                and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
-            ):
-                dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = AuthenticationConfig.AuthenticationType[
-                    os.getenv("DATAPROC_SPARK_CONNECT_AUTH_TYPE")
-                ]
-            if (
-                not dataproc_config.environment_config.execution_config.service_account
+                not exec_config.service_account
                 and "DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT" in os.environ
             ):
-                dataproc_config.environment_config.execution_config.service_account = os.getenv(
+                exec_config.service_account = os.getenv(
                     "DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT"
                 )
+
+            # Auto-set authentication type to SERVICE_ACCOUNT when service account is provided
+            if exec_config.service_account:
+                # When service account is provided, explicitly set auth type to SERVICE_ACCOUNT
+                exec_config.authentication_config.user_workload_authentication_type = (
+                    AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
+                )
+            elif (
+                not exec_config.authentication_config.user_workload_authentication_type
+                and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
+            ):
+                # Only set auth type from environment if no service account is present
+                exec_config.authentication_config.user_workload_authentication_type = AuthenticationConfig.AuthenticationType[
+                    os.getenv("DATAPROC_SPARK_CONNECT_AUTH_TYPE")
+                ]
             if (
                 not dataproc_config.environment_config.execution_config.subnetwork_uri
                 and "DATAPROC_SPARK_CONNECT_SUBNET" in os.environ
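The resulting precedence, sketched with hypothetical values: a configured service account (set directly or via `DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT`) now forces `SERVICE_ACCOUNT` auth, and `DATAPROC_SPARK_CONNECT_AUTH_TYPE` is consulted only when no service account is present.

```python
import os

# Case 1: service account provided -> auth type is forced to
# SERVICE_ACCOUNT, even if the auth-type variable says otherwise.
os.environ["DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT"] = (
    "sa@my-project.iam.gserviceaccount.com"  # hypothetical account
)
os.environ["DATAPROC_SPARK_CONNECT_AUTH_TYPE"] = "END_USER_CREDENTIALS"  # ignored here

# Case 2: no service account -> the variable is honored and resolved via
# AuthenticationConfig.AuthenticationType[value] on the session config.
del os.environ["DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT"]
```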
@@ -673,6 +724,7 @@ class DataprocSparkSession(SparkSession):
                 f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
                 f" {default_datasource}. Supported value is 'bigquery'."
             )
+
         return dataproc_config
 
     def _check_python_version_compatibility(self, runtime_version):
@@ -1148,6 +1200,20 @@ class DataprocSparkSession(SparkSession):
             )
 
         self._remove_stopped_session_from_file()
+
+        # Clean up SparkSession._instantiatedSession if it points to this session
+        try:
+            from pyspark.sql import SparkSession as PySparkSQLSession
+
+            if PySparkSQLSession._instantiatedSession is self:
+                PySparkSQLSession._instantiatedSession = None
+                logger.debug(
+                    "Cleared SparkSession._instantiatedSession reference"
+                )
+        except (ImportError, AttributeError):
+            # PySpark not available or _instantiatedSession doesn't exist
+            pass
+
         DataprocSparkSession._active_s8s_session_uuid = None
         DataprocSparkSession._active_s8s_session_id = None
         DataprocSparkSession._active_session_uses_custom_id = False
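This is the counterpart to the registration in `getOrCreate()`. A sketch of the symmetry, assuming the cleanup path runs on `stop()`: a later `getOrCreate()` starts from a clean slate rather than a dangling reference, and a different live session's registration is left untouched.

```python
from pyspark.sql import SparkSession
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()
spark.stop()

# stop() clears the global handle it registered, but only if it still
# points at this very session.
assert SparkSession._instantiatedSession is None
```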
{dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/WHEEL RENAMED
File without changes

{dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/licenses/LICENSE RENAMED
File without changes

{dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/top_level.txt RENAMED
File without changes