dataproc-spark-connect 1.0.0rc4__py2.py3-none-any.whl → 1.0.0rc6__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 1.0.0rc4
+ Version: 1.0.0rc6
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
@@ -76,6 +76,53 @@ environment variables:
  spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
  ```

+ ### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+ The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+ **Installation**: To use magic commands, install the required dependencies manually:
+ ```bash
+ pip install dataproc-spark-connect
+ pip install IPython sparksql-magic
+ ```
+
+ 1. Load the magic extension:
+ ```python
+ %load_ext sparksql_magic
+ ```
+
+ 2. Configure default settings (optional):
+ ```python
+ %config SparkSql.limit=20
+ ```
+
+ 3. Execute SQL queries:
+ ```python
+ %%sparksql
+ SELECT * FROM your_table
+ ```
+
+ 4. Advanced usage with options:
+ ```python
+ # Cache results and create a view
+ %%sparksql --cache --view result_view df
+ SELECT * FROM your_table WHERE condition = true
+ ```
+
+ Available options:
+ - `--cache` / `-c`: Cache the DataFrame
+ - `--eager` / `-e`: Cache with eager loading
+ - `--view VIEW` / `-v VIEW`: Create a temporary view
+ - `--limit N` / `-l N`: Override default row display limit
+ - `variable_name`: Store result in a variable
+
+ See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+ **Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+ ```bash
+ pip install dataproc-spark-connect
+ ```
+
  ## Developing

  For development instructions see [guide](DEVELOPING.md).
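
The added README section, together with the existing `DataprocSparkSession` example above it, describes a complete notebook workflow. A minimal sketch of how the two pieces fit together, assuming the package's top-level `DataprocSparkSession` import and placeholder table names:

```python
# Hypothetical notebook cells; table and view names are placeholders.
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Cell 1: create (or attach to) a Dataproc Spark Connect session.
spark = DataprocSparkSession.builder.getOrCreate()

# Cell 2: load the magic extension (requires `pip install IPython sparksql-magic`).
# %load_ext sparksql_magic

# Cell 3: run SQL through the same session; --view registers a temporary view
# and --limit caps how many rows are displayed.
# %%sparksql --view result_view --limit 10
# SELECT * FROM your_table
```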
@@ -1,13 +1,13 @@
1
- dataproc_spark_connect-1.0.0rc4.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
1
+ dataproc_spark_connect-1.0.0rc6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
2
2
  google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
3
- google/cloud/dataproc_spark_connect/environment.py,sha256=H4KcT-_X64oKlQ9vFhfoRSh5JrmyHgFGCeo8UOAztiM,2678
4
- google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
3
+ google/cloud/dataproc_spark_connect/environment.py,sha256=o5WRKI1vyIaxZ8S2UhtDer6pdi4CXYRzI9Xdpq5hVkQ,2771
4
+ google/cloud/dataproc_spark_connect/exceptions.py,sha256=iwaHgNabcaxqquOpktGkOWKHMf8hgdPQJUgRnIbTXVs,970
5
5
  google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
6
- google/cloud/dataproc_spark_connect/session.py,sha256=RDOnjzhyI1bI_Hf00bddGcBXOnuIHzv7AhlK8fFvYIY,50873
6
+ google/cloud/dataproc_spark_connect/session.py,sha256=F_ryWRpwGC7Ul1ABJImZIBC3O6iVUTReUi7xz5uqlEo,53802
7
7
  google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
8
8
  google/cloud/dataproc_spark_connect/client/core.py,sha256=GRc4OCTBvIvdagjxOPoDO22vLtt8xDSerdREMRDeUBY,4659
9
9
  google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
10
- dataproc_spark_connect-1.0.0rc4.dist-info/METADATA,sha256=bXOK3plGsUla_ugMQjJ9GDiQz1qey0GcApVg4yjls4k,3468
11
- dataproc_spark_connect-1.0.0rc4.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
12
- dataproc_spark_connect-1.0.0rc4.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
13
- dataproc_spark_connect-1.0.0rc4.dist-info/RECORD,,
10
+ dataproc_spark_connect-1.0.0rc6.dist-info/METADATA,sha256=nwxT-Fe5CPPsF6rKUwXz7hN4LdEd4U78lfndqi1_FRg,4841
11
+ dataproc_spark_connect-1.0.0rc6.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
12
+ dataproc_spark_connect-1.0.0rc6.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
13
+ dataproc_spark_connect-1.0.0rc6.dist-info/RECORD,,
@@ -67,6 +67,10 @@ def is_interactive_terminal():
      return is_interactive() and is_terminal()


+ def is_dataproc_batch() -> bool:
+     return os.getenv("DATAPROC_WORKLOAD_TYPE") == "batch"
+
+
  def get_client_environment_label() -> str:
      """
      Map current environment to a standardized client label.
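
The new `environment.py` helper is a single environment-variable check. A small sketch of the behavior it encodes (the variable name and value come from the diff; the rest is illustrative):

```python
import os

def is_dataproc_batch() -> bool:
    # The builder treats the workload as a Dataproc batch only when
    # DATAPROC_WORKLOAD_TYPE is exactly "batch".
    return os.getenv("DATAPROC_WORKLOAD_TYPE") == "batch"

os.environ["DATAPROC_WORKLOAD_TYPE"] = "batch"
assert is_dataproc_batch()

os.environ["DATAPROC_WORKLOAD_TYPE"] = "interactive"  # any other value
assert not is_dataproc_batch()
```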
@@ -24,4 +24,4 @@ class DataprocSparkConnectException(Exception):
          super().__init__(message)

      def _render_traceback_(self):
-         return self.message
+         return [self.message]
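
This one-character change in `exceptions.py` matters because, as I understand IPython's traceback hook, the value returned by `_render_traceback_()` is joined line by line: a bare string renders one character per line, while a single-element list prints the message intact. A self-contained sketch of the difference:

```python
# Illustrative only: IPython joins the returned value with newlines, so a list
# of strings renders as lines while a bare string is split into characters.
class DataprocSparkConnectException(Exception):
    def __init__(self, message):
        self.message = message
        super().__init__(message)

    def _render_traceback_(self):
        return [self.message]  # rc6 behavior: one line per list element

exc = DataprocSparkConnectException("session creation failed")
print("\n".join(exc._render_traceback_()))       # -> session creation failed
print("\n".join("session creation failed"[:4]))  # old behavior: s / e / s / s
```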
@@ -472,6 +472,27 @@ class DataprocSparkSession(SparkSession):
              session_response, dataproc_config.name
          )

+     def _wait_for_session_available(
+         self, session_name: str, timeout: int = 300
+     ) -> Session:
+         start_time = time.time()
+         while time.time() - start_time < timeout:
+             try:
+                 session = self.session_controller_client.get_session(
+                     name=session_name
+                 )
+                 if "Spark Connect Server" in session.runtime_info.endpoints:
+                     return session
+                 time.sleep(5)
+             except Exception as e:
+                 logger.warning(
+                     f"Error while polling for Spark Connect endpoint: {e}"
+                 )
+                 time.sleep(5)
+         raise RuntimeError(
+             f"Spark Connect endpoint not available for session {session_name} after {timeout} seconds."
+         )
+
      def _display_session_link_on_creation(self, session_id):
          session_url = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
          plain_message = f"Creating Dataproc Session: {session_url}"
@@ -537,6 +558,9 @@ class DataprocSparkSession(SparkSession):
              )
              self._display_view_session_details_button(s8s_session_id)
              if session is None:
+                 session_response = self._wait_for_session_available(
+                     session_name
+                 )
                  session = self.__create_spark_connect_session_from_s8s(
                      session_response, session_name
                  )
@@ -552,6 +576,13 @@ class DataprocSparkSession(SparkSession):

      def getOrCreate(self) -> "DataprocSparkSession":
          with DataprocSparkSession._lock:
+             if environment.is_dataproc_batch():
+                 # For Dataproc batch workloads, connect to the already initialized local SparkSession
+                 from pyspark.sql import SparkSession as PySparkSQLSession
+
+                 session = PySparkSQLSession.builder.getOrCreate()
+                 return session  # type: ignore
+
              # Handle custom session ID by setting it early and letting existing logic handle it
              if self._custom_session_id:
                  self._handle_custom_session_id()
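
With the `is_dataproc_batch` check wired into `getOrCreate`, the same builder call now behaves differently inside a batch workload: it returns the already-initialized local PySpark session instead of provisioning a Spark Connect session. A hedged illustration, assuming `DATAPROC_WORKLOAD_TYPE` is set by the batch runtime as the helper expects:

```python
import os
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# The call site is identical in both environments; only the builder's
# behavior changes based on DATAPROC_WORKLOAD_TYPE.
spark = DataprocSparkSession.builder.getOrCreate()

if os.getenv("DATAPROC_WORKLOAD_TYPE") == "batch":
    # Batch workload: `spark` is pyspark's local SparkSession.
    print("running inside a Dataproc batch:", type(spark).__name__)
else:
    # Interactive use: `spark` is a Spark Connect-backed DataprocSparkSession.
    print("interactive Dataproc session:", type(spark).__name__)
```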
@@ -559,6 +590,13 @@ class DataprocSparkSession(SparkSession):
              session = self._get_exiting_active_session()
              if session is None:
                  session = self.__create()
+
+             # Register this session as the instantiated SparkSession for compatibility
+             # with tools and libraries that expect SparkSession._instantiatedSession
+             from pyspark.sql import SparkSession as PySparkSQLSession
+
+             PySparkSQLSession._instantiatedSession = session
+
              return session

      def _handle_custom_session_id(self):
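
Registering the returned session on `SparkSession._instantiatedSession` makes the Dataproc session visible to code that reads that private PySpark attribute directly. In effect, the change establishes this invariant after `getOrCreate()` (a sketch, not part of the package):

```python
from pyspark.sql import SparkSession
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()

# Tools that inspect the private attribute (rather than calling
# SparkSession.getActiveSession()) now see the Dataproc session.
assert SparkSession._instantiatedSession is spark
```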
@@ -593,20 +631,33 @@ class DataprocSparkSession(SparkSession):
          self._check_python_version_compatibility(
              dataproc_config.runtime_config.version
          )
+
+         # Use local variable to improve readability of deeply nested attribute access
+         exec_config = dataproc_config.environment_config.execution_config
+
+         # Set service account from environment if not already set
          if (
-             not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
-             and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
-         ):
-             dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = AuthenticationConfig.AuthenticationType[
-                 os.getenv("DATAPROC_SPARK_CONNECT_AUTH_TYPE")
-             ]
-         if (
-             not dataproc_config.environment_config.execution_config.service_account
+             not exec_config.service_account
              and "DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT" in os.environ
          ):
-             dataproc_config.environment_config.execution_config.service_account = os.getenv(
+             exec_config.service_account = os.getenv(
                  "DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT"
              )
+
+         # Auto-set authentication type to SERVICE_ACCOUNT when service account is provided
+         if exec_config.service_account:
+             # When service account is provided, explicitly set auth type to SERVICE_ACCOUNT
+             exec_config.authentication_config.user_workload_authentication_type = (
+                 AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
+             )
+         elif (
+             not exec_config.authentication_config.user_workload_authentication_type
+             and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
+         ):
+             # Only set auth type from environment if no service account is present
+             exec_config.authentication_config.user_workload_authentication_type = AuthenticationConfig.AuthenticationType[
+                 os.getenv("DATAPROC_SPARK_CONNECT_AUTH_TYPE")
+             ]
          if (
              not dataproc_config.environment_config.execution_config.subnetwork_uri
              and "DATAPROC_SPARK_CONNECT_SUBNET" in os.environ
@@ -673,6 +724,7 @@ class DataprocSparkSession(SparkSession):
                  f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
                  f" {default_datasource}. Supported value is 'bigquery'."
              )
+
          return dataproc_config

      def _check_python_version_compatibility(self, runtime_version):
@@ -1148,6 +1200,20 @@ class DataprocSparkSession(SparkSession):
              )

          self._remove_stopped_session_from_file()
+
+         # Clean up SparkSession._instantiatedSession if it points to this session
+         try:
+             from pyspark.sql import SparkSession as PySparkSQLSession
+
+             if PySparkSQLSession._instantiatedSession is self:
+                 PySparkSQLSession._instantiatedSession = None
+                 logger.debug(
+                     "Cleared SparkSession._instantiatedSession reference"
+                 )
+         except (ImportError, AttributeError):
+             # PySpark not available or _instantiatedSession doesn't exist
+             pass
+
          DataprocSparkSession._active_s8s_session_uuid = None
          DataprocSparkSession._active_s8s_session_id = None
          DataprocSparkSession._active_session_uses_custom_id = False
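
The teardown mirrors the registration in `getOrCreate`: stopping a session clears `SparkSession._instantiatedSession` only when it still points at this instance, so a session registered later by other code is left untouched. A small sketch of that identity guard in isolation:

```python
from typing import Optional

class Registry:
    # Stand-in for pyspark's SparkSession._instantiatedSession class attribute.
    instantiated: Optional[object] = None

def clear_if_owner(session: object) -> None:
    # Only drop the shared reference when it still points at the session being stopped.
    if Registry.instantiated is session:
        Registry.instantiated = None

ours, theirs = object(), object()
Registry.instantiated = theirs
clear_if_owner(ours)             # not ours: reference left untouched
assert Registry.instantiated is theirs
Registry.instantiated = ours
clear_if_owner(ours)             # ours: reference cleared
assert Registry.instantiated is None
```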