dataproc-spark-connect 1.0.0rc5-py2.py3-none-any.whl → 1.0.0rc6-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 1.0.0rc5
+ Version: 1.0.0rc6
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
@@ -76,6 +76,53 @@ environment variables:
  spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
  ```

+ ### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+ The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+ **Installation**: To use magic commands, install the required dependencies manually:
+ ```bash
+ pip install dataproc-spark-connect
+ pip install IPython sparksql-magic
+ ```
+
+ 1. Load the magic extension:
+ ```python
+ %load_ext sparksql_magic
+ ```
+
+ 2. Configure default settings (optional):
+ ```python
+ %config SparkSql.limit=20
+ ```
+
+ 3. Execute SQL queries:
+ ```python
+ %%sparksql
+ SELECT * FROM your_table
+ ```
+
+ 4. Advanced usage with options:
+ ```python
+ # Cache results and create a view
+ %%sparksql --cache --view result_view df
+ SELECT * FROM your_table WHERE condition = true
+ ```
+
+ Available options:
+ - `--cache` / `-c`: Cache the DataFrame
+ - `--eager` / `-e`: Cache with eager loading
+ - `--view VIEW` / `-v VIEW`: Create a temporary view
+ - `--limit N` / `-l N`: Override default row display limit
+ - `variable_name`: Store result in a variable
+
+ See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+ **Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+ ```bash
+ pip install dataproc-spark-connect
+ ```
+
  ## Developing

  For development instructions see [guide](DEVELOPING.md).
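
For readers trying out the new README section, here is a minimal notebook-style sketch that ties the documented pieces together. The `DataprocSparkSession` import path is assumed from the package layout in RECORD, and the table name `my_table`, view name `my_view`, and variable `result_df` are illustrative, not taken from the package documentation:

```python
# Cell 1: create the Dataproc Spark Connect session.
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()

# Cell 2: load the magic and cap how many rows get displayed by default.
%load_ext sparksql_magic
%config SparkSql.limit=20

# Cell 3 (its own cell, since %%sparksql must be the first line): cache the
# result eagerly, expose it as the temp view `my_view`, and keep the DataFrame
# in `result_df` for follow-up work in plain Python.
%%sparksql --eager --view my_view result_df
SELECT * FROM my_table
```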
@@ -1,13 +1,13 @@
- dataproc_spark_connect-1.0.0rc5.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ dataproc_spark_connect-1.0.0rc6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
  google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
- google/cloud/dataproc_spark_connect/environment.py,sha256=H4KcT-_X64oKlQ9vFhfoRSh5JrmyHgFGCeo8UOAztiM,2678
- google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
+ google/cloud/dataproc_spark_connect/environment.py,sha256=o5WRKI1vyIaxZ8S2UhtDer6pdi4CXYRzI9Xdpq5hVkQ,2771
+ google/cloud/dataproc_spark_connect/exceptions.py,sha256=iwaHgNabcaxqquOpktGkOWKHMf8hgdPQJUgRnIbTXVs,970
  google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
- google/cloud/dataproc_spark_connect/session.py,sha256=e1Z3xpjgZimcaYVrxzFhlMnhWmyxp7v7TTltuQqjhbA,51461
+ google/cloud/dataproc_spark_connect/session.py,sha256=F_ryWRpwGC7Ul1ABJImZIBC3O6iVUTReUi7xz5uqlEo,53802
  google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
  google/cloud/dataproc_spark_connect/client/core.py,sha256=GRc4OCTBvIvdagjxOPoDO22vLtt8xDSerdREMRDeUBY,4659
  google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
- dataproc_spark_connect-1.0.0rc5.dist-info/METADATA,sha256=sLRphUFOBZYU8T7h4IgDkGs8EvhqZ1Fm5FMw5-SWi2A,3468
- dataproc_spark_connect-1.0.0rc5.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
- dataproc_spark_connect-1.0.0rc5.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
- dataproc_spark_connect-1.0.0rc5.dist-info/RECORD,,
+ dataproc_spark_connect-1.0.0rc6.dist-info/METADATA,sha256=nwxT-Fe5CPPsF6rKUwXz7hN4LdEd4U78lfndqi1_FRg,4841
+ dataproc_spark_connect-1.0.0rc6.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+ dataproc_spark_connect-1.0.0rc6.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-1.0.0rc6.dist-info/RECORD,,
@@ -67,6 +67,10 @@ def is_interactive_terminal():
      return is_interactive() and is_terminal()


+ def is_dataproc_batch() -> bool:
+     return os.getenv("DATAPROC_WORKLOAD_TYPE") == "batch"
+
+
  def get_client_environment_label() -> str:
      """
      Map current environment to a standardized client label.
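
The new `is_dataproc_batch()` helper keys purely on an environment variable, so it is easy to exercise in isolation. A small sketch, assuming the module is importable as `google.cloud.dataproc_spark_connect.environment` (the path listed in RECORD); the variable is set by hand here only to simulate a batch workload:

```python
import os

from google.cloud.dataproc_spark_connect import environment

# On an actual Dataproc batch workload the platform is expected to set this;
# we set it manually just to exercise the helper.
os.environ["DATAPROC_WORKLOAD_TYPE"] = "batch"
print(environment.is_dataproc_batch())  # True

del os.environ["DATAPROC_WORKLOAD_TYPE"]
print(environment.is_dataproc_batch())  # False
```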
@@ -24,4 +24,4 @@ class DataprocSparkConnectException(Exception):
          super().__init__(message)

      def _render_traceback_(self):
-         return self.message
+         return [self.message]
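
Context for this change: IPython lets an exception supply its own traceback through a `_render_traceback_` method and expects a list of strings, one per output line; returning the bare string meant IPython joined it character by character. A standalone sketch of the corrected pattern, deliberately using a throwaway class name rather than restating the package's own code:

```python
class FriendlyError(Exception):
    """Illustrative only; mirrors the pattern in DataprocSparkConnectException."""

    def __init__(self, message):
        self.message = message
        super().__init__(message)

    def _render_traceback_(self):
        # IPython joins the returned sequence with newlines, so this must be a
        # list of lines rather than a single string.
        return [self.message]
```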
@@ -472,6 +472,27 @@ class DataprocSparkSession(SparkSession):
              session_response, dataproc_config.name
          )

+     def _wait_for_session_available(
+         self, session_name: str, timeout: int = 300
+     ) -> Session:
+         start_time = time.time()
+         while time.time() - start_time < timeout:
+             try:
+                 session = self.session_controller_client.get_session(
+                     name=session_name
+                 )
+                 if "Spark Connect Server" in session.runtime_info.endpoints:
+                     return session
+                 time.sleep(5)
+             except Exception as e:
+                 logger.warning(
+                     f"Error while polling for Spark Connect endpoint: {e}"
+                 )
+                 time.sleep(5)
+         raise RuntimeError(
+             f"Spark Connect endpoint not available for session {session_name} after {timeout} seconds."
+         )
+
      def _display_session_link_on_creation(self, session_id):
          session_url = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
          plain_message = f"Creating Dataproc Session: {session_url}"
@@ -537,6 +558,9 @@ class DataprocSparkSession(SparkSession):
              )
              self._display_view_session_details_button(s8s_session_id)
              if session is None:
+                 session_response = self._wait_for_session_available(
+                     session_name
+                 )
                  session = self.__create_spark_connect_session_from_s8s(
                      session_response, session_name
                  )
@@ -552,6 +576,13 @@ class DataprocSparkSession(SparkSession):

      def getOrCreate(self) -> "DataprocSparkSession":
          with DataprocSparkSession._lock:
+             if environment.is_dataproc_batch():
+                 # For Dataproc batch workloads, connect to the already initialized local SparkSession
+                 from pyspark.sql import SparkSession as PySparkSQLSession
+
+                 session = PySparkSQLSession.builder.getOrCreate()
+                 return session  # type: ignore
+
              # Handle custom session ID by setting it early and letting existing logic handle it
              if self._custom_session_id:
                  self._handle_custom_session_id()
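
The practical effect of the new branch in `getOrCreate()`: inside a Dataproc batch workload, where the code checks for `DATAPROC_WORKLOAD_TYPE=batch`, the builder hands back the workload's already-initialized local SparkSession instead of provisioning a new interactive session. A hedged sketch of a batch driver script, with the import path assumed as in the earlier examples:

```python
# Driver script submitted as a Dataproc batch workload. No session config is
# needed: with DATAPROC_WORKLOAD_TYPE=batch present in the environment,
# getOrCreate() returns the workload's existing local SparkSession.
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()
spark.sql("SELECT 1 AS ok").show()
```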
@@ -559,6 +590,13 @@ class DataprocSparkSession(SparkSession):
              session = self._get_exiting_active_session()
              if session is None:
                  session = self.__create()
+
+             # Register this session as the instantiated SparkSession for compatibility
+             # with tools and libraries that expect SparkSession._instantiatedSession
+             from pyspark.sql import SparkSession as PySparkSQLSession
+
+             PySparkSQLSession._instantiatedSession = session
+
              return session

      def _handle_custom_session_id(self):
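
The registration above matters for third-party code that discovers the "current" session through PySpark's private `SparkSession._instantiatedSession` handle rather than through the object returned by `getOrCreate()`. A small check of the resulting behavior, with the import path assumed as before:

```python
from pyspark.sql import SparkSession

from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()

# After getOrCreate(), code that reaches for the classic private handle sees
# the same Dataproc-backed session object.
assert SparkSession._instantiatedSession is spark
```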
@@ -1162,6 +1200,20 @@ class DataprocSparkSession(SparkSession):
          )

          self._remove_stopped_session_from_file()
+
+         # Clean up SparkSession._instantiatedSession if it points to this session
+         try:
+             from pyspark.sql import SparkSession as PySparkSQLSession
+
+             if PySparkSQLSession._instantiatedSession is self:
+                 PySparkSQLSession._instantiatedSession = None
+                 logger.debug(
+                     "Cleared SparkSession._instantiatedSession reference"
+                 )
+         except (ImportError, AttributeError):
+             # PySpark not available or _instantiatedSession doesn't exist
+             pass
+
          DataprocSparkSession._active_s8s_session_uuid = None
          DataprocSparkSession._active_s8s_session_id = None
          DataprocSparkSession._active_session_uses_custom_id = False
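
And the counterpart from this final hunk, which, judging by the neighboring `_remove_stopped_session_from_file()` call, appears to sit on the session's shutdown path: stopping the session releases the private handle again, but only if it still points at this session. A short sketch under the same assumptions as the previous example:

```python
from pyspark.sql import SparkSession

from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()
spark.stop()

# The cleanup resets the handle only when it still references this session,
# so an unrelated locally created SparkSession would be left untouched.
assert SparkSession._instantiatedSession is None
```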