dataproc-spark-connect 1.0.0rc4-py2.py3-none-any.whl → 1.0.0rc6-py2.py3-none-any.whl
- {dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/METADATA +48 -1
- {dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/RECORD +8 -8
- google/cloud/dataproc_spark_connect/environment.py +4 -0
- google/cloud/dataproc_spark_connect/exceptions.py +1 -1
- google/cloud/dataproc_spark_connect/session.py +75 -9
- {dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/WHEEL +0 -0
- {dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/licenses/LICENSE +0 -0
- {dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/top_level.txt +0 -0
{dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 1.0.0rc4
+Version: 1.0.0rc6
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -76,6 +76,53 @@ environment variables:
 spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
 ```
 
+### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+**Installation**: To use magic commands, install the required dependencies manually:
+```bash
+pip install dataproc-spark-connect
+pip install IPython sparksql-magic
+```
+
+1. Load the magic extension:
+```python
+%load_ext sparksql_magic
+```
+
+2. Configure default settings (optional):
+```python
+%config SparkSql.limit=20
+```
+
+3. Execute SQL queries:
+```python
+%%sparksql
+SELECT * FROM your_table
+```
+
+4. Advanced usage with options:
+```python
+# Cache results and create a view
+%%sparksql --cache --view result_view df
+SELECT * FROM your_table WHERE condition = true
+```
+
+Available options:
+- `--cache` / `-c`: Cache the DataFrame
+- `--eager` / `-e`: Cache with eager loading
+- `--view VIEW` / `-v VIEW`: Create a temporary view
+- `--limit N` / `-l N`: Override default row display limit
+- `variable_name`: Store result in a variable
+
+See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+**Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+```bash
+pip install dataproc-spark-connect
+```
+
 ## Developing
 
 For development instructions see [guide](DEVELOPING.md).
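Taken together, the README additions above amount to the following notebook flow. This is a minimal sketch, not part of the package docs; `demo_table` is a placeholder name, and the magic lines are shown as comments so the block stays valid Python outside a notebook.

```python
# Cell 1: create a Dataproc Spark Connect session and a table to query.
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()
spark.range(5).createOrReplaceTempView("demo_table")

# Cell 2: load the magic and set a default display limit.
# %load_ext sparksql_magic
# %config SparkSql.limit=20

# Cell 3: query through the magic; the result is cached, stored in `df`,
# and exposed as the temporary view `result_view`.
# %%sparksql --cache --view result_view df
# SELECT id FROM demo_table WHERE id > 2
```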
{dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/RECORD RENAMED
@@ -1,13 +1,13 @@
-dataproc_spark_connect-1.0.
+dataproc_spark_connect-1.0.0rc6.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
 google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
-google/cloud/dataproc_spark_connect/environment.py,sha256=
-google/cloud/dataproc_spark_connect/exceptions.py,sha256=
+google/cloud/dataproc_spark_connect/environment.py,sha256=o5WRKI1vyIaxZ8S2UhtDer6pdi4CXYRzI9Xdpq5hVkQ,2771
+google/cloud/dataproc_spark_connect/exceptions.py,sha256=iwaHgNabcaxqquOpktGkOWKHMf8hgdPQJUgRnIbTXVs,970
 google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
-google/cloud/dataproc_spark_connect/session.py,sha256=
+google/cloud/dataproc_spark_connect/session.py,sha256=F_ryWRpwGC7Ul1ABJImZIBC3O6iVUTReUi7xz5uqlEo,53802
 google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
 google/cloud/dataproc_spark_connect/client/core.py,sha256=GRc4OCTBvIvdagjxOPoDO22vLtt8xDSerdREMRDeUBY,4659
 google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
-dataproc_spark_connect-1.0.
-dataproc_spark_connect-1.0.
-dataproc_spark_connect-1.0.
-dataproc_spark_connect-1.0.
+dataproc_spark_connect-1.0.0rc6.dist-info/METADATA,sha256=nwxT-Fe5CPPsF6rKUwXz7hN4LdEd4U78lfndqi1_FRg,4841
+dataproc_spark_connect-1.0.0rc6.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+dataproc_spark_connect-1.0.0rc6.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+dataproc_spark_connect-1.0.0rc6.dist-info/RECORD,,
google/cloud/dataproc_spark_connect/environment.py CHANGED
@@ -67,6 +67,10 @@ def is_interactive_terminal():
     return is_interactive() and is_terminal()
 
 
+def is_dataproc_batch() -> bool:
+    return os.getenv("DATAPROC_WORKLOAD_TYPE") == "batch"
+
+
 def get_client_environment_label() -> str:
     """
     Map current environment to a standardized client label.
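For context, a minimal sketch of what the new helper checks. `DATAPROC_WORKLOAD_TYPE` is set by the Dataproc runtime itself; setting it by hand here is only to illustrate the comparison.

```python
import os

# Simulate the environment a Dataproc batch workload runs in.
os.environ["DATAPROC_WORKLOAD_TYPE"] = "batch"

from google.cloud.dataproc_spark_connect import environment

# True only when the variable is exactly "batch"...
assert environment.is_dataproc_batch()

# ...and False for any other value (or when it is unset).
os.environ["DATAPROC_WORKLOAD_TYPE"] = "interactive"
assert not environment.is_dataproc_batch()
```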
google/cloud/dataproc_spark_connect/session.py CHANGED
@@ -472,6 +472,27 @@ class DataprocSparkSession(SparkSession):
             session_response, dataproc_config.name
         )
 
+    def _wait_for_session_available(
+        self, session_name: str, timeout: int = 300
+    ) -> Session:
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            try:
+                session = self.session_controller_client.get_session(
+                    name=session_name
+                )
+                if "Spark Connect Server" in session.runtime_info.endpoints:
+                    return session
+                time.sleep(5)
+            except Exception as e:
+                logger.warning(
+                    f"Error while polling for Spark Connect endpoint: {e}"
+                )
+                time.sleep(5)
+        raise RuntimeError(
+            f"Spark Connect endpoint not available for session {session_name} after {timeout} seconds."
+        )
+
     def _display_session_link_on_creation(self, session_id):
         session_url = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
         plain_message = f"Creating Dataproc Session: {session_url}"
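The contract here: `get_session` is polled every 5 seconds (errors are logged and retried) until the session's `runtime_info.endpoints` map contains a "Spark Connect Server" entry, and a `RuntimeError` is raised once the timeout budget is spent. The generic shape of that pattern, as a standalone sketch with the retry-on-error handling elided:

```python
import time

def wait_for_endpoint(get_session, session_name, timeout=300, interval=5):
    """Poll until the Spark Connect endpoint is published, then return.

    `get_session` stands in for the session controller client call; this is
    an illustrative reduction, not the package's actual method.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        session = get_session(name=session_name)
        if "Spark Connect Server" in session.runtime_info.endpoints:
            return session
        time.sleep(interval)  # endpoint not ready yet; try again shortly
    raise RuntimeError(
        f"No Spark Connect endpoint for {session_name} after {timeout} seconds"
    )
```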
@@ -537,6 +558,9 @@ class DataprocSparkSession(SparkSession):
         )
         self._display_view_session_details_button(s8s_session_id)
         if session is None:
+            session_response = self._wait_for_session_available(
+                session_name
+            )
             session = self.__create_spark_connect_session_from_s8s(
                 session_response, session_name
             )
@@ -552,6 +576,13 @@ class DataprocSparkSession(SparkSession):
 
     def getOrCreate(self) -> "DataprocSparkSession":
         with DataprocSparkSession._lock:
+            if environment.is_dataproc_batch():
+                # For Dataproc batch workloads, connect to the already initialized local SparkSession
+                from pyspark.sql import SparkSession as PySparkSQLSession
+
+                session = PySparkSQLSession.builder.getOrCreate()
+                return session  # type: ignore
+
             # Handle custom session ID by setting it early and letting existing logic handle it
             if self._custom_session_id:
                 self._handle_custom_session_id()
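The effect of that branch, sketched: inside a batch workload the builder short-circuits to the batch's own local SparkSession instead of provisioning a serverless Spark Connect session, so the same code runs in both contexts.

```python
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Inside a Dataproc batch workload (where DATAPROC_WORKLOAD_TYPE=batch),
# this now returns the workload's already-initialized local SparkSession.
spark = DataprocSparkSession.builder.getOrCreate()

# Code written against DataprocSparkSession therefore also runs unchanged
# when submitted as a batch job.
spark.sql("SELECT 1 AS ok").show()
```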
@@ -559,6 +590,13 @@ class DataprocSparkSession(SparkSession):
             session = self._get_exiting_active_session()
             if session is None:
                 session = self.__create()
+
+            # Register this session as the instantiated SparkSession for compatibility
+            # with tools and libraries that expect SparkSession._instantiatedSession
+            from pyspark.sql import SparkSession as PySparkSQLSession
+
+            PySparkSQLSession._instantiatedSession = session
+
             return session
 
     def _handle_custom_session_id(self):
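Why that registration matters, as a sketch: libraries that look up the "current" session through the plain PySpark API now observe the Dataproc session. `_instantiatedSession` is private PySpark state, referenced here only because the change itself uses it.

```python
from pyspark.sql import SparkSession
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()

# Tools that never import DataprocSparkSession can still find the session
# through the standard (private) PySpark slot.
assert SparkSession._instantiatedSession is spark
```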
@@ -593,20 +631,33 @@ class DataprocSparkSession(SparkSession):
             self._check_python_version_compatibility(
                 dataproc_config.runtime_config.version
             )
+
+            # Use local variable to improve readability of deeply nested attribute access
+            exec_config = dataproc_config.environment_config.execution_config
+
+            # Set service account from environment if not already set
             if (
-                not dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type
-                and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
-            ):
-                dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = AuthenticationConfig.AuthenticationType[
-                    os.getenv("DATAPROC_SPARK_CONNECT_AUTH_TYPE")
-                ]
-            if (
-                not dataproc_config.environment_config.execution_config.service_account
+                not exec_config.service_account
                 and "DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT" in os.environ
             ):
-                dataproc_config.environment_config.execution_config.service_account = os.getenv(
+                exec_config.service_account = os.getenv(
                     "DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT"
                 )
+
+            # Auto-set authentication type to SERVICE_ACCOUNT when service account is provided
+            if exec_config.service_account:
+                # When service account is provided, explicitly set auth type to SERVICE_ACCOUNT
+                exec_config.authentication_config.user_workload_authentication_type = (
+                    AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
+                )
+            elif (
+                not exec_config.authentication_config.user_workload_authentication_type
+                and "DATAPROC_SPARK_CONNECT_AUTH_TYPE" in os.environ
+            ):
+                # Only set auth type from environment if no service account is present
+                exec_config.authentication_config.user_workload_authentication_type = AuthenticationConfig.AuthenticationType[
+                    os.getenv("DATAPROC_SPARK_CONNECT_AUTH_TYPE")
+                ]
             if (
                 not dataproc_config.environment_config.execution_config.subnetwork_uri
                 and "DATAPROC_SPARK_CONNECT_SUBNET" in os.environ
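The resulting precedence, sketched with hypothetical values: a configured service account (set directly or via `DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT`) now forces `SERVICE_ACCOUNT` auth, and `DATAPROC_SPARK_CONNECT_AUTH_TYPE` is consulted only when no service account is present.

```python
import os

# Case 1: service account provided -> auth type is forced to
# SERVICE_ACCOUNT, even if the auth-type variable says otherwise.
os.environ["DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT"] = (
    "sa@my-project.iam.gserviceaccount.com"  # hypothetical account
)
os.environ["DATAPROC_SPARK_CONNECT_AUTH_TYPE"] = "END_USER_CREDENTIALS"  # ignored here

# Case 2: no service account -> the variable is honored and resolved via
# AuthenticationConfig.AuthenticationType[value] on the session config.
del os.environ["DATAPROC_SPARK_CONNECT_SERVICE_ACCOUNT"]
```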
@@ -673,6 +724,7 @@ class DataprocSparkSession(SparkSession):
                 f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
                 f" {default_datasource}. Supported value is 'bigquery'."
             )
+
         return dataproc_config
 
     def _check_python_version_compatibility(self, runtime_version):
@@ -1148,6 +1200,20 @@ class DataprocSparkSession(SparkSession):
             )
 
         self._remove_stopped_session_from_file()
+
+        # Clean up SparkSession._instantiatedSession if it points to this session
+        try:
+            from pyspark.sql import SparkSession as PySparkSQLSession
+
+            if PySparkSQLSession._instantiatedSession is self:
+                PySparkSQLSession._instantiatedSession = None
+                logger.debug(
+                    "Cleared SparkSession._instantiatedSession reference"
+                )
+        except (ImportError, AttributeError):
+            # PySpark not available or _instantiatedSession doesn't exist
+            pass
+
         DataprocSparkSession._active_s8s_session_uuid = None
         DataprocSparkSession._active_s8s_session_id = None
         DataprocSparkSession._active_session_uses_custom_id = False
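This is the counterpart to the registration in `getOrCreate()`. A sketch of the symmetry, assuming the cleanup path runs on `stop()`: a later `getOrCreate()` starts from a clean slate rather than a dangling reference, and a different live session's registration is left untouched.

```python
from pyspark.sql import SparkSession
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()
spark.stop()

# stop() clears the global handle it registered, but only if it still
# points at this very session.
assert SparkSession._instantiatedSession is None
```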
{dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/WHEEL RENAMED
File without changes

{dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/licenses/LICENSE RENAMED
File without changes

{dataproc_spark_connect-1.0.0rc4.dist-info → dataproc_spark_connect-1.0.0rc6.dist-info}/top_level.txt RENAMED
File without changes