dataproc-spark-connect 1.0.0rc5__tar.gz → 1.0.0rc6__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/PKG-INFO +48 -1
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/README.md +47 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/dataproc_spark_connect.egg-info/PKG-INFO +48 -1
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/google/cloud/dataproc_spark_connect/environment.py +4 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/google/cloud/dataproc_spark_connect/exceptions.py +1 -1
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/google/cloud/dataproc_spark_connect/session.py +52 -0
- dataproc_spark_connect-1.0.0rc6/pyproject.toml +9 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/setup.py +1 -1
- dataproc_spark_connect-1.0.0rc5/pyproject.toml +0 -3
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/LICENSE +0 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/dataproc_spark_connect.egg-info/SOURCES.txt +0 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/dataproc_spark_connect.egg-info/requires.txt +0 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/google/cloud/dataproc_spark_connect/client/core.py +0 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
- {dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/setup.cfg +0 -0
{dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 1.0.0rc5
+Version: 1.0.0rc6
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -76,6 +76,53 @@ environment variables:
 spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
 ```
 
+### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+**Installation**: To use magic commands, install the required dependencies manually:
+```bash
+pip install dataproc-spark-connect
+pip install IPython sparksql-magic
+```
+
+1. Load the magic extension:
+```python
+%load_ext sparksql_magic
+```
+
+2. Configure default settings (optional):
+```python
+%config SparkSql.limit=20
+```
+
+3. Execute SQL queries:
+```python
+%%sparksql
+SELECT * FROM your_table
+```
+
+4. Advanced usage with options:
+```python
+# Cache results and create a view
+%%sparksql --cache --view result_view df
+SELECT * FROM your_table WHERE condition = true
+```
+
+Available options:
+- `--cache` / `-c`: Cache the DataFrame
+- `--eager` / `-e`: Cache with eager loading
+- `--view VIEW` / `-v VIEW`: Create a temporary view
+- `--limit N` / `-l N`: Override default row display limit
+- `variable_name`: Store result in a variable
+
+See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+**Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+```bash
+pip install dataproc-spark-connect
+```
+
 ## Developing
 
 For development instructions see [guide](DEVELOPING.md).
````
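The documented magic workflow assumes a Spark session already exists in the notebook. A hedged end-to-end sketch tying the two together (the import path is assumed from the package layout in the file list above; the table name is a placeholder):

```python
# Cell 1: create (or reuse) the Dataproc Spark Connect session.
from google.cloud.dataproc_spark_connect import DataprocSparkSession

spark = DataprocSparkSession.builder.getOrCreate()

# Cell 2 (notebook magics, shown here as comments):
# %load_ext sparksql_magic
# %config SparkSql.limit=20
# %%sparksql
# SELECT * FROM your_table
```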
{dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/README.md

````diff
@@ -54,6 +54,53 @@ environment variables:
 spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
 ```
 
+### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+**Installation**: To use magic commands, install the required dependencies manually:
+```bash
+pip install dataproc-spark-connect
+pip install IPython sparksql-magic
+```
+
+1. Load the magic extension:
+```python
+%load_ext sparksql_magic
+```
+
+2. Configure default settings (optional):
+```python
+%config SparkSql.limit=20
+```
+
+3. Execute SQL queries:
+```python
+%%sparksql
+SELECT * FROM your_table
+```
+
+4. Advanced usage with options:
+```python
+# Cache results and create a view
+%%sparksql --cache --view result_view df
+SELECT * FROM your_table WHERE condition = true
+```
+
+Available options:
+- `--cache` / `-c`: Cache the DataFrame
+- `--eager` / `-e`: Cache with eager loading
+- `--view VIEW` / `-v VIEW`: Create a temporary view
+- `--limit N` / `-l N`: Override default row display limit
+- `variable_name`: Store result in a variable
+
+See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+**Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+```bash
+pip install dataproc-spark-connect
+```
+
 ## Developing
 
 For development instructions see [guide](DEVELOPING.md).
````
{dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/dataproc_spark_connect.egg-info/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 1.0.0rc5
+Version: 1.0.0rc6
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -76,6 +76,53 @@ environment variables:
 spark = DataprocSparkSession.builder.dataprocSessionConfig(session_config).getOrCreate()
 ```
 
+### Using Spark SQL Magic Commands (Jupyter Notebooks)
+
+The package supports the [sparksql-magic](https://github.com/cryeo/sparksql-magic) library for executing Spark SQL queries directly in Jupyter notebooks.
+
+**Installation**: To use magic commands, install the required dependencies manually:
+```bash
+pip install dataproc-spark-connect
+pip install IPython sparksql-magic
+```
+
+1. Load the magic extension:
+```python
+%load_ext sparksql_magic
+```
+
+2. Configure default settings (optional):
+```python
+%config SparkSql.limit=20
+```
+
+3. Execute SQL queries:
+```python
+%%sparksql
+SELECT * FROM your_table
+```
+
+4. Advanced usage with options:
+```python
+# Cache results and create a view
+%%sparksql --cache --view result_view df
+SELECT * FROM your_table WHERE condition = true
+```
+
+Available options:
+- `--cache` / `-c`: Cache the DataFrame
+- `--eager` / `-e`: Cache with eager loading
+- `--view VIEW` / `-v VIEW`: Create a temporary view
+- `--limit N` / `-l N`: Override default row display limit
+- `variable_name`: Store result in a variable
+
+See [sparksql-magic](https://github.com/cryeo/sparksql-magic) for more examples.
+
+**Note**: Magic commands are optional. If you only need basic DataprocSparkSession functionality without Jupyter magic support, install only the base package:
+```bash
+pip install dataproc-spark-connect
+```
+
 ## Developing
 
 For development instructions see [guide](DEVELOPING.md).
````
{dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/google/cloud/dataproc_spark_connect/environment.py

```diff
@@ -67,6 +67,10 @@ def is_interactive_terminal():
     return is_interactive() and is_terminal()
 
 
+def is_dataproc_batch() -> bool:
+    return os.getenv("DATAPROC_WORKLOAD_TYPE") == "batch"
+
+
 def get_client_environment_label() -> str:
     """
     Map current environment to a standardized client label.
```
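A quick way to see the new helper's behavior is to toggle the environment variable it reads; a hypothetical test sketch (the module path is taken from the file list above):

```python
import os
from unittest import mock

# Module path as it appears in the file list above.
from google.cloud.dataproc_spark_connect import environment

# Simulate a Dataproc batch workload runtime.
with mock.patch.dict(os.environ, {"DATAPROC_WORKLOAD_TYPE": "batch"}):
    assert environment.is_dataproc_batch()

# Outside a batch workload the variable is absent (or set to something else).
with mock.patch.dict(os.environ, {}, clear=True):
    assert not environment.is_dataproc_batch()
```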
{dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/google/cloud/dataproc_spark_connect/session.py

```diff
@@ -472,6 +472,27 @@ class DataprocSparkSession(SparkSession):
             session_response, dataproc_config.name
         )
 
+    def _wait_for_session_available(
+        self, session_name: str, timeout: int = 300
+    ) -> Session:
+        start_time = time.time()
+        while time.time() - start_time < timeout:
+            try:
+                session = self.session_controller_client.get_session(
+                    name=session_name
+                )
+                if "Spark Connect Server" in session.runtime_info.endpoints:
+                    return session
+                time.sleep(5)
+            except Exception as e:
+                logger.warning(
+                    f"Error while polling for Spark Connect endpoint: {e}"
+                )
+                time.sleep(5)
+        raise RuntimeError(
+            f"Spark Connect endpoint not available for session {session_name} after {timeout} seconds."
+        )
+
     def _display_session_link_on_creation(self, session_id):
         session_url = f"https://console.cloud.google.com/dataproc/interactive/{self._region}/{session_id}?project={self._project_id}"
         plain_message = f"Creating Dataproc Session: {session_url}"
```
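The new method is a plain poll-with-timeout loop: fetch the Session resource, return it once the "Spark Connect Server" endpoint is published, otherwise sleep five seconds and retry, giving up after five minutes by default. A minimal standalone sketch of the same pattern (names here are hypothetical, not part of the package):

```python
import time


def wait_until(predicate, timeout: float = 300.0, interval: float = 5.0):
    """Poll `predicate` every `interval` seconds until it returns a truthy
    value; raise if `timeout` seconds elapse first."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        result = predicate()
        if result:
            return result
        time.sleep(interval)
    raise TimeoutError(f"condition not met after {timeout} seconds")
```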
session.py (continued):

```diff
@@ -537,6 +558,9 @@ class DataprocSparkSession(SparkSession):
        )
        self._display_view_session_details_button(s8s_session_id)
        if session is None:
+            session_response = self._wait_for_session_available(
+                session_name
+            )
            session = self.__create_spark_connect_session_from_s8s(
                session_response, session_name
            )
```
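This wiring appears intended to close a race in which the Session resource already exists but its Spark Connect endpoint has not yet been published: the client now blocks, up to the five-minute default, until the endpoint shows up before building the Spark Connect session from it.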
session.py (continued):

```diff
@@ -552,6 +576,13 @@ class DataprocSparkSession(SparkSession):
 
     def getOrCreate(self) -> "DataprocSparkSession":
         with DataprocSparkSession._lock:
+            if environment.is_dataproc_batch():
+                # For Dataproc batch workloads, connect to the already initialized local SparkSession
+                from pyspark.sql import SparkSession as PySparkSQLSession
+
+                session = PySparkSQLSession.builder.getOrCreate()
+                return session  # type: ignore
+
             # Handle custom session ID by setting it early and letting existing logic handle it
             if self._custom_session_id:
                 self._handle_custom_session_id()
```
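With this short-circuit plus the new `is_dataproc_batch` helper, the same entry-point code can run unchanged in a notebook and inside a batch workload. A hedged usage sketch (the import path is assumed from the package layout shown in the file list above):

```python
# Hedged usage sketch; import path assumed from the package layout.
from google.cloud.dataproc_spark_connect import DataprocSparkSession

# Interactively, this creates or reuses a Dataproc Spark Connect session.
# Inside a Dataproc batch workload (DATAPROC_WORKLOAD_TYPE=batch), it now
# returns the local SparkSession the batch runtime already initialized.
spark = DataprocSparkSession.builder.getOrCreate()
spark.sql("SELECT 1 AS ok").show()
```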
session.py (continued):

```diff
@@ -559,6 +590,13 @@ class DataprocSparkSession(SparkSession):
             session = self._get_exiting_active_session()
             if session is None:
                 session = self.__create()
+
+            # Register this session as the instantiated SparkSession for compatibility
+            # with tools and libraries that expect SparkSession._instantiatedSession
+            from pyspark.sql import SparkSession as PySparkSQLSession
+
+            PySparkSQLSession._instantiatedSession = session
+
             return session
 
     def _handle_custom_session_id(self):
```
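Registering the session under `SparkSession._instantiatedSession` is what lets session-discovering tools (sparksql-magic among them) find the Dataproc session instead of trying to build their own. A hypothetical illustration of the lookup such tools perform:

```python
from pyspark.sql import SparkSession

# Hypothetical sketch of how a third-party tool discovers the current
# session; with the change above this now yields the DataprocSparkSession.
current = SparkSession._instantiatedSession
if current is not None:
    current.sql("SELECT 1 AS ok").show()
```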
session.py (continued):

```diff
@@ -1162,6 +1200,20 @@ class DataprocSparkSession(SparkSession):
            )
 
        self._remove_stopped_session_from_file()
+
+       # Clean up SparkSession._instantiatedSession if it points to this session
+       try:
+           from pyspark.sql import SparkSession as PySparkSQLSession
+
+           if PySparkSQLSession._instantiatedSession is self:
+               PySparkSQLSession._instantiatedSession = None
+               logger.debug(
+                   "Cleared SparkSession._instantiatedSession reference"
+               )
+       except (ImportError, AttributeError):
+           # PySpark not available or _instantiatedSession doesn't exist
+           pass
+
        DataprocSparkSession._active_s8s_session_uuid = None
        DataprocSparkSession._active_s8s_session_id = None
        DataprocSparkSession._active_session_uses_custom_id = False
```
{dataproc_spark_connect-1.0.0rc5 → dataproc_spark_connect-1.0.0rc6}/setup.py

```diff
@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()
 
 setup(
     name="dataproc-spark-connect",
-    version="1.0.0rc5",
+    version="1.0.0rc6",
     description="Dataproc client library for Spark Connect",
     long_description=long_description,
     author="Google LLC",
```