dataproc-spark-connect 0.9.0__py2.py3-none-any.whl → 1.0.0rc2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataproc_spark_connect-0.9.0.dist-info → dataproc_spark_connect-1.0.0rc2.dist-info}/METADATA +2 -2
- dataproc_spark_connect-1.0.0rc2.dist-info/RECORD +13 -0
- google/cloud/dataproc_spark_connect/client/core.py +5 -3
- google/cloud/dataproc_spark_connect/environment.py +13 -0
- google/cloud/dataproc_spark_connect/session.py +96 -42
- dataproc_spark_connect-0.9.0.dist-info/RECORD +0 -13
- {dataproc_spark_connect-0.9.0.dist-info → dataproc_spark_connect-1.0.0rc2.dist-info}/WHEEL +0 -0
- {dataproc_spark_connect-0.9.0.dist-info → dataproc_spark_connect-1.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {dataproc_spark_connect-0.9.0.dist-info → dataproc_spark_connect-1.0.0rc2.dist-info}/top_level.txt +0 -0
{dataproc_spark_connect-0.9.0.dist-info → dataproc_spark_connect-1.0.0rc2.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 0.9.0
+Version: 1.0.0rc2
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -9,7 +9,7 @@ License-File: LICENSE
 Requires-Dist: google-api-core>=2.19
 Requires-Dist: google-cloud-dataproc>=5.18
 Requires-Dist: packaging>=20.0
-Requires-Dist: pyspark[connect]~=
+Requires-Dist: pyspark[connect]~=4.0.0
 Requires-Dist: tqdm>=4.67
 Requires-Dist: websockets>=14.0
 Dynamic: author
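Note on the dependency bump above: `pyspark[connect]~=4.0.0` is a compatible-release specifier, so pip will accept 4.0.x patch releases but not 4.1 or later. A quick, illustrative check using the `packaging` library (itself already a declared dependency of this wheel); the candidate versions below are arbitrary:

    from packaging.specifiers import SpecifierSet

    # "~=4.0.0" is shorthand for ">=4.0.0, ==4.0.*"
    spec = SpecifierSet("~=4.0.0")

    for candidate in ["4.0.0", "4.0.1", "4.1.0", "3.5.3"]:
        # Only the 4.0.x candidates satisfy the new constraint.
        print(candidate, spec.contains(candidate))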
dataproc_spark_connect-1.0.0rc2.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
+dataproc_spark_connect-1.0.0rc2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
+google/cloud/dataproc_spark_connect/environment.py,sha256=l1wWiHMHtBQ9YonE-kHTpaZlN9vLE4fyJSTn7RZP6kA,2503
+google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
+google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
+google/cloud/dataproc_spark_connect/session.py,sha256=FdJI_F9k6EfIvlgC1-f-Qb_Uwg9SmkIyWhpNZlqGQhw,40405
+google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
+google/cloud/dataproc_spark_connect/client/core.py,sha256=GRc4OCTBvIvdagjxOPoDO22vLtt8xDSerdREMRDeUBY,4659
+google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
+dataproc_spark_connect-1.0.0rc2.dist-info/METADATA,sha256=o2vfu5NRn2Pb0N7cavrBm2OLwP_LXQBVrclNjEtb9Do,3468
+dataproc_spark_connect-1.0.0rc2.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+dataproc_spark_connect-1.0.0rc2.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+dataproc_spark_connect-1.0.0rc2.dist-info/RECORD,,
google/cloud/dataproc_spark_connect/client/core.py
CHANGED
@@ -15,14 +15,14 @@ import logging

 import google
 import grpc
-from pyspark.sql.connect.client import
+from pyspark.sql.connect.client import DefaultChannelBuilder

 from . import proxy

 logger = logging.getLogger(__name__)


-class DataprocChannelBuilder(
+class DataprocChannelBuilder(DefaultChannelBuilder):
     """
     This is a helper class that is used to create a GRPC channel based on the given
     connection string per the documentation of Spark Connect.
@@ -88,7 +88,9 @@ class ProxiedChannel(grpc.Channel):
         self._proxy = proxy.DataprocSessionProxy(0, target_host)
         self._proxy.start()
         self._proxied_connect_url = f"sc://localhost:{self._proxy.port}"
-        self._wrapped =
+        self._wrapped = DefaultChannelBuilder(
+            self._proxied_connect_url
+        ).toChannel()

     def __enter__(self):
         return self
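For context on the core.py change above: the client now derives its gRPC channel from PySpark 4.0's `DefaultChannelBuilder`, both as the base class of `DataprocChannelBuilder` and when wrapping the local proxy endpoint. A minimal sketch of that pattern, assuming a hypothetical local Spark Connect endpoint on port 15002 (the real code substitutes the proxy's own port):

    from pyspark.sql.connect.client import DefaultChannelBuilder

    # Illustrative endpoint; ProxiedChannel points this at its local proxy instead.
    connect_url = "sc://localhost:15002"

    # DefaultChannelBuilder parses the sc:// URL; toChannel() yields a grpc.Channel
    # that the wrapper class can delegate to. No connection is made until first use.
    channel = DefaultChannelBuilder(connect_url).toChannel()
    print(type(channel))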
google/cloud/dataproc_spark_connect/environment.py
CHANGED
@@ -13,6 +13,7 @@
 # limitations under the License.

 import os
+import sys
 from typing import Callable, Tuple, List


@@ -46,6 +47,18 @@ def is_jetbrains_ide() -> bool:
     return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()


+def is_interactive():
+    return hasattr(sys, "ps1")
+
+
+def is_terminal():
+    return sys.stdin.isatty()
+
+
+def is_interactive_terminal():
+    return is_interactive() and is_terminal()
+
+
 def get_client_environment_label() -> str:
     """
     Map current environment to a standardized client label.
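The new environment.py helpers separate "running in an interactive interpreter" from "attached to a real terminal", which the session code later uses to pick a progress-bar flavor. A standalone sketch of the same checks (runnable outside the package; the names mirror the diff but the script itself is illustrative):

    import sys

    def is_interactive() -> bool:
        # A REPL or IPython prompt defines sys.ps1; a plain script does not.
        return hasattr(sys, "ps1")

    def is_terminal() -> bool:
        # True for a real TTY, False for piped stdin or a notebook kernel.
        return sys.stdin.isatty()

    if __name__ == "__main__":
        print(f"interactive={is_interactive()}, terminal={is_terminal()}")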
google/cloud/dataproc_spark_connect/session.py
CHANGED
@@ -24,8 +24,10 @@ import threading
 import time
 import uuid
 import tqdm
+from tqdm import tqdm as cli_tqdm
+from tqdm.notebook import tqdm as notebook_tqdm
 from types import MethodType
-from typing import Any, cast, ClassVar, Dict, Optional, Union
+from typing import Any, cast, ClassVar, Dict, Iterable, Optional, Union

 from google.api_core import retry
 from google.api_core.client_options import ClientOptions
@@ -103,13 +105,14 @@ class DataprocSparkSession(SparkSession):
         ... ) # doctest: +SKIP
     """

-    _DEFAULT_RUNTIME_VERSION = "
+    _DEFAULT_RUNTIME_VERSION = "3.0"

    _active_s8s_session_uuid: ClassVar[Optional[str]] = None
    _project_id = None
    _region = None
    _client_options = None
    _active_s8s_session_id: ClassVar[Optional[str]] = None
+    _execution_progress_bar = dict()

     class Builder(SparkSession.Builder):

@@ -158,19 +161,6 @@ class DataprocSparkSession(SparkSession):
             self.dataproc_config.environment_config.execution_config.service_account = (
                 account
             )
-            # Automatically set auth type to SERVICE_ACCOUNT when service account is provided
-            # This overrides any env var setting to simplify user experience
-            self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
-                AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
-            )
-            return self
-
-        def authType(
-            self, auth_type: "AuthenticationConfig.AuthenticationType"
-        ):
-            self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
-                auth_type
-            )
             return self

         def subnetwork(self, subnet: str):
@@ -181,10 +171,7 @@ class DataprocSparkSession(SparkSession):

         def ttl(self, duration: datetime.timedelta):
             """Set the time-to-live (TTL) for the session using a timedelta object."""
-            self.
-                "seconds": int(duration.total_seconds())
-            }
-            return self
+            return self.ttlSeconds(int(duration.total_seconds()))

         def ttlSeconds(self, seconds: int):
             """Set the time-to-live (TTL) for the session in seconds."""
@@ -195,10 +182,7 @@ class DataprocSparkSession(SparkSession):

         def idleTtl(self, duration: datetime.timedelta):
             """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
-            self.
-                "seconds": int(duration.total_seconds())
-            }
-            return self
+            return self.idleTtlSeconds(int(duration.total_seconds()))

         def idleTtlSeconds(self, seconds: int):
             """Set the idle time-to-live (idle TTL) for the session in seconds."""
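The two timedelta-based setters above now simply delegate to their seconds-based counterparts, so the following calls are equivalent. A hedged usage sketch, assuming the package's top-level `DataprocSparkSession` export and no other builder configuration:

    import datetime

    from google.cloud.dataproc_spark_connect import DataprocSparkSession

    builder = DataprocSparkSession.builder

    # Both lines set the same TTL field on the session config.
    builder.ttl(datetime.timedelta(hours=2))  # delegates to ttlSeconds(7200)
    builder.ttlSeconds(7200)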
@@ -266,6 +250,9 @@ class DataprocSparkSession(SparkSession):
             assert self._channel_builder is not None
             session = DataprocSparkSession(connection=self._channel_builder)

+            # Register handler for Cell Execution Progress bar
+            session._register_progress_execution_handler()
+
             DataprocSparkSession._set_default_and_active_session(session)
             return session

@@ -568,27 +555,24 @@ class DataprocSparkSession(SparkSession):
         default_datasource = os.getenv(
             "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
         )
-
-
-
-
-
-        bq_datasource_properties = {
+        match default_datasource:
+            case "bigquery":
+                # Merge default configs with existing properties,
+                # user configs take precedence
+                for k, v in {
                     "spark.datasource.bigquery.viewsEnabled": "true",
                     "spark.datasource.bigquery.writeMethod": "direct",
                     "spark.sql.catalog.spark_catalog": "com.google.cloud.spark.bigquery.BigQuerySparkSessionCatalog",
-                    "spark.sql.legacy.createHiveTableByDefault": "false",
                     "spark.sql.sources.default": "bigquery",
-        }
-        # Merge default configs with existing properties, user configs take precedence
-        for k, v in bq_datasource_properties.items():
+                }.items():
                     if k not in dataproc_config.runtime_config.properties:
                         dataproc_config.runtime_config.properties[k] = v
-
-
-
-
-
+            case _:
+                if default_datasource:
+                    logger.warning(
+                        f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
+                        f" {default_datasource}. Supported value is 'bigquery'."
+                    )
         return dataproc_config

     def _check_python_version_compatibility(self, runtime_version):
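The rewritten block above merges the BigQuery defaults into the runtime properties only where the user has not already set a key, so user configuration always wins. The same merge pattern in isolation (the property names come from the diff; the user_properties dict is made up):

    defaults = {
        "spark.datasource.bigquery.viewsEnabled": "true",
        "spark.datasource.bigquery.writeMethod": "direct",
        "spark.sql.sources.default": "bigquery",
    }

    # Pretend this came from the user's session configuration.
    user_properties = {"spark.datasource.bigquery.writeMethod": "indirect"}

    # Fill in only the keys the user has not set.
    for k, v in defaults.items():
        user_properties.setdefault(k, v)

    print(user_properties)  # writeMethod stays "indirect"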
@@ -598,9 +582,7 @@ class DataprocSparkSession(SparkSession):

         # Runtime version to server Python version mapping
         RUNTIME_PYTHON_MAP = {
-            "
-            "2.2": (3, 12),
-            "2.3": (3, 11),
+            "3.0": (3, 11),
         }

         client_python = sys.version_info[:2]  # (major, minor)
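The mapping above pins runtime 3.0 to server-side Python 3.11, and the surrounding method compares it against the client interpreter. A minimal sketch of that comparison, assuming only the mapping shown in the diff (the warning text is illustrative, not the library's actual message):

    import sys

    RUNTIME_PYTHON_MAP = {"3.0": (3, 11)}

    def check_python_compatibility(runtime_version: str) -> None:
        server_python = RUNTIME_PYTHON_MAP.get(runtime_version)
        client_python = sys.version_info[:2]  # (major, minor)
        if server_python and client_python != server_python:
            print(
                f"Client Python {client_python} differs from runtime "
                f"{runtime_version} Python {server_python}; UDFs may misbehave."
            )

    check_python_compatibility("3.0")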
@@ -706,6 +688,78 @@ class DataprocSparkSession(SparkSession):
             execute_and_fetch_as_iterator_wrapped_method, self.client
         )

+        # Patching clearProgressHandlers method to not remove Dataproc Progress Handler
+        clearProgressHandlers_base_method = self.clearProgressHandlers
+
+        def clearProgressHandlers_wrapper_method(_, *args, **kwargs):
+            clearProgressHandlers_base_method(*args, **kwargs)
+
+            self._register_progress_execution_handler()
+
+        self.clearProgressHandlers = MethodType(
+            clearProgressHandlers_wrapper_method, self
+        )
+
+    def _register_progress_execution_handler(self):
+        from pyspark.sql.connect.shell.progress import StageInfo
+
+        def handler(
+            stages: Optional[Iterable[StageInfo]],
+            inflight_tasks: int,
+            operation_id: Optional[str],
+            done: bool,
+        ):
+            if operation_id is None:
+                return
+
+            # Don't build / render progress bar for non-interactive (despite
+            # Ipython or non-IPython)
+            if not environment.is_interactive():
+                return
+
+            total_tasks = 0
+            completed_tasks = 0
+
+            for stage in stages or []:
+                total_tasks += stage.num_tasks
+                completed_tasks += stage.num_completed_tasks
+
+            tqdm_pbar = notebook_tqdm
+            if environment.is_interactive_terminal():
+                tqdm_pbar = cli_tqdm
+
+            # Use a lock to ensure only one thread can access and modify
+            # the shared dictionaries at a time.
+            with self._lock:
+                if operation_id in self._execution_progress_bar:
+                    pbar = self._execution_progress_bar[operation_id]
+                    if pbar.total != total_tasks:
+                        pbar.reset(
+                            total=total_tasks
+                        )  # This force resets the progress bar % too on next refresh
+                else:
+                    pbar = tqdm_pbar(
+                        total=total_tasks,
+                        leave=True,
+                        dynamic_ncols=True,
+                        bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} Tasks",
+                    )
+                    self._execution_progress_bar[operation_id] = pbar
+
+                # To handle skipped or failed tasks.
+                # StageInfo proto doesn't have skipped and failed tasks information to process.
+                if done and completed_tasks < total_tasks:
+                    completed_tasks = total_tasks
+
+                pbar.n = completed_tasks
+                pbar.refresh()
+
+                if done:
+                    pbar.close()
+                    self._execution_progress_bar.pop(operation_id, None)
+
+        self.registerProgressHandler(handler)
+
     @staticmethod
     def _sql_lazy_transformation(req):
         # Select SQL command
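The handler above keeps one tqdm bar per Spark Connect operation and drives it from StageInfo task counts, resetting the total when new stages appear and closing the bar when the operation finishes. A standalone sketch of the same tqdm update pattern without Spark (the operation id and task counts are simulated):

    import time
    from tqdm import tqdm

    bars = {}  # operation_id -> tqdm bar, mirroring _execution_progress_bar

    def update(operation_id: str, total: int, completed: int, done: bool) -> None:
        bar = bars.get(operation_id)
        if bar is None:
            bar = tqdm(total=total, leave=True,
                       bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} Tasks")
            bars[operation_id] = bar
        elif bar.total != total:
            bar.reset(total=total)  # total grows as more stages are submitted
        bar.n = total if done else completed  # treat skipped/failed tasks as done
        bar.refresh()
        if done:
            bar.close()
            bars.pop(operation_id, None)

    # Simulated progress updates for a single operation.
    for i in range(0, 101, 25):
        update("op-1", total=100, completed=i, done=(i == 100))
        time.sleep(0.1)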
@@ -813,7 +867,7 @@ class DataprocSparkSession(SparkSession):
         This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws
         an exception.
         Regarding pypi: Popular packages are already pre-installed in s8s runtime.
-        https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.
+        https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.3#python_libraries
         If there are conflicts/package doesn't exist, it throws an exception.
         """
         if sum([pypi, file, pyfile, archive]) > 1:
dataproc_spark_connect-0.9.0.dist-info/RECORD
DELETED
@@ -1,13 +0,0 @@
-dataproc_spark_connect-0.9.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
-google/cloud/dataproc_spark_connect/environment.py,sha256=UICy9XyqAxL-cryVWx7GZPRAxoir5LKk0dtqqY_l--c,2307
-google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
-google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
-google/cloud/dataproc_spark_connect/session.py,sha256=ELj5hDhofK1967eE5YaG_LP5B80KWFQWJn5gxi9yYt0,38577
-google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
-google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
-google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
-dataproc_spark_connect-0.9.0.dist-info/METADATA,sha256=1z8Ag1P_Lh9db0Rk9nGFoOu6sdeRs0UlrgtOqN_OhIQ,3465
-dataproc_spark_connect-0.9.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
-dataproc_spark_connect-0.9.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
-dataproc_spark_connect-0.9.0.dist-info/RECORD,,
{dataproc_spark_connect-0.9.0.dist-info → dataproc_spark_connect-1.0.0rc2.dist-info}/WHEEL
RENAMED
File without changes
{dataproc_spark_connect-0.9.0.dist-info → dataproc_spark_connect-1.0.0rc2.dist-info}/licenses/LICENSE
RENAMED
File without changes
{dataproc_spark_connect-0.9.0.dist-info → dataproc_spark_connect-1.0.0rc2.dist-info}/top_level.txt
RENAMED
File without changes