dataproc-spark-connect 0.9.0.tar.gz → 1.0.0rc1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/PKG-INFO +2 -2
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/PKG-INFO +2 -2
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/requires.txt +1 -1
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/client/core.py +5 -3
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/session.py +17 -41
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/setup.py +2 -2
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/LICENSE +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/README.md +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/SOURCES.txt +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/dependency_links.txt +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/top_level.txt +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/__init__.py +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/client/__init__.py +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/client/proxy.py +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/environment.py +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/exceptions.py +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/pypi_artifacts.py +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/pyproject.toml +0 -0
- {dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/setup.cfg +0 -0
{dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 0.9.0
+Version: 1.0.0rc1
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -9,7 +9,7 @@ License-File: LICENSE
 Requires-Dist: google-api-core>=2.19
 Requires-Dist: google-cloud-dataproc>=5.18
 Requires-Dist: packaging>=20.0
-Requires-Dist: pyspark[connect]~=
+Requires-Dist: pyspark[connect]~=4.0.0
 Requires-Dist: tqdm>=4.67
 Requires-Dist: websockets>=14.0
 Dynamic: author
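The `~=4.0.0` specifier is PEP 440's compatible-release operator: it admits any pyspark 4.0.x patch release but excludes 4.1 and later, which keeps the client on the PySpark line whose channel-builder API is used in the code changes below.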
{dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/dataproc_spark_connect.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dataproc-spark-connect
-Version: 0.9.0
+Version: 1.0.0rc1
 Summary: Dataproc client library for Spark Connect
 Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
 Author: Google LLC
@@ -9,7 +9,7 @@ License-File: LICENSE
 Requires-Dist: google-api-core>=2.19
 Requires-Dist: google-cloud-dataproc>=5.18
 Requires-Dist: packaging>=20.0
-Requires-Dist: pyspark[connect]~=
+Requires-Dist: pyspark[connect]~=4.0.0
 Requires-Dist: tqdm>=4.67
 Requires-Dist: websockets>=14.0
 Dynamic: author
{dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/client/core.py

@@ -15,14 +15,14 @@ import logging
 
 import google
 import grpc
-from pyspark.sql.connect.client import
+from pyspark.sql.connect.client import DefaultChannelBuilder
 
 from . import proxy
 
 logger = logging.getLogger(__name__)
 
 
-class DataprocChannelBuilder(
+class DataprocChannelBuilder(DefaultChannelBuilder):
     """
     This is a helper class that is used to create a GRPC channel based on the given
     connection string per the documentation of Spark Connect.
@@ -88,7 +88,9 @@ class ProxiedChannel(grpc.Channel):
         self._proxy = proxy.DataprocSessionProxy(0, target_host)
         self._proxy.start()
         self._proxied_connect_url = f"sc://localhost:{self._proxy.port}"
-        self._wrapped =
+        self._wrapped = DefaultChannelBuilder(
+            self._proxied_connect_url
+        ).toChannel()
 
     def __enter__(self):
         return self
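The only functional change here is the channel-builder symbol: with the pin moved to pyspark 4.0, the client now imports `DefaultChannelBuilder`. As a hedged sketch (the pre-4.0 name is cut off in the removed lines above and is assumed to be `ChannelBuilder`), code that must run against both PySpark lines could alias the import:

import_compat.py:

    # Sketch only: the pre-4.0 symbol name is an assumption, not shown in this diff.
    try:
        from pyspark.sql.connect.client import DefaultChannelBuilder  # PySpark 4.x
    except ImportError:
        from pyspark.sql.connect.client import (  # assumed older name
            ChannelBuilder as DefaultChannelBuilder,
        )

    # Either class builds a gRPC channel from a Spark Connect URL,
    # mirroring the ProxiedChannel change above:
    channel = DefaultChannelBuilder("sc://localhost:15002").toChannel()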
{dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/google/cloud/dataproc_spark_connect/session.py

@@ -103,7 +103,7 @@ class DataprocSparkSession(SparkSession):
     ... )  # doctest: +SKIP
     """
 
-    _DEFAULT_RUNTIME_VERSION = "
+    _DEFAULT_RUNTIME_VERSION = "3.0"
 
     _active_s8s_session_uuid: ClassVar[Optional[str]] = None
     _project_id = None
@@ -158,19 +158,6 @@ class DataprocSparkSession(SparkSession):
         self.dataproc_config.environment_config.execution_config.service_account = (
             account
         )
-        # Automatically set auth type to SERVICE_ACCOUNT when service account is provided
-        # This overrides any env var setting to simplify user experience
-        self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
-            AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
-        )
-        return self
-
-    def authType(
-        self, auth_type: "AuthenticationConfig.AuthenticationType"
-    ):
-        self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
-            auth_type
-        )
         return self
 
     def subnetwork(self, subnet: str):
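Taken together, this hunk removes two behaviours from the builder: the service-account setter no longer forces the auth type to `SERVICE_ACCOUNT`, and the `authType(...)` method is gone entirely, so 0.9.0 code that chained `.authType(...)` will fail with an `AttributeError` against 1.0.0rc1.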
@@ -181,10 +168,7 @@
 
     def ttl(self, duration: datetime.timedelta):
         """Set the time-to-live (TTL) for the session using a timedelta object."""
-        self.
-            "seconds": int(duration.total_seconds())
-        }
-        return self
+        return self.ttlSeconds(int(duration.total_seconds()))
 
     def ttlSeconds(self, seconds: int):
         """Set the time-to-live (TTL) for the session in seconds."""
@@ -195,10 +179,7 @@
 
     def idleTtl(self, duration: datetime.timedelta):
         """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
-        self.
-            "seconds": int(duration.total_seconds())
-        }
-        return self
+        return self.idleTtlSeconds(int(duration.total_seconds()))
 
     def idleTtlSeconds(self, seconds: int):
         """Set the idle time-to-live (idle TTL) for the session in seconds."""
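Both timedelta overloads now delegate to their seconds-based counterparts instead of assembling a `{"seconds": ...}` mapping inline. A minimal usage sketch (only the four ttl/idleTtl method names are taken from this diff; the import path and the rest of the builder chain are assumptions):

ttl_usage.py:

    import datetime

    # Assumed import path for the builder shown in this diff.
    from google.cloud.dataproc_spark_connect import DataprocSparkSession

    builder = (
        DataprocSparkSession.builder
        .ttl(datetime.timedelta(hours=2))         # now equivalent to .ttlSeconds(7200)
        .idleTtl(datetime.timedelta(minutes=30))  # now equivalent to .idleTtlSeconds(1800)
    )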
@@ -568,27 +549,24 @@
         default_datasource = os.getenv(
             "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
         )
-
-
-
-
-
-        bq_datasource_properties = {
+        match default_datasource:
+            case "bigquery":
+                # Merge default configs with existing properties,
+                # user configs take precedence
+                for k, v in {
             "spark.datasource.bigquery.viewsEnabled": "true",
             "spark.datasource.bigquery.writeMethod": "direct",
             "spark.sql.catalog.spark_catalog": "com.google.cloud.spark.bigquery.BigQuerySparkSessionCatalog",
-            "spark.sql.legacy.createHiveTableByDefault": "false",
             "spark.sql.sources.default": "bigquery",
-        }
-        # Merge default configs with existing properties, user configs take precedence
-        for k, v in bq_datasource_properties.items():
+                }.items():
             if k not in dataproc_config.runtime_config.properties:
                 dataproc_config.runtime_config.properties[k] = v
-
-
-
-
-
+            case _:
+                if default_datasource:
+                    logger.warning(
+                        f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
+                        f" {default_datasource}. Supported value is 'bigquery'."
+                    )
         return dataproc_config
 
     def _check_python_version_compatibility(self, runtime_version):
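The `match` rewrite preserves the 0.9.0 merge rule: BigQuery defaults are applied only where the user has not already set the property, and `spark.sql.legacy.createHiveTableByDefault` is dropped from the defaults. Note that structural pattern matching requires Python 3.10+ on the client. A standalone sketch of the precedence rule (dictionary names here are illustrative):

precedence_sketch.py:

    # Illustrative names; only the "user configs take precedence" rule is from the diff.
    bq_defaults = {
        "spark.sql.sources.default": "bigquery",
        "spark.datasource.bigquery.writeMethod": "direct",
    }
    user_properties = {"spark.datasource.bigquery.writeMethod": "indirect"}

    for k, v in bq_defaults.items():
        if k not in user_properties:  # never overwrite a user-set property
            user_properties[k] = v

    assert user_properties == {
        "spark.datasource.bigquery.writeMethod": "indirect",  # user value kept
        "spark.sql.sources.default": "bigquery",              # default filled in
    }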
@@ -598,9 +576,7 @@
 
         # Runtime version to server Python version mapping
         RUNTIME_PYTHON_MAP = {
-            "
-            "2.2": (3, 12),
-            "2.3": (3, 11),
+            "3.0": (3, 11),
         }
 
         client_python = sys.version_info[:2]  # (major, minor)
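With runtimes 2.2 and 2.3 dropped from the map, only runtime 3.0 (server Python 3.11) remains. A sketch of the comparison this map feeds (the mismatch handling is an assumption; the diff shows only the map and the `sys.version_info[:2]` lookup):

version_check_sketch.py:

    import sys
    import warnings

    # From the diff: runtime version -> server (major, minor) Python version.
    RUNTIME_PYTHON_MAP = {"3.0": (3, 11)}

    def check_python_version_compatibility(runtime_version: str) -> None:
        server_python = RUNTIME_PYTHON_MAP.get(runtime_version)
        client_python = sys.version_info[:2]  # (major, minor)
        if server_python and client_python != server_python:
            # Assumed behaviour: warn rather than fail on a mismatch.
            warnings.warn(
                f"Client Python {client_python} does not match server Python "
                f"{server_python} for Dataproc runtime {runtime_version}."
            )

    check_python_version_compatibility("3.0")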
@@ -813,7 +789,7 @@
         This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws
         an exception.
         Regarding pypi: Popular packages are already pre-installed in s8s runtime.
-        https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.
+        https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.3#python_libraries
         If there are conflicts/package doesn't exist, it throws an exception.
         """
         if sum([pypi, file, pyfile, archive]) > 1:
{dataproc_spark_connect-0.9.0 → dataproc_spark_connect-1.0.0rc1}/setup.py

@@ -20,7 +20,7 @@ long_description = (this_directory / "README.md").read_text()
 
 setup(
     name="dataproc-spark-connect",
-    version="0.9.0",
+    version="1.0.0rc1",
     description="Dataproc client library for Spark Connect",
     long_description=long_description,
     author="Google LLC",
@@ -31,7 +31,7 @@ setup(
         "google-api-core>=2.19",
         "google-cloud-dataproc>=5.18",
         "packaging>=20.0",
-        "pyspark[connect]~=
+        "pyspark[connect]~=4.0.0",
         "tqdm>=4.67",
         "websockets>=14.0",
     ],
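Because 1.0.0rc1 is a pre-release, a plain `pip install dataproc-spark-connect` will keep resolving to 0.9.0; picking up the release candidate requires `pip install --pre dataproc-spark-connect` or an exact pin such as `dataproc-spark-connect==1.0.0rc1`.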