dataproc-spark-connect 0.9.0-py2.py3-none-any.whl → 1.0.0rc2-py2.py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
--- dataproc_spark_connect-0.9.0.dist-info/METADATA
+++ dataproc_spark_connect-1.0.0rc2.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dataproc-spark-connect
- Version: 0.9.0
+ Version: 1.0.0rc2
  Summary: Dataproc client library for Spark Connect
  Home-page: https://github.com/GoogleCloudDataproc/dataproc-spark-connect-python
  Author: Google LLC
@@ -9,7 +9,7 @@ License-File: LICENSE
  Requires-Dist: google-api-core>=2.19
  Requires-Dist: google-cloud-dataproc>=5.18
  Requires-Dist: packaging>=20.0
- Requires-Dist: pyspark[connect]~=3.5.1
+ Requires-Dist: pyspark[connect]~=4.0.0
  Requires-Dist: tqdm>=4.67
  Requires-Dist: websockets>=14.0
  Dynamic: author
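
Note on the dependency bump: the PySpark pin moves from the 3.5 line to the 4.0 line using a compatible-release specifier. A minimal sketch of what ~=4.0.0 admits, using the packaging library (already a declared dependency):

    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet("~=4.0.0")   # equivalent to >=4.0.0, ==4.0.*
    print(spec.contains("4.0.1"))    # True: patch releases of 4.0 are allowed
    print(spec.contains("4.1.0"))    # False: the next minor line is excluded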
--- /dev/null
+++ dataproc_spark_connect-1.0.0rc2.dist-info/RECORD
@@ -0,0 +1,13 @@
+ dataproc_spark_connect-1.0.0rc2.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
+ google/cloud/dataproc_spark_connect/environment.py,sha256=l1wWiHMHtBQ9YonE-kHTpaZlN9vLE4fyJSTn7RZP6kA,2503
+ google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
+ google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
+ google/cloud/dataproc_spark_connect/session.py,sha256=FdJI_F9k6EfIvlgC1-f-Qb_Uwg9SmkIyWhpNZlqGQhw,40405
+ google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
+ google/cloud/dataproc_spark_connect/client/core.py,sha256=GRc4OCTBvIvdagjxOPoDO22vLtt8xDSerdREMRDeUBY,4659
+ google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
+ dataproc_spark_connect-1.0.0rc2.dist-info/METADATA,sha256=o2vfu5NRn2Pb0N7cavrBm2OLwP_LXQBVrclNjEtb9Do,3468
+ dataproc_spark_connect-1.0.0rc2.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
+ dataproc_spark_connect-1.0.0rc2.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
+ dataproc_spark_connect-1.0.0rc2.dist-info/RECORD,,
--- google/cloud/dataproc_spark_connect/client/core.py
+++ google/cloud/dataproc_spark_connect/client/core.py
@@ -15,14 +15,14 @@ import logging
 
  import google
  import grpc
- from pyspark.sql.connect.client import ChannelBuilder
+ from pyspark.sql.connect.client import DefaultChannelBuilder
 
  from . import proxy
 
  logger = logging.getLogger(__name__)
 
 
- class DataprocChannelBuilder(ChannelBuilder):
+ class DataprocChannelBuilder(DefaultChannelBuilder):
  """
  This is a helper class that is used to create a GRPC channel based on the given
  connection string per the documentation of Spark Connect.
@@ -88,7 +88,9 @@ class ProxiedChannel(grpc.Channel):
  self._proxy = proxy.DataprocSessionProxy(0, target_host)
  self._proxy.start()
  self._proxied_connect_url = f"sc://localhost:{self._proxy.port}"
- self._wrapped = ChannelBuilder(self._proxied_connect_url).toChannel()
+ self._wrapped = DefaultChannelBuilder(
+ self._proxied_connect_url
+ ).toChannel()
 
  def __enter__(self):
  return self
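
Both core.py hunks replace ChannelBuilder with DefaultChannelBuilder, which appears to track the PySpark 4.0 client API where DefaultChannelBuilder is the concrete builder that parses an sc:// connection string. A minimal sketch of the call pattern the new code relies on (the endpoint is a stand-in):

    # Assumes PySpark 4.x; same DefaultChannelBuilder(...).toChannel() pattern
    # used by ProxiedChannel above. "sc://localhost:15002" is a placeholder URL.
    from pyspark.sql.connect.client import DefaultChannelBuilder

    channel = DefaultChannelBuilder("sc://localhost:15002").toChannel()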
--- google/cloud/dataproc_spark_connect/environment.py
+++ google/cloud/dataproc_spark_connect/environment.py
@@ -13,6 +13,7 @@
  # limitations under the License.
 
  import os
+ import sys
  from typing import Callable, Tuple, List
 
 
@@ -46,6 +47,18 @@ def is_jetbrains_ide() -> bool:
  return "jetbrains" in os.getenv("TERMINAL_EMULATOR", "").lower()
 
 
+ def is_interactive():
+ return hasattr(sys, "ps1")
+
+
+ def is_terminal():
+ return sys.stdin.isatty()
+
+
+ def is_interactive_terminal():
+ return is_interactive() and is_terminal()
+
+
  def get_client_environment_label() -> str:
  """
  Map current environment to a standardized client label.
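
The three new helpers classify how the client process is running; the session code later uses them to decide whether to render a progress bar and which tqdm flavor to use. A small usage sketch, assuming the module's import path from this package:

    from google.cloud.dataproc_spark_connect import environment

    if environment.is_interactive_terminal():
        print("interactive REPL on a TTY: CLI tqdm bar")
    elif environment.is_interactive():
        print("interactive without a TTY (e.g. notebook kernel): notebook tqdm bar")
    else:
        print("non-interactive run: no progress bar is rendered")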
--- google/cloud/dataproc_spark_connect/session.py
+++ google/cloud/dataproc_spark_connect/session.py
@@ -24,8 +24,10 @@ import threading
  import time
  import uuid
  import tqdm
+ from tqdm import tqdm as cli_tqdm
+ from tqdm.notebook import tqdm as notebook_tqdm
  from types import MethodType
- from typing import Any, cast, ClassVar, Dict, Optional, Union
+ from typing import Any, cast, ClassVar, Dict, Iterable, Optional, Union
 
  from google.api_core import retry
  from google.api_core.client_options import ClientOptions
@@ -103,13 +105,14 @@ class DataprocSparkSession(SparkSession):
  ... ) # doctest: +SKIP
  """
 
- _DEFAULT_RUNTIME_VERSION = "2.3"
+ _DEFAULT_RUNTIME_VERSION = "3.0"
 
  _active_s8s_session_uuid: ClassVar[Optional[str]] = None
  _project_id = None
  _region = None
  _client_options = None
  _active_s8s_session_id: ClassVar[Optional[str]] = None
+ _execution_progress_bar = dict()
 
  class Builder(SparkSession.Builder):
 
@@ -158,19 +161,6 @@ class DataprocSparkSession(SparkSession):
  self.dataproc_config.environment_config.execution_config.service_account = (
  account
  )
- # Automatically set auth type to SERVICE_ACCOUNT when service account is provided
- # This overrides any env var setting to simplify user experience
- self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
- AuthenticationConfig.AuthenticationType.SERVICE_ACCOUNT
- )
- return self
-
- def authType(
- self, auth_type: "AuthenticationConfig.AuthenticationType"
- ):
- self.dataproc_config.environment_config.execution_config.authentication_config.user_workload_authentication_type = (
- auth_type
- )
  return self
 
  def subnetwork(self, subnet: str):
@@ -181,10 +171,7 @@
 
  def ttl(self, duration: datetime.timedelta):
  """Set the time-to-live (TTL) for the session using a timedelta object."""
- self.dataproc_config.environment_config.execution_config.ttl = {
- "seconds": int(duration.total_seconds())
- }
- return self
+ return self.ttlSeconds(int(duration.total_seconds()))
 
  def ttlSeconds(self, seconds: int):
  """Set the time-to-live (TTL) for the session in seconds."""
@@ -195,10 +182,7 @@
 
  def idleTtl(self, duration: datetime.timedelta):
  """Set the idle time-to-live (idle TTL) for the session using a timedelta object."""
- self.dataproc_config.environment_config.execution_config.idle_ttl = {
- "seconds": int(duration.total_seconds())
- }
- return self
+ return self.idleTtlSeconds(int(duration.total_seconds()))
 
  def idleTtlSeconds(self, seconds: int):
  """Set the idle time-to-live (idle TTL) for the session in seconds."""
@@ -266,6 +250,9 @@
  assert self._channel_builder is not None
  session = DataprocSparkSession(connection=self._channel_builder)
 
+ # Register handler for Cell Execution Progress bar
+ session._register_progress_execution_handler()
+
  DataprocSparkSession._set_default_and_active_session(session)
  return session
 
@@ -568,27 +555,24 @@
  default_datasource = os.getenv(
  "DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"
  )
- if (
- default_datasource
- and dataproc_config.runtime_config.version == "2.3"
- ):
- if default_datasource == "bigquery":
- bq_datasource_properties = {
+ match default_datasource:
+ case "bigquery":
+ # Merge default configs with existing properties,
+ # user configs take precedence
+ for k, v in {
  "spark.datasource.bigquery.viewsEnabled": "true",
  "spark.datasource.bigquery.writeMethod": "direct",
  "spark.sql.catalog.spark_catalog": "com.google.cloud.spark.bigquery.BigQuerySparkSessionCatalog",
- "spark.sql.legacy.createHiveTableByDefault": "false",
  "spark.sql.sources.default": "bigquery",
- }
- # Merge default configs with existing properties, user configs take precedence
- for k, v in bq_datasource_properties.items():
+ }.items():
  if k not in dataproc_config.runtime_config.properties:
  dataproc_config.runtime_config.properties[k] = v
- else:
- logger.warning(
- f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
- f" {default_datasource}. Supported value is 'bigquery'."
- )
+ case _:
+ if default_datasource:
+ logger.warning(
+ f"DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE is set to an invalid value:"
+ f" {default_datasource}. Supported value is 'bigquery'."
+ )
  return dataproc_config
 
  def _check_python_version_compatibility(self, runtime_version):
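
The datasource defaulting no longer gates on runtime version 2.3, drops the spark.sql.legacy.createHiveTableByDefault override, and is rewritten with match/case (Python 3.10+); 'bigquery' remains the only supported value. A sketch of opting in (import path assumed; properties already present in the runtime config are left untouched by the merge above):

    import os

    # Read while the Dataproc session config is assembled, i.e. before getOrCreate().
    os.environ["DATAPROC_SPARK_CONNECT_DEFAULT_DATASOURCE"] = "bigquery"

    from google.cloud.dataproc_spark_connect import DataprocSparkSession

    spark = DataprocSparkSession.builder.getOrCreate()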
@@ -598,9 +582,7 @@ class DataprocSparkSession(SparkSession):
 
  # Runtime version to server Python version mapping
  RUNTIME_PYTHON_MAP = {
- "1.2": (3, 12),
- "2.2": (3, 12),
- "2.3": (3, 11),
+ "3.0": (3, 11),
  }
 
  client_python = sys.version_info[:2] # (major, minor)
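
The compatibility map now lists only runtime 3.0, which pairs with server-side Python 3.11. A tiny client-side check in the same spirit (hypothetical; it mirrors the mapping rather than calling the private method):

    import sys

    RUNTIME_PYTHON_MAP = {"3.0": (3, 11)}  # mirrors the mapping above
    if sys.version_info[:2] != RUNTIME_PYTHON_MAP["3.0"]:
        print("warning: client Python differs from the runtime 3.0 server Python (3.11)")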
@@ -706,6 +688,78 @@ class DataprocSparkSession(SparkSession):
  execute_and_fetch_as_iterator_wrapped_method, self.client
  )
 
+ # Patching clearProgressHandlers method to not remove Dataproc Progress Handler
+ clearProgressHandlers_base_method = self.clearProgressHandlers
+
+ def clearProgressHandlers_wrapper_method(_, *args, **kwargs):
+ clearProgressHandlers_base_method(*args, **kwargs)
+
+ self._register_progress_execution_handler()
+
+ self.clearProgressHandlers = MethodType(
+ clearProgressHandlers_wrapper_method, self
+ )
+
+ def _register_progress_execution_handler(self):
+ from pyspark.sql.connect.shell.progress import StageInfo
+
+ def handler(
+ stages: Optional[Iterable[StageInfo]],
+ inflight_tasks: int,
+ operation_id: Optional[str],
+ done: bool,
+ ):
+ if operation_id is None:
+ return
+
+ # Don't build / render progress bar for non-interactive (despite
+ # Ipython or non-IPython)
+ if not environment.is_interactive():
+ return
+
+ total_tasks = 0
+ completed_tasks = 0
+
+ for stage in stages or []:
+ total_tasks += stage.num_tasks
+ completed_tasks += stage.num_completed_tasks
+
+ tqdm_pbar = notebook_tqdm
+ if environment.is_interactive_terminal():
+ tqdm_pbar = cli_tqdm
+
+ # Use a lock to ensure only one thread can access and modify
+ # the shared dictionaries at a time.
+ with self._lock:
+ if operation_id in self._execution_progress_bar:
+ pbar = self._execution_progress_bar[operation_id]
+ if pbar.total != total_tasks:
+ pbar.reset(
+ total=total_tasks
+ ) # This force resets the progress bar % too on next refresh
+ else:
+ pbar = tqdm_pbar(
+ total=total_tasks,
+ leave=True,
+ dynamic_ncols=True,
+ bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} Tasks",
+ )
+ self._execution_progress_bar[operation_id] = pbar
+
+ # To handle skipped or failed tasks.
+ # StageInfo proto doesn't have skipped and failed tasks information to process.
+ if done and completed_tasks < total_tasks:
+ completed_tasks = total_tasks
+
+ pbar.n = completed_tasks
+ pbar.refresh()
+
+ if done:
+ pbar.close()
+ self._execution_progress_bar.pop(operation_id, None)
+
+ self.registerProgressHandler(handler)
+
  @staticmethod
  def _sql_lazy_transformation(req):
  # Select SQL command
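
The new handler tallies StageInfo task counts per operation and drives either a notebook or a CLI tqdm bar, while clearProgressHandlers is patched so the Dataproc bar is re-registered after a clear. For reference, a custom handler with the same call signature can be registered alongside it (sketch; spark stands for an active DataprocSparkSession):

    def log_progress(stages, inflight_tasks, operation_id, done):
        total = sum(s.num_tasks for s in stages or [])
        completed = sum(s.num_completed_tasks for s in stages or [])
        print(f"[{operation_id}] {completed}/{total} tasks, done={done}")

    spark.registerProgressHandler(log_progress)
    spark.range(1_000_000).count()   # handler fires as progress updates arrive
    spark.clearProgressHandlers()    # patched above: the Dataproc bar is re-registered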
@@ -813,7 +867,7 @@ class DataprocSparkSession(SparkSession):
  This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws
  an exception.
  Regarding pypi: Popular packages are already pre-installed in s8s runtime.
- https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.2#python_libraries
+ https://cloud.google.com/dataproc-serverless/docs/concepts/versions/spark-runtime-2.3#python_libraries
  If there are conflicts/package doesn't exist, it throws an exception.
  """
  if sum([pypi, file, pyfile, archive]) > 1:
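
The docstring's runtime reference moves from the 2.2 to the 2.3 release notes. Assuming this docstring belongs to the package's addArtifacts override (the pypi/file/pyfile/archive parameters and the sum(...) guard suggest it), only one artifact kind may be passed per call; a hypothetical usage sketch against an active session:

    # Hypothetical calls; the package and file names are placeholders.
    spark.addArtifacts("spacy==3.7.2", pypi=True)   # install a PyPI package server-side
    spark.addArtifacts("helpers.py", pyfile=True)   # or ship a local Python file instead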
--- dataproc_spark_connect-0.9.0.dist-info/RECORD
+++ /dev/null
@@ -1,13 +0,0 @@
- dataproc_spark_connect-0.9.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- google/cloud/dataproc_spark_connect/__init__.py,sha256=dIqHNWVWWrSuRf26x11kX5e9yMKSHCtmI_GBj1-FDdE,1101
- google/cloud/dataproc_spark_connect/environment.py,sha256=UICy9XyqAxL-cryVWx7GZPRAxoir5LKk0dtqqY_l--c,2307
- google/cloud/dataproc_spark_connect/exceptions.py,sha256=WF-qdzgdofRwILCriIkjjsmjObZfF0P3Ecg4lv-Hmec,968
- google/cloud/dataproc_spark_connect/pypi_artifacts.py,sha256=gd-VMwiVP-EJuPp9Vf9Shx8pqps3oSKp0hBcSSZQS-A,1575
- google/cloud/dataproc_spark_connect/session.py,sha256=ELj5hDhofK1967eE5YaG_LP5B80KWFQWJn5gxi9yYt0,38577
- google/cloud/dataproc_spark_connect/client/__init__.py,sha256=6hCNSsgYlie6GuVpc5gjFsPnyeMTScTpXSPYqp1fplY,615
- google/cloud/dataproc_spark_connect/client/core.py,sha256=m3oXTKBm3sBy6jhDu9GRecrxLb5CdEM53SgMlnJb6ag,4616
- google/cloud/dataproc_spark_connect/client/proxy.py,sha256=qUZXvVY1yn934vE6nlO495XUZ53AUx9O74a9ozkGI9U,8976
- dataproc_spark_connect-0.9.0.dist-info/METADATA,sha256=1z8Ag1P_Lh9db0Rk9nGFoOu6sdeRs0UlrgtOqN_OhIQ,3465
- dataproc_spark_connect-0.9.0.dist-info/WHEEL,sha256=JNWh1Fm1UdwIQV075glCn4MVuCRs0sotJIq-J6rbxCU,109
- dataproc_spark_connect-0.9.0.dist-info/top_level.txt,sha256=_1QvSJIhFAGfxb79D6DhB7SUw2X6T4rwnz_LLrbcD3c,7
- dataproc_spark_connect-0.9.0.dist-info/RECORD,,