PyPI - pyspark-client - Versions diffs - 4.1.0.dev2__tar.gz → 4.1.0.dev3__tar.gz - Mend

pyspark-client 4.1.0.dev2__tar.gz → 4.1.0.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (370) hide show

{pyspark_client-4.1.0.dev2/pyspark_client.egg-info → pyspark_client-4.1.0.dev3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pyspark-client
-Version: 4.1.0.dev2
+Version: 4.1.0.dev3
 Summary: Python Spark Connect client for Apache Spark
 Home-page: https://github.com/apache/spark/tree/master/python
 Author: Spark Developers

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/errors/error-conditions.json RENAMED Viewed

@@ -1134,6 +1134,24 @@
       "Cannot serialize the function `<name>`. If you accessed the Spark session, or a DataFrame defined outside of the function, or any object that contains a Spark session, please be aware that they are not allowed in Spark Connect. For `foreachBatch`, please access the Spark session using `df.sparkSession`, where `df` is the first parameter in your `foreachBatch` function. For `StreamingQueryListener`, please access the Spark session using `self.spark`. For details please check out the PySpark doc for `foreachBatch` and `StreamingQueryListener`."
     ]
   },
+  "ST_INVALID_ALGORITHM_VALUE" : {
+    "message" : [
+      "Invalid or unsupported edge interpolation algorithm value: '<alg>'."
+    ],
+    "sqlState" : "22023"
+  },
+  "ST_INVALID_CRS_VALUE" : {
+    "message" : [
+      "Invalid or unsupported CRS (coordinate reference system) value: '<crs>'."
+    ],
+    "sqlState" : "22023"
+  },
+  "ST_INVALID_SRID_VALUE" : {
+    "message" : [
+      "Invalid or unsupported SRID (spatial reference identifier) value: <srid>."
+    ],
+    "sqlState" : "22023"
+  },
   "TEST_CLASS_NOT_COMPILED": {
     "message": [
       "<test_class_path> doesn't exist. Spark sql test classes are not compiled."

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/pipelines/__init__.py RENAMED Viewed

@@ -20,6 +20,7 @@ from pyspark.pipelines.api import (
     materialized_view,
     table,
     temporary_view,
+    create_sink,
 )
 __all__ = [
@@ -28,4 +29,5 @@ __all__ = [
     "materialized_view",
     "table",
     "temporary_view",
+    "create_sink",
 ]

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/pipelines/api.py RENAMED Viewed

@@ -23,10 +23,11 @@ from pyspark.pipelines.flow import Flow, QueryFunction
 from pyspark.pipelines.source_code_location import (
     get_caller_source_code_location,
 )
-from pyspark.pipelines.dataset import (
+from pyspark.pipelines.output import (
     MaterializedView,
     StreamingTable,
     TemporaryView,
+    Sink,
 )
 from pyspark.sql.types import StructType
@@ -156,7 +157,7 @@ def table(
         resolved_name = name or decorated.__name__
         registry = get_active_graph_element_registry()
-        registry.register_dataset(
+        registry.register_output(
             StreamingTable(
                 comment=comment,
                 name=resolved_name,
@@ -258,7 +259,7 @@ def materialized_view(
         resolved_name = name or decorated.__name__
         registry = get_active_graph_element_registry()
-        registry.register_dataset(
+        registry.register_output(
             MaterializedView(
                 comment=comment,
                 name=resolved_name,
@@ -351,7 +352,7 @@ def temporary_view(
         resolved_name = name or decorated.__name__
         registry = get_active_graph_element_registry()
-        registry.register_dataset(
+        registry.register_output(
             TemporaryView(
                 comment=comment,
                 name=resolved_name,
@@ -446,4 +447,46 @@ def create_streaming_table(
         schema=schema,
         format=format,
     )
-    get_active_graph_element_registry().register_dataset(table)
+    get_active_graph_element_registry().register_output(table)
+def create_sink(
+    name: str,
+    format: str,
+    options: Optional[Dict[str, str]] = None,
+) -> None:
+    """
+    Creates a sink that can be targeted by streaming flows, providing a generic destination \
+    for flows to send data external to the pipeline.
+    :param name: The name of the sink.
+    :param format: The format of the sink, e.g. "parquet".
+    :param options: A dict where the keys are the property names and the values are the \
+        property values. These properties will be set on the sink.
+    """
+    if type(name) is not str:
+        raise PySparkTypeError(
+            errorClass="NOT_STR",
+            messageParameters={"arg_name": "name", "arg_type": type(name).__name__},
+        )
+    if type(format) is not str:
+        raise PySparkTypeError(
+            errorClass="NOT_STR",
+            messageParameters={"arg_name": "format", "arg_type": type(format).__name__},
+        )
+    if options is not None and not isinstance(options, dict):
+        raise PySparkTypeError(
+            errorClass="NOT_DICT",
+            messageParameters={
+                "arg_name": "options",
+                "arg_type": type(options).__name__,
+            },
+        )
+    sink = Sink(
+        name=name,
+        format=format,
+        options=options or {},
+        source_code_location=get_caller_source_code_location(stacklevel=1),
+        comment=None,
+    )
+    get_active_graph_element_registry().register_output(sink)

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/pipelines/cli.py RENAMED Viewed

@@ -90,6 +90,7 @@ class PipelineSpec:
     """Spec for a pipeline.
     :param name: The name of the pipeline.
+    :param storage: The root directory for storing metadata, such as streaming checkpoints.
     :param catalog: The default catalog to use for the pipeline.
     :param database: The default database to use for the pipeline.
     :param configuration: A dictionary of Spark configuration properties to set for the pipeline.
@@ -97,6 +98,7 @@ class PipelineSpec:
     """
     name: str
+    storage: str
     catalog: Optional[str]
     database: Optional[str]
     configuration: Mapping[str, str]
@@ -150,8 +152,16 @@ def load_pipeline_spec(spec_path: Path) -> PipelineSpec:
 def unpack_pipeline_spec(spec_data: Mapping[str, Any]) -> PipelineSpec:
-    ALLOWED_FIELDS = {"name", "catalog", "database", "schema", "configuration", "libraries"}
-    REQUIRED_FIELDS = ["name"]
+    ALLOWED_FIELDS = {
+        "name",
+        "storage",
+        "catalog",
+        "database",
+        "schema",
+        "configuration",
+        "libraries",
+    }
+    REQUIRED_FIELDS = ["name", "storage"]
     for key in spec_data.keys():
         if key not in ALLOWED_FIELDS:
             raise PySparkException(
@@ -167,6 +177,7 @@ def unpack_pipeline_spec(spec_data: Mapping[str, Any]) -> PipelineSpec:
     return PipelineSpec(
         name=spec_data["name"],
+        storage=spec_data["storage"],
         catalog=spec_data.get("catalog"),
         database=spec_data.get("database", spec_data.get("schema")),
         configuration=validate_str_dict(spec_data.get("configuration", {}), "configuration"),
@@ -295,7 +306,9 @@ def run(
     spec = load_pipeline_spec(spec_path)
     log_with_curr_timestamp("Creating Spark session...")
-    spark_builder = SparkSession.builder
+    spark_builder = SparkSession.builder.config(
+        "spark.sql.connect.serverStacktrace.enabled", "false"
+    )
     for key, value in spec.configuration.items():
         spark_builder = spark_builder.config(key, value)
@@ -321,6 +334,7 @@ def run(
         full_refresh_all=full_refresh_all,
         refresh=refresh,
         dry=dry,
+        storage=spec.storage,
     )
     try:
         handle_pipeline_events(result_iter)

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/pipelines/graph_element_registry.py RENAMED Viewed

@@ -18,7 +18,7 @@
 from abc import ABC, abstractmethod
 from pathlib import Path
-from pyspark.pipelines.dataset import Dataset
+from pyspark.pipelines.output import Output
 from pyspark.pipelines.flow import Flow
 from contextlib import contextmanager
 from contextvars import ContextVar
@@ -35,7 +35,7 @@ class GraphElementRegistry(ABC):
     """
     @abstractmethod
-    def register_dataset(self, dataset: Dataset) -> None:
+    def register_output(self, output: Output) -> None:
         """Add the given dataset to the registry."""
     @abstractmethod

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/pipelines/init_cli.py RENAMED Viewed

@@ -19,6 +19,7 @@ from pathlib import Path
 SPEC = """
 name: {{ name }}
+storage: storage-root
 libraries:
   - glob:
       include: transformations/**

pyspark_client-4.1.0.dev2/pyspark/pipelines/dataset.py → pyspark_client-4.1.0.dev3/pyspark/pipelines/output.py RENAMED Viewed

@@ -22,12 +22,12 @@ from pyspark.sql.types import StructType
 @dataclass(frozen=True)
-class Dataset:
-    """Base class for definitions of datasets in a pipeline dataflow graph.
+class Output:
+    """Base class for definitions of outputs in a pipeline dataflow graph.
-    :param name: The name of the dataset. May be a multi-part name, such as "db.table".
-    :param comment: Optional comment for the dataset.
-    :param source_code_location: The location of the source code that created this dataset.
+    :param name: The name of the outputs. May be a multi-part name, such as "db.table".
+    :param comment: Optional comment for the output.
+    :param source_code_location: The location of the source code that created this output.
         This is used for debugging and tracing purposes.
     """
@@ -37,7 +37,7 @@ class Dataset:
 @dataclass(frozen=True)
-class Table(Dataset):
+class Table(Output):
     """
     Definition of a table in a pipeline dataflow graph, i.e. a catalog object backed by data in
     physical storage.
@@ -69,8 +69,17 @@ class StreamingTable(Table):
 @dataclass(frozen=True)
-class TemporaryView(Dataset):
+class TemporaryView(Output):
     """Definition of a temporary view in a pipeline dataflow graph. Temporary views can be
     referenced by flows within the dataflow graph, but are not visible outside of the graph."""
     pass
+@dataclass(frozen=True)
+class Sink(Output):
+    """Definition of an external sink in a pipeline dataflow graph. An external sink's
+    contents are written to an external system rather than managed by the pipeline itself."""
+    format: str
+    options: Mapping[str, str]

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/pipelines/source_code_location.py RENAMED Viewed

@@ -30,6 +30,34 @@ def get_caller_source_code_location(stacklevel: int) -> SourceCodeLocation:
     """
     Returns a SourceCodeLocation object representing the location code that invokes this function.
+    If this function is called from a decorator (ex. @sdp.table), note that the returned line
+    number is affected by how the decorator was triggered - i.e. whether @sdp.table or @sdp.table()
+    was called - AND what python version is being used
+    Case 1:
+    |@sdp.table()
+    |def fn
+    @sdp.table() is executed immediately, on line 1. This is true for all python versions.
+    Case 2:
+    |@sdp.table
+    |def fn
+    In python < 3.10, @sdp.table will expand to fn = sdp.table(fn), replacing the line that `fn` is
+    defined on. This would be line 2. More interestingly, this means:
+    |@sdp.table
+    |
+    |
+    |def fn
+    Will expand to fn = sdp.table(fn) on line 4, where `fn` is defined.
+    However, in python 3.10+, the line number in the stack trace will still be the line that the
+    decorator was defined on. In other words, case 2 will be treated the same as case 1, and the
+    line number will be 1.
     :param stacklevel: The number of stack frames to go up. 0 means the direct caller of this
         function, 1 means the caller of the caller, and so on.
     """

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/pipelines/spark_connect_graph_element_registry.py RENAMED Viewed

@@ -20,21 +20,25 @@ from pyspark.errors import PySparkTypeError
 from pyspark.sql import SparkSession
 from pyspark.sql.connect.dataframe import DataFrame as ConnectDataFrame
 from pyspark.pipelines.block_connect_access import block_spark_connect_execution_and_analysis
-from pyspark.pipelines.dataset import (
-    Dataset,
+from pyspark.pipelines.output import (
+    Output,
     MaterializedView,
     Table,
+    Sink,
     StreamingTable,
     TemporaryView,
 )
 from pyspark.pipelines.flow import Flow
 from pyspark.pipelines.graph_element_registry import GraphElementRegistry
+from pyspark.pipelines.source_code_location import SourceCodeLocation
+from pyspark.sql.connect.types import pyspark_types_to_proto_types
+from pyspark.sql.types import StructType
 from typing import Any, cast
 import pyspark.sql.connect.proto as pb2
 class SparkConnectGraphElementRegistry(GraphElementRegistry):
-    """Registers datasets and flows in a dataflow graph held in a Spark Connect server."""
+    """Registers outputs and flows in a dataflow graph held in a Spark Connect server."""
     def __init__(self, spark: SparkSession, dataflow_graph_id: str) -> None:
         # Cast because mypy seems to think `spark`` is a function, not an object. Likely related to
@@ -42,46 +46,66 @@ class SparkConnectGraphElementRegistry(GraphElementRegistry):
         self._client = cast(Any, spark).client
         self._dataflow_graph_id = dataflow_graph_id
-    def register_dataset(self, dataset: Dataset) -> None:
-        if isinstance(dataset, Table):
-            table_properties = dataset.table_properties
-            partition_cols = dataset.partition_cols
-            schema = None  # TODO
-            format = dataset.format
+    def register_output(self, output: Output) -> None:
+        table_details = None
+        sink_details = None
+        if isinstance(output, Table):
+            if isinstance(output.schema, str):
+                schema_string = output.schema
+                schema_data_type = None
+            elif isinstance(output.schema, StructType):
+                schema_string = None
+                schema_data_type = pyspark_types_to_proto_types(output.schema)
+            else:
+                schema_string = None
+                schema_data_type = None
+            table_details = pb2.PipelineCommand.DefineOutput.TableDetails(
+                table_properties=output.table_properties,
+                partition_cols=output.partition_cols,
+                format=output.format,
+                # Even though schema_string is not required, the generated Python code seems to
+                # erroneously think it is required.
+                schema_string=schema_string,  # type: ignore[arg-type]
+                schema_data_type=schema_data_type,
+            )
-            if isinstance(dataset, MaterializedView):
-                dataset_type = pb2.DatasetType.MATERIALIZED_VIEW
-            elif isinstance(dataset, StreamingTable):
-                dataset_type = pb2.DatasetType.TABLE
+            if isinstance(output, MaterializedView):
+                output_type = pb2.OutputType.MATERIALIZED_VIEW
+            elif isinstance(output, StreamingTable):
+                output_type = pb2.OutputType.TABLE
             else:
                 raise PySparkTypeError(
                     errorClass="UNSUPPORTED_PIPELINES_DATASET_TYPE",
-                    messageParameters={"dataset_type": type(dataset).__name__},
+                    messageParameters={"output_type": type(output).__name__},
                 )
-        elif isinstance(dataset, TemporaryView):
-            table_properties = None
-            partition_cols = None
-            schema = None
-            format = None
-            dataset_type = pb2.DatasetType.TEMPORARY_VIEW
+        elif isinstance(output, TemporaryView):
+            output_type = pb2.OutputType.TEMPORARY_VIEW
+            table_details = None
+        elif isinstance(output, Sink):
+            output_type = pb2.OutputType.SINK
+            sink_details = pb2.PipelineCommand.DefineOutput.SinkDetails(
+                options=output.options,
+                format=output.format,
+            )
         else:
             raise PySparkTypeError(
                 errorClass="UNSUPPORTED_PIPELINES_DATASET_TYPE",
-                messageParameters={"dataset_type": type(dataset).__name__},
+                messageParameters={"output_type": type(output).__name__},
             )
-        inner_command = pb2.PipelineCommand.DefineDataset(
+        inner_command = pb2.PipelineCommand.DefineOutput(
             dataflow_graph_id=self._dataflow_graph_id,
-            dataset_name=dataset.name,
-            dataset_type=dataset_type,
-            comment=dataset.comment,
-            table_properties=table_properties,
-            partition_cols=partition_cols,
-            schema=schema,
-            format=format,
+            output_name=output.name,
+            output_type=output_type,
+            comment=output.comment,
+            sink_details=sink_details,
+            table_details=table_details,
+            source_code_location=source_code_location_to_proto(output.source_code_location),
         )
         command = pb2.Command()
-        command.pipeline_command.define_dataset.CopyFrom(inner_command)
+        command.pipeline_command.define_output.CopyFrom(inner_command)
         self._client.execute_command(command)
     def register_flow(self, flow: Flow) -> None:
@@ -89,12 +113,17 @@ class SparkConnectGraphElementRegistry(GraphElementRegistry):
             df = flow.func()
         relation = cast(ConnectDataFrame, df)._plan.plan(self._client)
+        relation_flow_details = pb2.PipelineCommand.DefineFlow.WriteRelationFlowDetails(
+            relation=relation,
+        )
         inner_command = pb2.PipelineCommand.DefineFlow(
             dataflow_graph_id=self._dataflow_graph_id,
             flow_name=flow.name,
             target_dataset_name=flow.target,
-            relation=relation,
+            relation_flow_details=relation_flow_details,
             sql_conf=flow.spark_conf,
+            source_code_location=source_code_location_to_proto(flow.source_code_location),
         )
         command = pb2.Command()
         command.pipeline_command.define_flow.CopyFrom(inner_command)
@@ -109,3 +138,11 @@ class SparkConnectGraphElementRegistry(GraphElementRegistry):
         command = pb2.Command()
         command.pipeline_command.define_sql_graph_elements.CopyFrom(inner_command)
         self._client.execute_command(command)
+def source_code_location_to_proto(
+    source_code_location: SourceCodeLocation,
+) -> pb2.SourceCodeLocation:
+    return pb2.SourceCodeLocation(
+        file_name=source_code_location.filename, line_number=source_code_location.line_number
+    )

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/pipelines/spark_connect_pipeline.py RENAMED Viewed

@@ -72,6 +72,7 @@ def start_run(
     full_refresh_all: bool,
     refresh: Optional[Sequence[str]],
     dry: bool,
+    storage: str,
 ) -> Iterator[Dict[str, Any]]:
     """Start a run of the dataflow graph in the Spark Connect server.
@@ -79,6 +80,8 @@ def start_run(
     :param full_refresh: List of datasets to reset and recompute.
     :param full_refresh_all: Perform a full graph reset and recompute.
     :param refresh: List of datasets to update.
+    :param dry: If true, the run will not actually execute any flows, but only validate the graph.
+    :param storage: The storage location to store metadata such as streaming checkpoints.
     """
     inner_command = pb2.PipelineCommand.StartRun(
         dataflow_graph_id=dataflow_graph_id,
@@ -86,6 +89,7 @@ def start_run(
         full_refresh_all=full_refresh_all,
         refresh_selection=refresh or [],
         dry=dry,
+        storage=storage,
     )
     command = pb2.Command()
     command.pipeline_command.start_run.CopyFrom(inner_command)

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/sql/avro/functions.py RENAMED Viewed

@@ -69,7 +69,7 @@ def from_avro(
     >>> df = spark.createDataFrame(data, ("key", "value"))
     >>> avroDf = df.select(to_avro(df.value).alias("avro"))
     >>> avroDf.collect()
-    [Row(avro=bytearray(b'\\x00\\x00\\x04\\x00\\nAlice'))]
+    [Row(avro=b'\\x00\\x00\\x04\\x00\\nAlice')]
     >>> jsonFormatSchema = '''{"type":"record","name":"topLevelRecord","fields":
     ...     [{"name":"avro","type":[{"type":"record","name":"value","namespace":"topLevelRecord",
@@ -141,12 +141,12 @@ def to_avro(data: "ColumnOrName", jsonFormatSchema: str = "") -> Column:
     >>> data = ['SPADES']
     >>> df = spark.createDataFrame(data, "string")
     >>> df.select(to_avro(df.value).alias("suite")).collect()
-    [Row(suite=bytearray(b'\\x00\\x0cSPADES'))]
+    [Row(suite=b'\\x00\\x0cSPADES')]
     >>> jsonFormatSchema = '''["null", {"type": "enum", "name": "value",
     ...     "symbols": ["SPADES", "HEARTS", "DIAMONDS", "CLUBS"]}]'''
     >>> df.select(to_avro(df.value, jsonFormatSchema).alias("suite")).collect()
-    [Row(suite=bytearray(b'\\x02\\x00'))]
+    [Row(suite=b'\\x02\\x00')]
     """
     from py4j.java_gateway import JVMView
     from pyspark.sql.classic.column import _to_java_column

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/sql/column.py RENAMED Viewed

@@ -21,6 +21,7 @@ import sys
 from typing import (
     overload,
     Any,
+    Callable,
     TYPE_CHECKING,
     Union,
 )
@@ -1538,6 +1539,58 @@ class Column(TableValuedFunctionArgument):
         """
         ...
+    @dispatch_col_method
+    def transform(self, f: Callable[["Column"], "Column"]) -> "Column":
+        """
+        Applies a transformation function to this column.
+        This method allows you to apply a function that takes a Column and returns a Column,
+        enabling method chaining and functional transformations.
+        .. versionadded:: 4.1.0
+        Parameters
+        ----------
+        f : callable
+            A function that takes a :class:`Column` and returns a :class:`Column`.
+        Returns
+        -------
+        :class:`Column`
+            The result of applying the function to this column.
+        Examples
+        --------
+        Example 1: Chain built-in functions
+        >>> from pyspark.sql.functions import trim, upper
+        >>> df = spark.createDataFrame([("  hello  ",), ("  world  ",)], ["text"])
+        >>> df.select(df.text.transform(trim).transform(upper).alias("result")).show()
+        +------+
+        |result|
+        +------+
+        | HELLO|
+        | WORLD|
+        +------+
+        Example 2: Use lambda functions
+        >>> df = spark.createDataFrame([(10,), (20,), (30,)], ["value"])
+        >>> df.select(
+        ...     df.value.transform(lambda c: c + 5)
+        ...     .transform(lambda c: c * 2)
+        ...     .transform(lambda c: c - 10).alias("result")
+        ... ).show()
+        +------+
+        |result|
+        +------+
+        |    20|
+        |    40|
+        |    60|
+        +------+
+        """
+        ...
     @dispatch_col_method
     def outer(self) -> "Column":
         """

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/sql/connect/_typing.py RENAMED Viewed

@@ -39,7 +39,7 @@ LiteralType = PrimitiveType
 DecimalLiteral = decimal.Decimal
-DateTimeLiteral = Union[datetime.datetime, datetime.date]
+DateTimeLiteral = Union[datetime.date, datetime.time, datetime.datetime]
 DataTypeOrString = Union[DataType, str]

{pyspark_client-4.1.0.dev2 → pyspark_client-4.1.0.dev3}/pyspark/sql/connect/client/artifact.py RENAMED Viewed

@@ -427,6 +427,30 @@ class ArtifactManager:
         status = resp.statuses.get(artifactName)
         return status.exists if status is not None else False
+    def get_cached_artifacts(self, hashes: list[str]) -> set[str]:
+        """
+        Batch check which artifacts are already cached on the server.
+        Returns a set of hashes that are already cached.
+        """
+        if not hashes:
+            return set()
+        artifact_names = [f"{CACHE_PREFIX}/{hash}" for hash in hashes]
+        request = proto.ArtifactStatusesRequest(
+            user_context=self._user_context, session_id=self._session_id, names=artifact_names
+        )
+        resp: proto.ArtifactStatusesResponse = self._stub.ArtifactStatus(
+            request, metadata=self._metadata
+        )
+        cached = set()
+        for hash in hashes:
+            artifact_name = f"{CACHE_PREFIX}/{hash}"
+            status = resp.statuses.get(artifact_name)
+            if status is not None and status.exists:
+                cached.add(hash)
+        return cached
     def cache_artifact(self, blob: bytes) -> str:
         """
         Cache the give blob at the session.
@@ -442,3 +466,34 @@ class ArtifactManager:
                 # TODO(SPARK-42658): Handle responses containing CRC failures.
         return hash
+    def cache_artifacts(self, blobs: list[bytes]) -> list[str]:
+        """
+        Cache the given blobs at the session.
+        This method batches artifact status checks and uploads to minimize RPC overhead.
+        """
+        # Compute hashes for all blobs upfront
+        hashes = [hashlib.sha256(blob).hexdigest() for blob in blobs]
+        unique_hashes = list(set(hashes))
+        # Batch check which artifacts are already cached
+        cached_hashes = self.get_cached_artifacts(unique_hashes)
+        # Collect unique artifacts that need to be uploaded
+        seen_hashes = set()
+        artifacts_to_add = []
+        for blob, hash in zip(blobs, hashes):
+            if hash not in cached_hashes and hash not in seen_hashes:
+                artifacts_to_add.append(new_cache_artifact(hash, InMemory(blob)))
+                seen_hashes.add(hash)
+        # Batch upload all missing artifacts in a single RPC call
+        if artifacts_to_add:
+            requests = self._add_artifacts(artifacts_to_add)
+            response: proto.AddArtifactsResponse = self._retrieve_responses(requests)
+            summaries: List[proto.AddArtifactsResponse.ArtifactSummary] = []
+            for summary in response.artifacts:
+                summaries.append(summary)
+                # TODO(SPARK-42658): Handle responses containing CRC failures.
+        return hashes

pyspark-client 4.1.0.dev2__tar.gz → 4.1.0.dev3__tar.gz

pyspark-client 4.1.0.dev2__tar.gz → 4.1.0.dev3__tar.gz