PyPI - flyteplugins-spark - Versions diffs - 2.0.0b6__py3-none-any.whl → 2.0.0b44__py3-none-any.whl - Mend

flyteplugins-spark 2.0.0b6-py3-none-any.whl → 2.0.0b44-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

flyteplugins/spark/__init__.py CHANGED Viewed

@@ -1,3 +1,4 @@
-__all__ = ["Spark"]
+__all__ = ["ParquetToSparkDecoder", "Spark", "SparkToParquetEncoder"]
+from flyteplugins.spark.df_transformer import ParquetToSparkDecoder, SparkToParquetEncoder
 from flyteplugins.spark.task import Spark

flyteplugins/spark/df_transformer.py ADDED Viewed

@@ -0,0 +1,55 @@
+import flyte
+import pyspark
+from flyte.io import PARQUET, DataFrame
+from flyte.io.extend import DataFrameDecoder, DataFrameEncoder, DataFrameTransformerEngine
+from flyteidl2.core import literals_pb2, types_pb2
+from typing_extensions import cast
+class SparkToParquetEncoder(DataFrameEncoder):
+    def __init__(self):
+        super().__init__(python_type=pyspark.sql.DataFrame, supported_format=PARQUET)
+    async def encode(
+        self,
+        dataframe: DataFrame,
+        structured_dataset_type: types_pb2.StructuredDatasetType,
+    ) -> literals_pb2.StructuredDataset:
+        path = dataframe.uri
+        ctx = flyte.ctx()
+        if ctx and not path:
+            path = ctx.raw_data_path.get_random_remote_path()
+        ss = pyspark.sql.SparkSession.builder.getOrCreate()
+        # Avoid generating SUCCESS files
+        ss.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
+        cast(pyspark.sql.DataFrame, dataframe._raw_df).write.mode("overwrite").parquet(path=path)
+        structured_dataset_type.format = PARQUET
+        return literals_pb2.StructuredDataset(
+            uri=path,
+            metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type),
+        )
+class ParquetToSparkDecoder(DataFrameDecoder):
+    def __init__(self):
+        super().__init__(pyspark.sql.DataFrame, None, PARQUET)
+    async def decode(
+        self,
+        flyte_value: literals_pb2.StructuredDataset,
+        current_task_metadata: literals_pb2.StructuredDatasetMetadata,
+    ) -> pyspark.sql.DataFrame:
+        spark = pyspark.sql.SparkSession.builder.getOrCreate()
+        path = flyte_value.uri
+        if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
+            columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
+            return spark.read.parquet(path).select(*columns)
+        return spark.read.parquet(path)
+DataFrameTransformerEngine.register(SparkToParquetEncoder(), default_format_for_type=True)
+DataFrameTransformerEngine.register(ParquetToSparkDecoder(), default_format_for_type=True)

flyteplugins/spark/task.py CHANGED Viewed

@@ -4,13 +4,15 @@ import tempfile
 from dataclasses import dataclass
 from typing import Any, Dict, Optional
+import flyte
 from flyte import PodTemplate
-from flyte._initialize import is_initialized
 from flyte.extend import AsyncFunctionTaskTemplate, TaskPluginRegistry
 from flyte.models import SerializationContext
-from flyteidl.plugins.spark_pb2 import SparkApplication, SparkJob
+from flyteidl2.plugins.spark_pb2 import SparkApplication, SparkJob
 from google.protobuf.json_format import MessageToDict
+DEFAULT_SPARK_CONTEXT_NAME = "FlyteSpark"
 @dataclass
 class Spark(object):
@@ -50,18 +52,22 @@ class PysparkFunctionTask(AsyncFunctionTaskTemplate):
     plugin_config: Spark
     task_type: str = "spark"
+    debuggable: bool = True
     async def pre(self, *args, **kwargs) -> Dict[str, Any]:
         import pyspark as _pyspark
-        sess = _pyspark.sql.SparkSession.builder.appName("FlyteSpark").getOrCreate()
+        sess = _pyspark.sql.SparkSession.builder.appName(DEFAULT_SPARK_CONTEXT_NAME).getOrCreate()
-        if is_initialized():
+        if flyte.ctx().is_in_cluster():
             base_dir = tempfile.mkdtemp()
-            file_name = "flyte_wf"
+            code_bundle_dir = flyte.ctx().code_bundle.destination
+            file_name = "flyte_code_bundle"
             file_format = "zip"
-            shutil.make_archive(f"{base_dir}/{file_name}", file_format, os.getcwd())
-            sess.sparkContext.addPyFile(f"{base_dir}/{file_name}.{file_format}")
+            file_path = f"{base_dir}/{file_name}.{file_format}"
+            if not os.path.exists(file_path):
+                shutil.make_archive(f"{base_dir}/{file_name}", file_format, code_bundle_dir)
+                sess.sparkContext.addPyFile(file_path)
         return {"spark_session": sess}
@@ -72,8 +78,8 @@ class PysparkFunctionTask(AsyncFunctionTaskTemplate):
         job = SparkJob(
             sparkConf=self.plugin_config.spark_conf,
             hadoopConf=self.plugin_config.hadoop_conf,
-            mainApplicationFile=self.plugin_config.applications_path,
-            executorPath=self.plugin_config.executor_path,
+            mainApplicationFile=self.plugin_config.applications_path or "local://" + sctx.get_entrypoint_path(),
+            executorPath=self.plugin_config.executor_path or sctx.interpreter_path,
             mainClass="",
             applicationType=SparkApplication.PYTHON,
             driverPod=driver_pod,
@@ -82,5 +88,17 @@ class PysparkFunctionTask(AsyncFunctionTaskTemplate):
         return MessageToDict(job)
+    async def post(self, return_vals: Any) -> Any:
+        ctx = flyte.ctx()
+        if ctx and ctx.action.name == "a0":
+            # Only stop the SparkSession if it was created by the parent task in the debug mode.
+            # This is to make sure that the SparkSession is stopped by
+            # parent action only when debugging in the interactive mode.
+            # Note: The action name is always "a0" in the debug mode.
+            import pyspark as _pyspark
+            sess = _pyspark.sql.SparkSession.builder.appName(DEFAULT_SPARK_CONTEXT_NAME).getOrCreate()
+            sess.stop()
 TaskPluginRegistry.register(Spark, PysparkFunctionTask)

{flyteplugins_spark-2.0.0b6.dist-info → flyteplugins_spark-2.0.0b44.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: flyteplugins-spark
-Version: 2.0.0b6
+Version: 2.0.0b44
 Summary: Spark plugin for flyte
 Author-email: Kevin Su <pingsutw@users.noreply.github.com>
 Requires-Python: >=3.10
@@ -15,5 +15,5 @@ Union can execute Spark jobs natively on a Kubernetes Cluster, which manages a v
 To install the plugin, run the following command:
 ```bash
-pip install flyteplugins-spark
+pip install --pre flyteplugins-spark
 ```

flyteplugins_spark-2.0.0b44.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,7 @@
+flyteplugins/spark/__init__.py,sha256=UDl9xv0gAEATqXL7YerclxvJfcPhL10DkFVavM9nuFg,204
+flyteplugins/spark/df_transformer.py,sha256=u4QLAVVvKTUpK7CaOfXzwHnFSJnj6DuLhRjTeB_NG6Q,2164
+flyteplugins/spark/task.py,sha256=wrdWiuXyVWl_FwE0uxqk3td8gdMe_yrh2CYL3zE7Rug,4074
+flyteplugins_spark-2.0.0b44.dist-info/METADATA,sha256=jP5YAkPsfCq5x4YOTf0gfJCysiOJtI6hCSHIbzKuNxs,763
+flyteplugins_spark-2.0.0b44.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+flyteplugins_spark-2.0.0b44.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
+flyteplugins_spark-2.0.0b44.dist-info/RECORD,,

flyteplugins_spark-2.0.0b6.dist-info/RECORD DELETED Viewed

@@ -1,6 +0,0 @@
-flyteplugins/spark/__init__.py,sha256=7q06oXtVhOuNW4Nv-YhJvt8ORqhXmMPNYa-m4IkougI,63
-flyteplugins/spark/task.py,sha256=3fnw8QzcROAqmSAZIJLPc90Q-AvENV1fVdHChMFWdhs,3174
-flyteplugins_spark-2.0.0b6.dist-info/METADATA,sha256=k4BLnNiC7UT5kAbwNubiFt89xcZg4W58KM2qY3lRv7M,756
-flyteplugins_spark-2.0.0b6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-flyteplugins_spark-2.0.0b6.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
-flyteplugins_spark-2.0.0b6.dist-info/RECORD,,

{flyteplugins_spark-2.0.0b6.dist-info → flyteplugins_spark-2.0.0b44.dist-info}/WHEEL RENAMED Viewed

File without changes

{flyteplugins_spark-2.0.0b6.dist-info → flyteplugins_spark-2.0.0b44.dist-info}/top_level.txt RENAMED Viewed

File without changes

flyteplugins-spark 2.0.0b6__py3-none-any.whl → 2.0.0b44__py3-none-any.whl

flyteplugins-spark 2.0.0b6-py3-none-any.whl → 2.0.0b44-py3-none-any.whl