flyteplugins-spark 2.0.0b6__py3-none-any.whl → 2.0.0b44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flyteplugins/spark/__init__.py +2 -1
- flyteplugins/spark/df_transformer.py +55 -0
- flyteplugins/spark/task.py +27 -9
- {flyteplugins_spark-2.0.0b6.dist-info → flyteplugins_spark-2.0.0b44.dist-info}/METADATA +2 -2
- flyteplugins_spark-2.0.0b44.dist-info/RECORD +7 -0
- flyteplugins_spark-2.0.0b6.dist-info/RECORD +0 -6
- {flyteplugins_spark-2.0.0b6.dist-info → flyteplugins_spark-2.0.0b44.dist-info}/WHEEL +0 -0
- {flyteplugins_spark-2.0.0b6.dist-info → flyteplugins_spark-2.0.0b44.dist-info}/top_level.txt +0 -0
flyteplugins/spark/df_transformer.py
ADDED
```diff
@@ -0,0 +1,55 @@
+import flyte
+import pyspark
+from flyte.io import PARQUET, DataFrame
+from flyte.io.extend import DataFrameDecoder, DataFrameEncoder, DataFrameTransformerEngine
+from flyteidl2.core import literals_pb2, types_pb2
+from typing_extensions import cast
+
+
+class SparkToParquetEncoder(DataFrameEncoder):
+    def __init__(self):
+        super().__init__(python_type=pyspark.sql.DataFrame, supported_format=PARQUET)
+
+    async def encode(
+        self,
+        dataframe: DataFrame,
+        structured_dataset_type: types_pb2.StructuredDatasetType,
+    ) -> literals_pb2.StructuredDataset:
+        path = dataframe.uri
+        ctx = flyte.ctx()
+        if ctx and not path:
+            path = ctx.raw_data_path.get_random_remote_path()
+
+        ss = pyspark.sql.SparkSession.builder.getOrCreate()
+
+        # Avoid generating SUCCESS files
+        ss.conf.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
+        cast(pyspark.sql.DataFrame, dataframe._raw_df).write.mode("overwrite").parquet(path=path)
+
+        structured_dataset_type.format = PARQUET
+        return literals_pb2.StructuredDataset(
+            uri=path,
+            metadata=literals_pb2.StructuredDatasetMetadata(structured_dataset_type=structured_dataset_type),
+        )
+
+
+class ParquetToSparkDecoder(DataFrameDecoder):
+    def __init__(self):
+        super().__init__(pyspark.sql.DataFrame, None, PARQUET)
+
+    async def decode(
+        self,
+        flyte_value: literals_pb2.StructuredDataset,
+        current_task_metadata: literals_pb2.StructuredDatasetMetadata,
+    ) -> pyspark.sql.DataFrame:
+        spark = pyspark.sql.SparkSession.builder.getOrCreate()
+        path = flyte_value.uri
+
+        if current_task_metadata.structured_dataset_type and current_task_metadata.structured_dataset_type.columns:
+            columns = [c.name for c in current_task_metadata.structured_dataset_type.columns]
+            return spark.read.parquet(path).select(*columns)
+        return spark.read.parquet(path)
+
+
+DataFrameTransformerEngine.register(SparkToParquetEncoder(), default_format_for_type=True)
+DataFrameTransformerEngine.register(ParquetToSparkDecoder(), default_format_for_type=True)
```
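The new module registers this encoder/decoder pair with the global `DataFrameTransformerEngine` at import time, making `pyspark.sql.DataFrame` a first-class task input/output type backed by Parquet. Below is a minimal sketch of the round trip this enables, assuming the flyte v2 `TaskEnvironment`/`@env.task` API; the task and environment names are illustrative, not part of this package:

```python
# Sketch (hypothetical usage): round-tripping a Spark DataFrame through the
# newly registered Parquet encoder/decoder. Task and environment names are
# made up for illustration.
import flyte
import pyspark

import flyteplugins.spark.df_transformer  # noqa: F401 -- registers encoder/decoder on import

env = flyte.TaskEnvironment(name="spark-df-demo")


@env.task
async def make_df() -> pyspark.sql.DataFrame:
    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    # On return, SparkToParquetEncoder writes the frame to Parquet at a random
    # remote path and hands Flyte a StructuredDataset literal pointing at it.
    return spark.createDataFrame([(1, "a"), (2, "b")], ["id", "label"])


@env.task
async def count_rows(df: pyspark.sql.DataFrame) -> int:
    # Before this body runs, ParquetToSparkDecoder loads the Parquet file back
    # into a lazy Spark DataFrame.
    return df.count()
```

Because the decoder applies `select(*columns)` before returning, a consuming task that declares a column subset gets Parquet column pruning at read time rather than filtering after materialization.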
flyteplugins/spark/task.py
CHANGED
```diff
@@ -4,13 +4,15 @@ import tempfile
 from dataclasses import dataclass
 from typing import Any, Dict, Optional
 
+import flyte
 from flyte import PodTemplate
-from flyte._initialize import is_initialized
 from flyte.extend import AsyncFunctionTaskTemplate, TaskPluginRegistry
 from flyte.models import SerializationContext
-from
+from flyteidl2.plugins.spark_pb2 import SparkApplication, SparkJob
 from google.protobuf.json_format import MessageToDict
 
+DEFAULT_SPARK_CONTEXT_NAME = "FlyteSpark"
+
 
 @dataclass
 class Spark(object):
@@ -50,18 +52,22 @@ class PysparkFunctionTask(AsyncFunctionTaskTemplate):
 
     plugin_config: Spark
     task_type: str = "spark"
+    debuggable: bool = True
 
     async def pre(self, *args, **kwargs) -> Dict[str, Any]:
         import pyspark as _pyspark
 
-        sess = _pyspark.sql.SparkSession.builder.appName(
+        sess = _pyspark.sql.SparkSession.builder.appName(DEFAULT_SPARK_CONTEXT_NAME).getOrCreate()
 
-        if
+        if flyte.ctx().is_in_cluster():
             base_dir = tempfile.mkdtemp()
-
+            code_bundle_dir = flyte.ctx().code_bundle.destination
+            file_name = "flyte_code_bundle"
             file_format = "zip"
-
-
+            file_path = f"{base_dir}/{file_name}.{file_format}"
+            if not os.path.exists(file_path):
+                shutil.make_archive(f"{base_dir}/{file_name}", file_format, code_bundle_dir)
+            sess.sparkContext.addPyFile(file_path)
 
         return {"spark_session": sess}
 
@@ -72,8 +78,8 @@ class PysparkFunctionTask(AsyncFunctionTaskTemplate):
         job = SparkJob(
             sparkConf=self.plugin_config.spark_conf,
             hadoopConf=self.plugin_config.hadoop_conf,
-            mainApplicationFile=self.plugin_config.applications_path,
-            executorPath=self.plugin_config.executor_path,
+            mainApplicationFile=self.plugin_config.applications_path or "local://" + sctx.get_entrypoint_path(),
+            executorPath=self.plugin_config.executor_path or sctx.interpreter_path,
             mainClass="",
             applicationType=SparkApplication.PYTHON,
             driverPod=driver_pod,
@@ -82,5 +88,17 @@ class PysparkFunctionTask(AsyncFunctionTaskTemplate):
 
         return MessageToDict(job)
 
+    async def post(self, return_vals: Any) -> Any:
+        ctx = flyte.ctx()
+        if ctx and ctx.action.name == "a0":
+            # Only stop the SparkSession if it was created by the parent task in the debug mode.
+            # This is to make sure that the SparkSession is stopped by
+            # parent action only when debugging in the interactive mode.
+            # Note: The action name is always "a0" in the debug mode.
+            import pyspark as _pyspark
+
+            sess = _pyspark.sql.SparkSession.builder.appName(DEFAULT_SPARK_CONTEXT_NAME).getOrCreate()
+            sess.stop()
+
 
 TaskPluginRegistry.register(Spark, PysparkFunctionTask)
```
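In these hooks, `pre` now creates (or reuses) a session named by `DEFAULT_SPARK_CONTEXT_NAME` and, when running in-cluster, zips the code bundle and ships it to executors via `sparkContext.addPyFile`, while `post` stops the session only for the root action `a0` during interactive debugging; the serialized `SparkJob` also falls back to the task's own entrypoint (`local://...`) and interpreter path when `applications_path`/`executor_path` are unset. A hedged sketch of declaring a task against this plugin: the `Spark` dataclass and its fields come from the diff above, while the `TaskEnvironment(plugin_config=...)` wiring and the conf values are assumptions, not taken from this package:

```python
# Sketch (assumed wiring): a task executed through PysparkFunctionTask. The
# Spark dataclass comes from the diff; plugin_config on TaskEnvironment and
# the conf values are assumptions for illustration.
import flyte
from flyteplugins.spark.task import Spark

env = flyte.TaskEnvironment(
    name="spark-env",
    plugin_config=Spark(
        spark_conf={
            "spark.executor.instances": "2",  # illustrative values
            "spark.executor.memory": "1g",
        },
        # applications_path / executor_path can stay unset: task.py now falls
        # back to "local://" + the entrypoint path and to the interpreter path
        # from the serialization context.
    ),
)


@env.task
async def estimate_pi(partitions: int = 4) -> float:
    import random

    import pyspark

    # `pre` has already created the "FlyteSpark" session and, in-cluster,
    # shipped the zipped code bundle to executors via addPyFile.
    spark = pyspark.sql.SparkSession.builder.getOrCreate()
    samples = 100_000 * partitions

    def hit(_: int) -> int:
        x, y = random.random(), random.random()
        return 1 if x * x + y * y <= 1.0 else 0

    inside = spark.sparkContext.parallelize(range(samples), partitions).map(hit).sum()
    return 4.0 * inside / samples
```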
{flyteplugins_spark-2.0.0b6.dist-info → flyteplugins_spark-2.0.0b44.dist-info}/METADATA
RENAMED

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: flyteplugins-spark
-Version: 2.0.0b6
+Version: 2.0.0b44
 Summary: Spark plugin for flyte
 Author-email: Kevin Su <pingsutw@users.noreply.github.com>
 Requires-Python: >=3.10
@@ -15,5 +15,5 @@ Union can execute Spark jobs natively on a Kubernetes Cluster, which manages a v
 To install the plugin, run the following command:
 
 ```bash
-pip install flyteplugins-spark
+pip install --pre flyteplugins-spark
 ```
````
flyteplugins_spark-2.0.0b44.dist-info/RECORD
ADDED

```diff
@@ -0,0 +1,7 @@
+flyteplugins/spark/__init__.py,sha256=UDl9xv0gAEATqXL7YerclxvJfcPhL10DkFVavM9nuFg,204
+flyteplugins/spark/df_transformer.py,sha256=u4QLAVVvKTUpK7CaOfXzwHnFSJnj6DuLhRjTeB_NG6Q,2164
+flyteplugins/spark/task.py,sha256=wrdWiuXyVWl_FwE0uxqk3td8gdMe_yrh2CYL3zE7Rug,4074
+flyteplugins_spark-2.0.0b44.dist-info/METADATA,sha256=jP5YAkPsfCq5x4YOTf0gfJCysiOJtI6hCSHIbzKuNxs,763
+flyteplugins_spark-2.0.0b44.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+flyteplugins_spark-2.0.0b44.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
+flyteplugins_spark-2.0.0b44.dist-info/RECORD,,
```
flyteplugins_spark-2.0.0b6.dist-info/RECORD
DELETED

```diff
@@ -1,6 +0,0 @@
-flyteplugins/spark/__init__.py,sha256=7q06oXtVhOuNW4Nv-YhJvt8ORqhXmMPNYa-m4IkougI,63
-flyteplugins/spark/task.py,sha256=3fnw8QzcROAqmSAZIJLPc90Q-AvENV1fVdHChMFWdhs,3174
-flyteplugins_spark-2.0.0b6.dist-info/METADATA,sha256=k4BLnNiC7UT5kAbwNubiFt89xcZg4W58KM2qY3lRv7M,756
-flyteplugins_spark-2.0.0b6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-flyteplugins_spark-2.0.0b6.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
-flyteplugins_spark-2.0.0b6.dist-info/RECORD,,
```
{flyteplugins_spark-2.0.0b6.dist-info → flyteplugins_spark-2.0.0b44.dist-info}/WHEEL
RENAMED
File without changes

{flyteplugins_spark-2.0.0b6.dist-info → flyteplugins_spark-2.0.0b44.dist-info}/top_level.txt
RENAMED
File without changes