flyteplugins_spark-0.2.0b30-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of flyteplugins-spark might be problematic (see the registry listing for details).
flyteplugins/spark/task.py
@@ -0,0 +1,86 @@
+import os
+import shutil
+import tempfile
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+from flyte import PodTemplate
+from flyte._initialize import is_initialized
+from flyte.extend import AsyncFunctionTaskTemplate, TaskPluginRegistry
+from flyte.models import SerializationContext
+from flyteidl.plugins.spark_pb2 import SparkApplication, SparkJob
+from google.protobuf.json_format import MessageToDict
+
+
+@dataclass
+class Spark(object):
+    """
+    Use this to configure a SparkContext for your task. Tasks marked with this will automatically execute
+    natively on K8s as a distributed Spark execution.
+
+    Attributes:
+        spark_conf (Optional[Dict[str, str]]): Spark configuration dictionary.
+        hadoop_conf (Optional[Dict[str, str]]): Hadoop configuration dictionary.
+        executor_path (Optional[str]): Path to the Python binary for PySpark execution.
+        applications_path (Optional[str]): Path to the main application file.
+        driver_pod (Optional[PodTemplate]): Pod template for the driver pod.
+        executor_pod (Optional[PodTemplate]): Pod template for the executor pods.
+    """
+
+    spark_conf: Optional[Dict[str, str]] = None
+    hadoop_conf: Optional[Dict[str, str]] = None
+    executor_path: Optional[str] = None
+    applications_path: Optional[str] = None
+    driver_pod: Optional[PodTemplate] = None
+    executor_pod: Optional[PodTemplate] = None
+
+    def __post_init__(self):
+        if self.spark_conf is None:
+            self.spark_conf = {}
+
+        if self.hadoop_conf is None:
+            self.hadoop_conf = {}
+
+
+@dataclass(kw_only=True)
+class PysparkFunctionTask(AsyncFunctionTaskTemplate):
+    """
+    Actual plugin that transforms the local Python code for execution within a Spark context.
+    """
+
+    plugin_config: Spark
+    task_type: str = "spark"
+
+    async def pre(self, *args, **kwargs) -> Dict[str, Any]:
+        import pyspark as _pyspark
+
+        sess = _pyspark.sql.SparkSession.builder.appName("FlyteSpark").getOrCreate()
+
+        if is_initialized():
+            base_dir = tempfile.mkdtemp()
+            file_name = "flyte_wf"
+            file_format = "zip"
+            shutil.make_archive(f"{base_dir}/{file_name}", file_format, os.getcwd())
+            sess.sparkContext.addPyFile(f"{base_dir}/{file_name}.{file_format}")
+
+        return {"spark_session": sess}
+
+    def custom_config(self, sctx: SerializationContext) -> Dict[str, Any]:
+        driver_pod = self.plugin_config.driver_pod.to_k8s_pod() if self.plugin_config.driver_pod else None
+        executor_pod = self.plugin_config.executor_pod.to_k8s_pod() if self.plugin_config.executor_pod else None
+
+        job = SparkJob(
+            sparkConf=self.plugin_config.spark_conf,
+            hadoopConf=self.plugin_config.hadoop_conf,
+            mainApplicationFile=self.plugin_config.applications_path,
+            executorPath=self.plugin_config.executor_path,
+            mainClass="",
+            applicationType=SparkApplication.PYTHON,
+            driverPod=driver_pod,
+            executorPod=executor_pod,
+        )
+
+        return MessageToDict(job)
+
+
+TaskPluginRegistry.register(Spark, PysparkFunctionTask)
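For orientation, here is a minimal usage sketch of the plugin config defined above. The `Spark` field names and the `TaskPluginRegistry.register(Spark, PysparkFunctionTask)` dispatch are taken from `flyteplugins/spark/task.py`; the specific property values and how the config gets attached to a task in the host flyte SDK are illustrative assumptions, not part of this package.

```python
# Illustrative sketch only: field names come from the Spark dataclass in
# flyteplugins/spark/task.py; the property values below are example assumptions.
from flyteplugins.spark.task import Spark

spark_config = Spark(
    spark_conf={
        # standard Spark properties sizing the transient cluster
        "spark.executor.instances": "2",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1",
        "spark.driver.memory": "1g",
    },
    hadoop_conf={},                     # optional Hadoop settings, defaults to {}
    executor_path="/usr/bin/python3",   # Python binary used by PySpark executors
)

# Because the module calls TaskPluginRegistry.register(Spark, PysparkFunctionTask),
# a task configured with a Spark instance is handled by PysparkFunctionTask:
# pre() opens a SparkSession and ships the working directory as a zip via addPyFile,
# and custom_config() serializes this config into a SparkJob protobuf.
```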
flyteplugins_spark-0.2.0b30.dist-info/METADATA
@@ -0,0 +1,18 @@
+Metadata-Version: 2.4
+Name: flyteplugins-spark
+Version: 0.2.0b30
+Summary: Spark plugin for flyte
+Author-email: Kevin Su <pingsutw@users.noreply.github.com>
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: pyspark
+
+# Union Spark Plugin
+
+Union can execute Spark jobs natively on a Kubernetes cluster, managing the virtual cluster's lifecycle, spin-up, and tear-down. It leverages the open-source Spark On K8s Operator and can be enabled without signing up for any service. This is like running a transient Spark cluster: a cluster spun up for a specific Spark job and torn down after completion.
+
+To install the plugin, run the following command:
+
+```bash
+pip install flyteplugins-spark
+```
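Once installed, `PysparkFunctionTask.custom_config` (see `task.py` above) turns the `Spark` settings into a `SparkJob` protobuf that is passed to the backend as a dict. Below is a minimal sketch of that serialization using only the imports that appear in `task.py`; the concrete values are placeholders.

```python
# Sketch of the serialization performed by PysparkFunctionTask.custom_config();
# the values are placeholders, only the field names mirror task.py.
from flyteidl.plugins.spark_pb2 import SparkApplication, SparkJob
from google.protobuf.json_format import MessageToDict

job = SparkJob(
    sparkConf={"spark.executor.instances": "2"},
    hadoopConf={},
    mainApplicationFile="",            # applications_path from the Spark config
    executorPath="/usr/bin/python3",   # executor_path from the Spark config
    mainClass="",
    applicationType=SparkApplication.PYTHON,
)

# MessageToDict omits default-valued fields, so the result is roughly:
# {'executorPath': '/usr/bin/python3', 'sparkConf': {'spark.executor.instances': '2'}}
print(MessageToDict(job))
```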
flyteplugins_spark-0.2.0b30.dist-info/RECORD
@@ -0,0 +1,6 @@
+flyteplugins/spark/__init__.py,sha256=7q06oXtVhOuNW4Nv-YhJvt8ORqhXmMPNYa-m4IkougI,63
+flyteplugins/spark/task.py,sha256=3fnw8QzcROAqmSAZIJLPc90Q-AvENV1fVdHChMFWdhs,3174
+flyteplugins_spark-0.2.0b30.dist-info/METADATA,sha256=PmUrpy-01jPgkUUYkcYhljrjbErp2CVrmeAfGviv4YU,736
+flyteplugins_spark-0.2.0b30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+flyteplugins_spark-0.2.0b30.dist-info/top_level.txt,sha256=cgd779rPu9EsvdtuYgUxNHHgElaQvPn74KhB5XSeMBE,13
+flyteplugins_spark-0.2.0b30.dist-info/RECORD,,
flyteplugins_spark-0.2.0b30.dist-info/top_level.txt
@@ -0,0 +1 @@
+flyteplugins