scalable-pypeline 1.2.3__py2.py3-none-any.whl → 2.0.1__py2.py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- pypeline/__init__.py +1 -1
- pypeline/barrier.py +34 -0
- pypeline/composition.py +348 -0
- pypeline/constants.py +51 -84
- pypeline/dramatiq.py +470 -0
- pypeline/extensions.py +9 -8
- pypeline/flask/__init__.py +3 -5
- pypeline/flask/api/pipelines.py +109 -148
- pypeline/flask/api/schedules.py +14 -39
- pypeline/flask/decorators.py +18 -53
- pypeline/flask/flask_pypeline.py +156 -0
- pypeline/middleware.py +61 -0
- pypeline/pipeline_config_schema.py +105 -92
- pypeline/pypeline_yaml.py +458 -0
- pypeline/schedule_config_schema.py +35 -120
- pypeline/utils/config_utils.py +52 -310
- pypeline/utils/module_utils.py +35 -71
- pypeline/utils/pipeline_utils.py +161 -0
- scalable_pypeline-2.0.1.dist-info/METADATA +217 -0
- scalable_pypeline-2.0.1.dist-info/RECORD +27 -0
- scalable_pypeline-2.0.1.dist-info/entry_points.txt +3 -0
- tests/fixtures/__init__.py +0 -1
- pypeline/celery.py +0 -206
- pypeline/celery_beat.py +0 -254
- pypeline/flask/api/utils.py +0 -35
- pypeline/flask/flask_sermos.py +0 -156
- pypeline/generators.py +0 -196
- pypeline/logging_config.py +0 -171
- pypeline/pipeline/__init__.py +0 -0
- pypeline/pipeline/chained_task.py +0 -70
- pypeline/pipeline/generator.py +0 -254
- pypeline/sermos_yaml.py +0 -442
- pypeline/utils/graph_utils.py +0 -144
- pypeline/utils/task_utils.py +0 -552
- scalable_pypeline-1.2.3.dist-info/METADATA +0 -163
- scalable_pypeline-1.2.3.dist-info/RECORD +0 -33
- scalable_pypeline-1.2.3.dist-info/entry_points.txt +0 -2
- tests/fixtures/s3_fixtures.py +0 -52
- {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.1.dist-info}/LICENSE +0 -0
- {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.1.dist-info}/WHEEL +0 -0
- {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.1.dist-info}/top_level.txt +0 -0
pypeline/utils/pipeline_utils.py
ADDED
@@ -0,0 +1,161 @@
+import logging
+import typing
+import networkx as nx
+from dramatiq import get_broker, Message
+from pypeline.composition import parallel_pipeline
+from pypeline.dramatiq import LazyActor, get_callable, register_lazy_actor
+from pypeline.utils.config_utils import retrieve_latest_pipeline_config
+
+T = typing.TypeVar("T")  # T can be any type
+
+
+logger = logging.getLogger(__name__)
+
+
+def get_execution_graph(
+    config: dict,
+    adjacency_key: str = "dagAdjacency",
+    task_definitions_key: str = "taskDefinitions",
+) -> nx.DiGraph:
+    """Generate a directed graph based on a pipeline config's adjacency list
+    and task definitions.
+
+    `dagAdjacency` is a dictionary containing all nodes and downstream
+    nodes.
+
+    `taskDefinitions` is a dictionary containing metadata required for
+    each node such as the worker, model version, etc. This metadata is
+    attached to each node so it can be retrieved directly from the graph.
+    """
+    G = nx.DiGraph()
+
+    # Get our adjacency list and task definitions
+    adjacency_dict = config.get(adjacency_key, {})
+    task_definitions = config.get(task_definitions_key, {})
+    if len(adjacency_dict.keys()) == 0:
+        logger.warning(
+            "Adjacency definition `{}` was not found ...".format(adjacency_key)
+        )
+
+    # Build the graph
+    for node in adjacency_dict.keys():
+        adjacent_nodes = adjacency_dict[node]
+
+        # If no adjacent nodes, then this is a terminal node
+        if len(adjacent_nodes) == 0:
+            G.add_node(node, attr_dict=task_definitions.get(node, {}))
+            continue
+
+        # Otherwise, we'll add an edge from this node to all adjacent nodes
+        # and add the task definition metadata to the edge
+        G.add_edges_from(
+            [(node, n, task_definitions.get(n, {})) for n in adjacent_nodes]
+        )
+    return G
+
+
+def process_non_none_value(value: T) -> None:
+    """
+    Processes a value that must not be None.
+
+    The function checks if the provided value is None and raises a ValueError if it is.
+    If the value is not None, it proceeds to process and print the value.
+
+    :param value: The value to process. Can be of any type, but must not be None.
+
+    :raises ValueError: If the value is None.
+
+    Example:
+        >>> process_non_none_value(42)
+        Processing value: 42
+
+        >>> process_non_none_value("hello")
+        Processing value: hello
+
+        >>> process_non_none_value(None)
+        Traceback (most recent call last):
+            ...
+        ValueError: None value is not allowed
+    """
+    if value is None:
+        raise ValueError("None value is not allowed")
+    # Process the value
+    print(f"Processing value: {value}")
+
+
+def topological_sort_with_parallelism(
+    graph: nx.DiGraph, executable_nodes=None
+) -> typing.List[typing.List[T]]:
+    """
+    Recurse over the graph to find an optimal execution strategy for processing nodes in an order where
+    no node shall be processed before all of its predecessors have been processed. The function handles
+    parallel execution by identifying nodes that can be processed in parallel at each step. If the graph
+    contains a cycle, the function will not be able to generate an execution plan and will raise an exception.
+
+    :param graph: A directed acyclic graph (DiGraph) from NetworkX.
+    :param executable_nodes: A list of lists where each inner list contains nodes that can be executed
+        in parallel at each step. This parameter is used for recursion.
+    :return: A list of lists where each inner list contains nodes that can be executed in parallel at each step.
+
+    >>> g = nx.DiGraph()
+    >>> g.add_edges_from([(1, 2), (1, 3), (2, 4), (3, 4)])
+    >>> topological_sort_with_parallelism(g)
+    [[1], [2, 3], [4]]
+
+    >>> g = nx.DiGraph()
+    >>> g.add_edges_from([(1, 2), (2, 3), (3, 4)])
+    >>> topological_sort_with_parallelism(g)
+    [[1], [2], [3], [4]]
+
+    >>> g = nx.DiGraph()
+    >>> g.add_edges_from([(1, 2), (2, 3), (3, 1)])
+    >>> topological_sort_with_parallelism(g)
+    Traceback (most recent call last):
+        ...
+    NetworkXUnfeasible: Graph contains a cycle, cannot compute a topological sort.
+    """
+    nodes = list(nx.topological_sort(graph))
+    round_executable_nodes = [n for n in nodes if graph.in_degree(n) == 0]
+    graph.remove_nodes_from(round_executable_nodes)
+
+    if len(round_executable_nodes) == 0:
+        return executable_nodes
+    else:
+        executable_nodes = [] if executable_nodes is None else executable_nodes
+        executable_nodes.append(round_executable_nodes)
+        return topological_sort_with_parallelism(graph, executable_nodes)
+
+
+def dag_generator(pipeline_id: str, *args, **kwargs):
+    pipeline_config = retrieve_latest_pipeline_config(pipeline_id=pipeline_id)["config"]
+    graph = get_execution_graph(pipeline_config)
+    optimal_execution_graph = topological_sort_with_parallelism(graph.copy())
+    broker = get_broker()
+
+    registered_actors: typing.Dict[str, LazyActor] = {}
+    broker.actors.clear()
+
+    messages: typing.List[typing.List[Message]] = []
+
+    task_definitions = pipeline_config["taskDefinitions"]
+    for task_group in optimal_execution_graph:
+        message_group = []
+        for task in task_group:
+            module_path = task_definitions[task]["handler"]
+            tmp_handler = get_callable(module_path)
+            lazy_actor = register_lazy_actor(
+                broker, tmp_handler, pipeline_config["metadata"]
+            )
+            registered_actors[task] = lazy_actor
+            if args and not kwargs:
+                message_group.append(registered_actors[task].message(*args))
+            elif kwargs and not args:
+                message_group.append(registered_actors[task].message(**kwargs))
+            elif args and kwargs:
+                message_group.append(registered_actors[task].message(*args, **kwargs))
+            else:
+                message_group.append(registered_actors[task].message())
+        messages.append(message_group)
+    p = parallel_pipeline(messages)

+    return p
scalable_pypeline-2.0.1.dist-info/METADATA
ADDED
@@ -0,0 +1,217 @@
+Metadata-Version: 2.1
+Name: scalable-pypeline
+Version: 2.0.1
+Summary: PypeLine - Python pipelines for the Real World
+Home-page: https://gitlab.com/bravos2/pypeline
+Author: Bravos Power Corporation
+License: Apache License 2.0
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: PyYAML (<7,>=6.0.1)
+Requires-Dist: click (==8.0.4)
+Requires-Dist: marshmallow (<4,>=3.2.1)
+Requires-Dist: redis (<5,>=4.5.4)
+Requires-Dist: db-medley[redis] (<2,>=1.0.2)
+Requires-Dist: croniter (<2,>=1.0.15)
+Provides-Extra: build
+Requires-Dist: wheel ; extra == 'build'
+Requires-Dist: twine ; extra == 'build'
+Provides-Extra: dev
+Requires-Dist: blackd ; extra == 'dev'
+Provides-Extra: flask
+Requires-Dist: Werkzeug (==2.0.3) ; extra == 'flask'
+Requires-Dist: itsdangerous (==2.0.1) ; extra == 'flask'
+Requires-Dist: Flask (<2,>=1.1.2) ; extra == 'flask'
+Requires-Dist: flask-smorest (<0.29,>=0.23.0) ; extra == 'flask'
+Requires-Dist: Jinja2 (==3.0.3) ; extra == 'flask'
+Provides-Extra: test
+Requires-Dist: pytest-cov (<3,>=2.6.1) ; extra == 'test'
+Requires-Dist: tox (<4,>=3.14.1) ; extra == 'test'
+Requires-Dist: mock (<2,>=1) ; extra == 'test'
+Requires-Dist: moto (<4,>=1.3.16) ; extra == 'test'
+Requires-Dist: responses (<0.11,>=0.10.16) ; extra == 'test'
+Requires-Dist: fakeredis (<3,>=2.10.3) ; extra == 'test'
+Requires-Dist: importlib-metadata (<5,>=4.12) ; extra == 'test'
+Provides-Extra: web
+Requires-Dist: gunicorn ; extra == 'web'
+Requires-Dist: gevent (<22,>=21.12.0) ; extra == 'web'
+Provides-Extra: workers
+Requires-Dist: networkx (>=2.4) ; extra == 'workers'
+Requires-Dist: dramatiq[rabbitmq] (<2,>=1.17.0) ; extra == 'workers'
+Requires-Dist: apscheduler (<4,>=3.10.4) ; extra == 'workers'
+
+```
+ ______ __ ________ _____ _ _____ _ _ _____
+| ___ \\ \ / /| ___ \| ___|| | |_ _|| \ | || ___|
+| |_/ / \ V / | |_/ /| |__ | | | | | \| || |__
+| __/ \ / | __/ | __| | | | | | . ` || __|
+| | | | | | | |___ | |_____| |_ | |\ || |___
+\_| \_/ \_| \____/ \_____/\___/ \_| \_/\____/
+```
+
+## Overview
+
+PypeLine is a versatile open-source library designed to streamline the management of data workflows and APIs. With PypeLine, you can efficiently schedule cron jobs, execute complex Directed Acyclical Graph (DAG) pipelines, and set up a Flask API complete with OpenAPI documentation.
+
+#### Key Features
+- Cron Job Scheduling: Easily schedule recurring tasks with flexible cron job functionality, ensuring that your processes run reliably at specified intervals.
+- DAG Pipelines: Define and execute DAGs to manage complex data workflows with dependencies. PypeLine handles the execution order and parallelism, ensuring that each task runs in the correct sequence.
+- Flask API with OpenAPI: Quickly configure a RESTful API using Flask, with built-in support for OpenAPI documentation, allowing for clear, standardized documentation of your endpoints.
+
+## Requirements
+
+- RabbitMQ
+- Redis
+- Docker (optional for dev)
+
+## Getting Started
+
+Install PypeLines:
+
+```commandline
+pip install scalable-pypeline[flask,web,workers]>=1.2.3
+```
+
+Configure your Flask project (app.py)
+
+```python
+from flask import Flask
+from pypeline.flask import FlaskPypeline
+from pypeline_demo.api import bp
+from pypeline_demo.config import Config
+from pypeline_demo.extensions import dramatiq
+
+
+
+def create_app():
+    app = Flask(__name__)
+
+    dramatiq.init_app(app)
+
+    # Initialize your app with a configuration
+    app.config.from_object(Config)
+
+    pypeline = FlaskPypeline()
+    pypeline.init_app(app, init_api=True)
+
+    # Register API blueprints you wish
+    app.extensions["pypeline_core_api"].register_blueprint(bp)
+    # Register application blueprints to application
+    app.register_blueprint(bp)
+
+    return app
+
+
+if __name__ == "__main__":
+    app = create_app()
+    app.run(port=5001)
+```
+
+Configure Dramatiq extension (extensions.py)
+
+```python
+from pypeline.dramatiq import Dramatiq
+
+
+dramatiq = Dramatiq()
+```
+
+Setup your yaml configuration for pypelines (pypeline.yaml)
+
+```yaml
+serviceConfig:
+  - name: pipeline-worker
+    registeredTasks:
+      - handler: pypeline_demo.pipeline.a
+      - handler: pypeline_demo.pipeline.b
+      - handler: pypeline_demo.pipeline.c
+      - handler: pypeline_demo.scheduled_tasks.cron_task
+
+pipelines:
+  demo_pipeline:
+    name: Demo Pipeline
+    description: Pipeline to show examples of DAG Adjacency
+    schemaVersion: 1
+    config:
+      dagAdjacency:
+        a:
+          - b
+          - c
+      metadata:
+        maxRetry: 1
+        retryBackoff: 180
+        retryBackoffMax: 300
+        retryJitter: true
+        maxTtl: 10800
+        queue: new-queue
+      taskDefinitions:
+        a:
+          handler: pypeline_demo.pipeline.a
+        b:
+          handler: pypeline_demo.pipeline.b
+        c:
+          handler: pypeline_demo.pipeline.c
+scheduledTasks:
+  cron-task:
+    name: Example cron task
+    enabled: true
+    config:
+      task: pypeline_demo.scheduled_tasks.cron_task
+      queue: new-queue
+      schedule:
+        minute: '*'
+        hour: '*'
+        dayOfWeek: '*'
+        dayOfMonth: '*'
+        monthOfYear: '*'
+    schemaVersion: 1
+```
+
+Setup your modules to be executed by yaml (pipeline.py && scheduled_tasks.py)
+
+```python
+import time
+
+
+def a(event):
+    print("A")
+
+
+def b(event):
+    print("B")
+    time.sleep(10)
+
+
+def c(event):
+    print("C")
+```
+
+```python
+def cron_task():
+    print("HI")
+```
+
+Configure your environment variables (demo.env)
+
+```env
+SERMOS_BASE_URL=local
+PYPELINE_CLIENT_PKG_NAME=pypeline_demo
+REDIS_URL=redis://:password@localhost:6379/0
+RABBITMQ_URL=amqp://admin:password@localhost:5672
+```
+
+Start Rabbit & Redis as your message broker and backend results storage. We use `docker compose` for this.
+
+## DEMO PROJECT COMING SOON!
+
+
+## Testing
+
+If you are developing pypeline and want to test this package,
+install the test dependencies:
+
+    $ pip install -e .[test]
+
+Now, run the tests:
+
+    $ tox
scalable_pypeline-2.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,27 @@
+pypeline/__init__.py,sha256=wAxkK8w13vqoF47A8iqWdSlIgRRXmZiQ0R4wePZfzhs,22
+pypeline/barrier.py,sha256=dLDaprH5NB-C7MQjZqPpBBhMjmO0VV_kTonlgweznHc,1096
+pypeline/composition.py,sha256=s0p_KdD-UheyFe2yPG8GJ94txq9C8fPDU58UcLfal6Q,12955
+pypeline/constants.py,sha256=vi4UZz1xd0ZeIuelp4QgCQsMlIHW65-lVB8l_iA8kBE,2578
+pypeline/dramatiq.py,sha256=jr9WORqtusC1gnbvF59CNXny8ORk2Lmlbmf1qsbiLXo,14799
+pypeline/extensions.py,sha256=BzOTnXhNxap3N7uIUUh_hO6dDwx08Vc_RJDE93_K0Lo,610
+pypeline/middleware.py,sha256=6vWNCoRVqnASJ40CAOEpe0JcYRpB6BTkLz8E51q4z2Y,2756
+pypeline/pipeline_config_schema.py,sha256=DQ_RMucnA0AyrndlW6lkb0orGromcO6C9GgLHyG6lJ0,8013
+pypeline/pypeline_yaml.py,sha256=Og08sUKwOjq7JYPnkg-NIcGbHravYCkC5Arz22rZEtA,16981
+pypeline/schedule_config_schema.py,sha256=vtZV-5wpGcAiYcXxdBPRkrjsbR6x_9E-1PC2elrKKbE,3611
+pypeline/flask/__init__.py,sha256=AdljRh0lMiS8ExgDmgzObwVs8jW7hqQuf83Ml8kn8GQ,491
+pypeline/flask/decorators.py,sha256=ki6jkjZwbDbCWuj7ET7N-ncZwrASp4Fy7257WIYiAAQ,1102
+pypeline/flask/flask_pypeline.py,sha256=Uqyu3PnSP3DoVZUJPqV9chjT4xdRgvcL3OMXxkbdTEg,5490
+pypeline/flask/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pypeline/flask/api/pipelines.py,sha256=sPvEoNwmnJPSA96lZHYS2fwKqZlVyE2OSjUmOPFi91o,7267
+pypeline/flask/api/schedules.py,sha256=31lwoFlGv-S-2ahGUCnD5YbmKws8yddj6_PEzzdBi9s,1321
+pypeline/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pypeline/utils/config_utils.py,sha256=rAIATyoW7kGETZ_Z2DqiXtGd7bJp5uPfcLtfNPOYsNs,2167
+pypeline/utils/module_utils.py,sha256=boEP9IYr4p_ick7HlVUfIxOYHQlEmo7dgvDBCQc-C28,2914
+pypeline/utils/pipeline_utils.py,sha256=dewzkMajs7uyPHyHjJfISA9pc2-1J5A99Hm4XqNw5qM,6031
+tests/fixtures/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scalable_pypeline-2.0.1.dist-info/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
+scalable_pypeline-2.0.1.dist-info/METADATA,sha256=4ozytitQKStqd6jqYr1ALmorfspjit-OTgi1ye_jvpk,6239
+scalable_pypeline-2.0.1.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
+scalable_pypeline-2.0.1.dist-info/entry_points.txt,sha256=uWs10ODfHSBKo2Cx_QaUjPHQTpZ3e77j9VlAdRRmMyg,119
+scalable_pypeline-2.0.1.dist-info/top_level.txt,sha256=C7dpkEOc_-nnsAQb28BfQknjD6XHRyS9ZrvVeoIbV7s,15
+scalable_pypeline-2.0.1.dist-info/RECORD,,
tests/fixtures/__init__.py
CHANGED
@@ -1 +0,0 @@
-from tests.fixtures.s3_fixtures import *
pypeline/celery.py
DELETED
@@ -1,206 +0,0 @@
-""" Configure and instantiate Celery
-"""
-import os
-
-if os.environ.get('USE_GEVENT', "False").lower() == 'true':
-    from gevent import monkey
-    monkey.patch_all()
-
-import sys
-import logging
-from pypeline.pipeline.chained_task import ChainedTask
-from celery_dyrygent.tasks import register_workflow_processor
-from typing import List
-from celery import Celery
-from pypeline.logging_config import setup_logging
-from pypeline.utils.module_utils import SermosModuleLoader
-from pypeline.utils.task_utils import PipelineResult, \
-    get_service_config_for_worker
-from pypeline.extensions import sermos_config, sermos_client_version
-from pypeline import __version__
-
-logger = logging.getLogger('celery')
-ENABLE_TOOLS = str(os.environ.get('ENABLE_TOOLS', 'false')).lower() == 'true'
-CELERY_TASKS_ACK_LATE = str(os.environ.get('CELERY_TASKS_ACK_LATE', 'false')).lower() == 'true'
-LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
-OVERLOAD_ES = os.environ.get('ENV', 'production').lower() == 'production'
-PIPELINE_CHORD_COMPRESSION = os.environ.get('PIPELINE_CHORD_COMPRESSION', None)
-
-setup_logging(app_version=__version__,
-              client_version=sermos_client_version,
-              default_level=LOG_LEVEL,
-              overload_elasticsearch=OVERLOAD_ES,
-              establish_logging_config=True)
-
-def task_chain_regulator(*args, **kwargs):
-    """ Utility task to ensure celery properly waits between groups in a chain.
-
-    For a chain(), if each element is a group() then celery does not
-    properly adhere to the chain elements occurring sequentially. If you
-    insert a task that is not a group() in between, though, then the
-    chain operates as expected.
-    """
-    return True
-
-
-def pipeline_success(event: dict):
-    """ Utility task to ensure celery properly waits between groups in a chain.
-
-    For a chain(), if each element is a group() then celery does not
-    properly adhere to the chain elements occurring sequentially. If you
-    insert a task that is not a group() in between, though, then the
-    chain operates as expected.
-    """
-    pr = PipelineResult(event['execution_id'])
-    pr.load()
-    pr.save(status='success')
-
-
-class GenerateCeleryTasks(SermosModuleLoader):
-    """ Use the sermos.yaml configuration to turn customer methods into
-        decorated celery tasks that are available for work/pipelines
-    """
-    def __init__(self, config: dict, celery_instance: Celery):
-        super(GenerateCeleryTasks, self).__init__()
-        self.config = config if config else {}
-        self.celery = celery_instance
-
-    def _get_default_tasks(self) -> List[dict]:
-        """ Sermos provides default tasks that all workers should know about.
-        """
-        return [{
-            'handler': 'pypeline.celery.task_chain_regulator'
-        }, {
-            'handler': 'pypeline.celery.pipeline_success'
-        }]
-
-    def generate(self):
-        """ Loads methods based on sermos config file and decorates them as
-            celery tasks.
-
-            Customer's methods:
-            --------------------------------
-            def demo_task(*args, **kwargs):
-                return True
-
-            Turns into the equivallent of:
-            --------------------------------
-            @celery.task(queue='queue-name')
-            def demo_task(*args, **kwargs):t
-                return True
-        """
-        # Set in k8s deployment as an environment variable when Sermos Cloud
-        # generates the final secrets.yaml file. The name comes from the user's
-        # sermos.yaml file based on serviceConfig[].name. Each 'worker' will
-        # have a single name and each individually registers tasks through its
-        # registeredTasks list. This allows each worker to only attempt
-        # bootstrapping those tasks that are relevant to the worker and not, for
-        # example, attempt to import a package that's not used by this worker
-        service = get_service_config_for_worker(self.config)
-        if not service:
-            return
-        for task in service.get('registeredTasks', []):
-            pipeline_meta = None
-            for pipeline_key, pipeline in sermos_config['pipelines'].items():
-                pipeline_config = pipeline["config"]
-                pipeline_tasks = [t["handler"] for t in pipeline_config["taskDefinitions"].values()]
-                if task["handler"] in pipeline_tasks:
-                    pipeline_meta = pipeline_config["metadata"]
-                    break
-
-            try:
-                worker_path = task['handler']  # Required, no default
-
-                tmp_handler = self.get_callable(worker_path)
-
-                # Decorate the method as a celery task along with a default
-                # queue if provided in config. Set ChainedTask as the base
-                # which allows chained tasks to pass kwargs correctly.
-                if pipeline_meta and pipeline_meta["maxRetry"] > 0:
-                    tmp_handler = self.celery.task(
-                        tmp_handler,
-                        autoretry_for=(Exception,),
-                        max_retries=pipeline_meta["maxRetry"],
-                        retry_backoff=pipeline_meta["retryBackoff"],
-                        retry_jitter=pipeline_meta["retryJitter"],
-                        retry_backoff_max=pipeline_meta["retryBackoffMax"]
-                    )
-                else:
-                    tmp_handler = self.celery.task(tmp_handler)
-            except Exception as e:
-                logger.warning(f"Unable to add a task to celery: {e}")
-        # Sermos provides default tasks that all workers should know about, add
-        # them here.
-        for task in self._get_default_tasks():
-            tmp_handler = self.get_callable(task['handler'])
-            tmp_handler = self.celery.task(tmp_handler)
-
-
-def configure_celery(celery: Celery):
-    """ Configure Sermos-compatible Celery instance. Primarily this means
-        compatibility with Pipelines and Scheduled Tasks through injecting the
-        event kwarg. Also sets prebaked defaults that can be overloaded by user.
-    """
-    REDIS_URL = os.environ.get('REDIS_URL', 'redis://localhost:6379/0')
-    CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_URL', REDIS_URL)
-    CELERY_RESULT_BACKEND = os.environ.get('CELERY_RESULT_BACKEND', REDIS_URL)
-
-    celery.Task = ChainedTask
-
-    # Configure the broker and tasks
-    celery.conf.broker_url = CELERY_BROKER_URL
-
-    # Use our custom database scheduler for dynamic celery beat updates.
-    celery.conf.beat_scheduler =\
-        'pypeline.celery_beat:SermosScheduler'
-
-    # Reasonable defaults, override as necessary
-    celery.conf.worker_redirect_stdouts = True
-    celery.conf.worker_redirect_stdouts_level = LOG_LEVEL
-    celery.conf.worker_hijack_root_logger = False
-
-    if PIPELINE_CHORD_COMPRESSION:
-        celery.conf.task_compression = PIPELINE_CHORD_COMPRESSION
-
-    # NOTE: The broker URL may not be the best result backend. For example,
-    # When using Rabbit as the broker (recommended), you should use Redis
-    # as the result backend, as Rabbit has horrible support as backend.
-    celery.conf.result_backend = CELERY_RESULT_BACKEND
-    celery.conf.task_ignore_result = False  # Must not ignore for Chords
-    celery.conf.result_expires = int(
-        os.environ.get('CELERY_RESULT_EXPIRES', 10800))  # 3 hours by default
-    celery.conf.broker_pool_limit = int(os.environ.get('BROKER_POOL_LIMIT',
-                                                       10))
-    celery.conf.worker_max_tasks_per_child = int(
-        os.environ.get('MAX_TASKS_PER_CHILD', 100))
-    celery.conf.task_soft_time_limit =\
-        int(os.environ.get('TASK_TIMEOUT_SECONDS', 3600))
-    celery.conf.task_time_limit =\
-        int(os.environ.get('TASK_TIMEOUT_SECONDS', 3600)) + 10  # Cleanup buffer
-    celery.conf.task_acks_late = CELERY_TASKS_ACK_LATE
-    celery.conf.task_serializer = 'json'
-    celery.conf.result_serializer = 'json'
-    celery.conf.accept_content = ['json']
-    # Required config options for some brokers we use frequently.
-    transport_options = {}
-    celery.conf.broker_transport_options = transport_options
-
-    # Sermos generally has long-running tasks (relatively speaking), so
-    # limit number of jobs a worker can reserve. This may not be true for
-    # all tasks, so configure this on a per application basis. In the event
-    # mutltiple task kinds exist in an application (short and long), see
-    # http://docs.celeryproject.org/en/latest/userguide/optimizing.html#optimizing-prefetch-limit
-    # for some guidance on combining multiple workers and routing tasks.
-    # TODO make configurable from env
-    celery.conf.worker_prefetch_multiplier = 1
-
-    # Add our application's workers & any other tasks to be made
-    # available
-    register_workflow_processor(celery)
-    try:
-        GenerateCeleryTasks(sermos_config, celery).generate()
-    except Exception as e:
-        logger.error(f"Unable to dynamically generate celery tasks: {e}")
-        sys.exit(1)
-
-    return celery