scalable-pypeline 1.2.2__py2.py3-none-any.whl → 2.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. pypeline/__init__.py +1 -1
  2. pypeline/barrier.py +34 -0
  3. pypeline/composition.py +348 -0
  4. pypeline/constants.py +51 -84
  5. pypeline/dramatiq.py +470 -0
  6. pypeline/extensions.py +9 -8
  7. pypeline/flask/__init__.py +3 -5
  8. pypeline/flask/api/pipelines.py +109 -148
  9. pypeline/flask/api/schedules.py +14 -39
  10. pypeline/flask/decorators.py +18 -53
  11. pypeline/flask/flask_pypeline.py +156 -0
  12. pypeline/middleware.py +61 -0
  13. pypeline/pipeline_config_schema.py +104 -91
  14. pypeline/pypeline_yaml.py +458 -0
  15. pypeline/schedule_config_schema.py +35 -120
  16. pypeline/utils/config_utils.py +52 -310
  17. pypeline/utils/module_utils.py +35 -71
  18. pypeline/utils/pipeline_utils.py +161 -0
  19. scalable_pypeline-2.0.1.dist-info/METADATA +217 -0
  20. scalable_pypeline-2.0.1.dist-info/RECORD +27 -0
  21. scalable_pypeline-2.0.1.dist-info/entry_points.txt +3 -0
  22. tests/fixtures/__init__.py +0 -1
  23. pypeline/celery.py +0 -206
  24. pypeline/celery_beat.py +0 -254
  25. pypeline/flask/api/utils.py +0 -35
  26. pypeline/flask/flask_sermos.py +0 -156
  27. pypeline/generators.py +0 -196
  28. pypeline/logging_config.py +0 -171
  29. pypeline/pipeline/__init__.py +0 -0
  30. pypeline/pipeline/chained_task.py +0 -70
  31. pypeline/pipeline/generator.py +0 -254
  32. pypeline/sermos_yaml.py +0 -442
  33. pypeline/utils/graph_utils.py +0 -144
  34. pypeline/utils/task_utils.py +0 -552
  35. scalable_pypeline-1.2.2.dist-info/METADATA +0 -163
  36. scalable_pypeline-1.2.2.dist-info/RECORD +0 -33
  37. scalable_pypeline-1.2.2.dist-info/entry_points.txt +0 -2
  38. tests/fixtures/s3_fixtures.py +0 -52
  39. {scalable_pypeline-1.2.2.dist-info → scalable_pypeline-2.0.1.dist-info}/LICENSE +0 -0
  40. {scalable_pypeline-1.2.2.dist-info → scalable_pypeline-2.0.1.dist-info}/WHEEL +0 -0
  41. {scalable_pypeline-1.2.2.dist-info → scalable_pypeline-2.0.1.dist-info}/top_level.txt +0 -0
pypeline/utils/pipeline_utils.py ADDED
@@ -0,0 +1,161 @@
+ import logging
+ import typing
+ import networkx as nx
+ from dramatiq import get_broker, Message
+ from pypeline.composition import parallel_pipeline
+ from pypeline.dramatiq import LazyActor, get_callable, register_lazy_actor
+ from pypeline.utils.config_utils import retrieve_latest_pipeline_config
+
+ T = typing.TypeVar("T")  # T can be any type
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_execution_graph(
+     config: dict,
+     adjacency_key: str = "dagAdjacency",
+     task_definitions_key: str = "taskDefinitions",
+ ) -> nx.DiGraph:
+     """Generate a directed graph based on a pipeline config's adjacency list
+     and task definitions.
+
+     `dagAdjacency` is a dictionary containing all nodes and their downstream
+     nodes.
+
+     `taskDefinitions` is a dictionary containing metadata required for
+     each node such as the worker, model version, etc. This metadata is
+     attached to each node so it can be retrieved directly from the graph.
+     """
+     G = nx.DiGraph()
+
+     # Get our adjacency list and task definitions
+     adjacency_dict = config.get(adjacency_key, {})
+     task_definitions = config.get(task_definitions_key, {})
+     if len(adjacency_dict.keys()) == 0:
+         logger.warning(
+             "Adjacency definition `{}` was not found ...".format(adjacency_key)
+         )
+
+     # Build the graph
+     for node in adjacency_dict.keys():
+         adjacent_nodes = adjacency_dict[node]
+
+         # If no adjacent nodes, then this is a terminal node
+         if len(adjacent_nodes) == 0:
+             G.add_node(node, attr_dict=task_definitions.get(node, {}))
+             continue
+
+         # Otherwise, we'll add an edge from this node to all adjacent nodes
+         # and add the task definition metadata to the edge
+         G.add_edges_from(
+             [(node, n, task_definitions.get(n, {})) for n in adjacent_nodes]
+         )
+     return G
+
+
+ def process_non_none_value(value: T) -> None:
+     """
+     Processes a value that must not be None.
+
+     The function checks if the provided value is None and raises a ValueError if it is.
+     If the value is not None, it proceeds to process and print the value.
+
+     :param value: The value to process. Can be of any type, but must not be None.
+
+     :raises ValueError: If the value is None.
+
+     Example:
+         >>> process_non_none_value(42)
+         Processing value: 42
+
+         >>> process_non_none_value("hello")
+         Processing value: hello
+
+         >>> process_non_none_value(None)
+         Traceback (most recent call last):
+             ...
+         ValueError: None value is not allowed
+     """
+     if value is None:
+         raise ValueError("None value is not allowed")
+     # Process the value
+     print(f"Processing value: {value}")
+
+
+ def topological_sort_with_parallelism(
+     graph: nx.DiGraph, executable_nodes=None
+ ) -> typing.List[typing.List[T]]:
+     """
+     Recurse over the graph to find an optimal execution strategy for processing nodes in an order where
+     no node shall be processed before all of its predecessors have been processed. The function handles
+     parallel execution by identifying nodes that can be processed in parallel at each step. If the graph
+     contains a cycle, the function will not be able to generate an execution plan and will raise an exception.
+
+     :param graph: A directed acyclic graph (DiGraph) from NetworkX.
+     :param executable_nodes: A list of lists where each inner list contains nodes that can be executed
+         in parallel at each step. This parameter is used for recursion.
+     :return: A list of lists where each inner list contains nodes that can be executed in parallel at each step.
+
+     >>> g = nx.DiGraph()
+     >>> g.add_edges_from([(1, 2), (1, 3), (2, 4), (3, 4)])
+     >>> topological_sort_with_parallelism(g)
+     [[1], [2, 3], [4]]
+
+     >>> g = nx.DiGraph()
+     >>> g.add_edges_from([(1, 2), (2, 3), (3, 4)])
+     >>> topological_sort_with_parallelism(g)
+     [[1], [2], [3], [4]]
+
+     >>> g = nx.DiGraph()
+     >>> g.add_edges_from([(1, 2), (2, 3), (3, 1)])
+     >>> topological_sort_with_parallelism(g)
+     Traceback (most recent call last):
+         ...
+     NetworkXUnfeasible: Graph contains a cycle, cannot compute a topological sort.
+     """
+     nodes = list(nx.topological_sort(graph))
+     round_executable_nodes = [n for n in nodes if graph.in_degree(n) == 0]
+     graph.remove_nodes_from(round_executable_nodes)
+
+     if len(round_executable_nodes) == 0:
+         return executable_nodes
+     else:
+         executable_nodes = [] if executable_nodes is None else executable_nodes
+         executable_nodes.append(round_executable_nodes)
+         return topological_sort_with_parallelism(graph, executable_nodes)
+
+
+ def dag_generator(pipeline_id: str, *args, **kwargs):
+     pipeline_config = retrieve_latest_pipeline_config(pipeline_id=pipeline_id)["config"]
+     graph = get_execution_graph(pipeline_config)
+     optimal_execution_graph = topological_sort_with_parallelism(graph.copy())
+     broker = get_broker()
+
+     registered_actors: typing.Dict[str, LazyActor] = {}
+     broker.actors.clear()
+
+     messages: typing.List[typing.List[Message]] = []
+
+     task_definitions = pipeline_config["taskDefinitions"]
+     for task_group in optimal_execution_graph:
+         message_group = []
+         for task in task_group:
+             module_path = task_definitions[task]["handler"]
+             tmp_handler = get_callable(module_path)
+             lazy_actor = register_lazy_actor(
+                 broker, tmp_handler, pipeline_config["metadata"]
+             )
+             registered_actors[task] = lazy_actor
+             if args and not kwargs:
+                 message_group.append(registered_actors[task].message(*args))
+             elif kwargs and not args:
+                 message_group.append(registered_actors[task].message(**kwargs))
+             elif args and kwargs:
+                 message_group.append(registered_actors[task].message(*args, **kwargs))
+             else:
+                 message_group.append(registered_actors[task].message())
+         messages.append(message_group)
+     p = parallel_pipeline(messages)
+
+     return p
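Taken together, `get_execution_graph` builds the DAG from a pipeline config and `topological_sort_with_parallelism` reduces it to rounds of tasks that can run in parallel. A minimal sketch of how they compose (the `config` dict below is a hypothetical example shaped like the configs these functions expect, not taken from the package):

```python
from pypeline.utils.pipeline_utils import (
    get_execution_graph,
    topological_sort_with_parallelism,
)

# Hypothetical pipeline config: "a" fans out to "b" and "c", which both feed "d".
config = {
    "dagAdjacency": {"a": ["b", "c"], "b": ["d"], "c": ["d"], "d": []},
    "taskDefinitions": {
        "a": {"handler": "demo.pipeline.a"},
        "b": {"handler": "demo.pipeline.b"},
        "c": {"handler": "demo.pipeline.c"},
        "d": {"handler": "demo.pipeline.d"},
    },
}

graph = get_execution_graph(config)
# Pass a copy: the sort removes nodes from the graph it is given.
rounds = topological_sort_with_parallelism(graph.copy())
print(rounds)  # expected: [['a'], ['b', 'c'], ['d']]
```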
scalable_pypeline-2.0.1.dist-info/METADATA ADDED
@@ -0,0 +1,217 @@
+ Metadata-Version: 2.1
+ Name: scalable-pypeline
+ Version: 2.0.1
+ Summary: PypeLine - Python pipelines for the Real World
+ Home-page: https://gitlab.com/bravos2/pypeline
+ Author: Bravos Power Corporation
+ License: Apache License 2.0
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: PyYAML (<7,>=6.0.1)
+ Requires-Dist: click (==8.0.4)
+ Requires-Dist: marshmallow (<4,>=3.2.1)
+ Requires-Dist: redis (<5,>=4.5.4)
+ Requires-Dist: db-medley[redis] (<2,>=1.0.2)
+ Requires-Dist: croniter (<2,>=1.0.15)
+ Provides-Extra: build
+ Requires-Dist: wheel ; extra == 'build'
+ Requires-Dist: twine ; extra == 'build'
+ Provides-Extra: dev
+ Requires-Dist: blackd ; extra == 'dev'
+ Provides-Extra: flask
+ Requires-Dist: Werkzeug (==2.0.3) ; extra == 'flask'
+ Requires-Dist: itsdangerous (==2.0.1) ; extra == 'flask'
+ Requires-Dist: Flask (<2,>=1.1.2) ; extra == 'flask'
+ Requires-Dist: flask-smorest (<0.29,>=0.23.0) ; extra == 'flask'
+ Requires-Dist: Jinja2 (==3.0.3) ; extra == 'flask'
+ Provides-Extra: test
+ Requires-Dist: pytest-cov (<3,>=2.6.1) ; extra == 'test'
+ Requires-Dist: tox (<4,>=3.14.1) ; extra == 'test'
+ Requires-Dist: mock (<2,>=1) ; extra == 'test'
+ Requires-Dist: moto (<4,>=1.3.16) ; extra == 'test'
+ Requires-Dist: responses (<0.11,>=0.10.16) ; extra == 'test'
+ Requires-Dist: fakeredis (<3,>=2.10.3) ; extra == 'test'
+ Requires-Dist: importlib-metadata (<5,>=4.12) ; extra == 'test'
+ Provides-Extra: web
+ Requires-Dist: gunicorn ; extra == 'web'
+ Requires-Dist: gevent (<22,>=21.12.0) ; extra == 'web'
+ Provides-Extra: workers
+ Requires-Dist: networkx (>=2.4) ; extra == 'workers'
+ Requires-Dist: dramatiq[rabbitmq] (<2,>=1.17.0) ; extra == 'workers'
+ Requires-Dist: apscheduler (<4,>=3.10.4) ; extra == 'workers'
+
+ ```
+ ______ __ ________ _____ _ _____ _ _ _____
+ | ___ \\ \ / /| ___ \| ___|| | |_ _|| \ | || ___|
+ | |_/ / \ V / | |_/ /| |__ | | | | | \| || |__
+ | __/ \ / | __/ | __| | | | | | . ` || __|
+ | | | | | | | |___ | |_____| |_ | |\ || |___
+ \_| \_/ \_| \____/ \_____/\___/ \_| \_/\____/
+ ```
+
+ ## Overview
+
+ PypeLine is a versatile open-source library designed to streamline the management of data workflows and APIs. With PypeLine, you can efficiently schedule cron jobs, execute complex Directed Acyclic Graph (DAG) pipelines, and set up a Flask API complete with OpenAPI documentation.
+
+ #### Key Features
+ - Cron Job Scheduling: Easily schedule recurring tasks with flexible cron job functionality, ensuring that your processes run reliably at specified intervals.
+ - DAG Pipelines: Define and execute DAGs to manage complex data workflows with dependencies. PypeLine handles the execution order and parallelism, ensuring that each task runs in the correct sequence.
+ - Flask API with OpenAPI: Quickly configure a RESTful API using Flask, with built-in support for OpenAPI documentation, allowing for clear, standardized documentation of your endpoints.
+
+ ## Requirements
+
+ - RabbitMQ
+ - Redis
+ - Docker (optional for dev)
+
+ ## Getting Started
+
+ Install PypeLine:
+
+ ```commandline
+ pip install "scalable-pypeline[flask,web,workers]>=1.2.3"
+ ```
+
+ Configure your Flask project (app.py)
+
+ ```python
+ from flask import Flask
+ from pypeline.flask import FlaskPypeline
+ from pypeline_demo.api import bp
+ from pypeline_demo.config import Config
+ from pypeline_demo.extensions import dramatiq
+
+
+
+ def create_app():
+     app = Flask(__name__)
+
+     dramatiq.init_app(app)
+
+     # Initialize your app with a configuration
+     app.config.from_object(Config)
+
+     pypeline = FlaskPypeline()
+     pypeline.init_app(app, init_api=True)
+
+     # Register any API blueprints you wish to expose via the core API
+     app.extensions["pypeline_core_api"].register_blueprint(bp)
+     # Register application blueprints with the application
+     app.register_blueprint(bp)
+
+     return app
+
+
+ if __name__ == "__main__":
+     app = create_app()
+     app.run(port=5001)
+ ```
+
+ Configure the Dramatiq extension (extensions.py)
+
+ ```python
+ from pypeline.dramatiq import Dramatiq
+
+
+ dramatiq = Dramatiq()
+ ```
+
+ Set up your YAML configuration for PypeLine (pypeline.yaml)
+
+ ```yaml
+ serviceConfig:
+   - name: pipeline-worker
+     registeredTasks:
+       - handler: pypeline_demo.pipeline.a
+       - handler: pypeline_demo.pipeline.b
+       - handler: pypeline_demo.pipeline.c
+       - handler: pypeline_demo.scheduled_tasks.cron_task
+
+ pipelines:
+   demo_pipeline:
+     name: Demo Pipeline
+     description: Pipeline to show examples of DAG Adjacency
+     schemaVersion: 1
+     config:
+       dagAdjacency:
+         a:
+           - b
+           - c
+       metadata:
+         maxRetry: 1
+         retryBackoff: 180
+         retryBackoffMax: 300
+         retryJitter: true
+         maxTtl: 10800
+         queue: new-queue
+       taskDefinitions:
+         a:
+           handler: pypeline_demo.pipeline.a
+         b:
+           handler: pypeline_demo.pipeline.b
+         c:
+           handler: pypeline_demo.pipeline.c
+ scheduledTasks:
+   cron-task:
+     name: Example cron task
+     enabled: true
+     config:
+       task: pypeline_demo.scheduled_tasks.cron_task
+       queue: new-queue
+       schedule:
+         minute: '*'
+         hour: '*'
+         dayOfWeek: '*'
+         dayOfMonth: '*'
+         monthOfYear: '*'
+     schemaVersion: 1
+ ```
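To make the `dagAdjacency` above concrete: `a` lists `b` and `c` as downstream nodes, so `a` runs first and `b` and `c` can then run in parallel. A quick sketch using this release's graph utilities (the dict simply mirrors the `config:` block above):

```python
from pypeline.utils.pipeline_utils import (
    get_execution_graph,
    topological_sort_with_parallelism,
)

# Mirrors demo_pipeline's config block from pypeline.yaml above.
config = {
    "dagAdjacency": {"a": ["b", "c"]},
    "taskDefinitions": {
        "a": {"handler": "pypeline_demo.pipeline.a"},
        "b": {"handler": "pypeline_demo.pipeline.b"},
        "c": {"handler": "pypeline_demo.pipeline.c"},
    },
}

graph = get_execution_graph(config)
print(topological_sort_with_parallelism(graph.copy()))
# expected: [['a'], ['b', 'c']] -- one round for a, then b and c in parallel
```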
+
+ Set up the modules referenced by the YAML configuration (pipeline.py and scheduled_tasks.py)
+
+ ```python
+ import time
+
+
+ def a(event):
+     print("A")
+
+
+ def b(event):
+     print("B")
+     time.sleep(10)
+
+
+ def c(event):
+     print("C")
+ ```
+
+ ```python
+ def cron_task():
+     print("HI")
+ ```
+
+ Configure your environment variables (demo.env)
+
+ ```env
+ SERMOS_BASE_URL=local
+ PYPELINE_CLIENT_PKG_NAME=pypeline_demo
+ REDIS_URL=redis://:password@localhost:6379/0
+ RABBITMQ_URL=amqp://admin:password@localhost:5672
+ ```
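The `Config` object imported in `app.py` above is not part of this package; a minimal, hypothetical sketch that simply surfaces these variables might look like the following (the key names are an assumption for illustration, not a documented PypeLine API):

```python
import os


class Config:
    """Hypothetical demo config; key names are assumptions, not PypeLine's documented API."""

    SERMOS_BASE_URL = os.environ.get("SERMOS_BASE_URL", "local")
    PYPELINE_CLIENT_PKG_NAME = os.environ.get("PYPELINE_CLIENT_PKG_NAME", "pypeline_demo")
    REDIS_URL = os.environ.get("REDIS_URL", "redis://:password@localhost:6379/0")
    RABBITMQ_URL = os.environ.get("RABBITMQ_URL", "amqp://admin:password@localhost:5672")
```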
+
+ Start RabbitMQ & Redis as your message broker and result backend. We use `docker compose` for this.
+
+ ## DEMO PROJECT COMING SOON!
+
+
+ ## Testing
+
+ If you are developing pypeline and want to test this package,
+ install the test dependencies:
+
+     $ pip install -e .[test]
+
+ Now, run the tests:
+
+     $ tox
scalable_pypeline-2.0.1.dist-info/RECORD ADDED
@@ -0,0 +1,27 @@
+ pypeline/__init__.py,sha256=wAxkK8w13vqoF47A8iqWdSlIgRRXmZiQ0R4wePZfzhs,22
+ pypeline/barrier.py,sha256=dLDaprH5NB-C7MQjZqPpBBhMjmO0VV_kTonlgweznHc,1096
+ pypeline/composition.py,sha256=s0p_KdD-UheyFe2yPG8GJ94txq9C8fPDU58UcLfal6Q,12955
+ pypeline/constants.py,sha256=vi4UZz1xd0ZeIuelp4QgCQsMlIHW65-lVB8l_iA8kBE,2578
+ pypeline/dramatiq.py,sha256=jr9WORqtusC1gnbvF59CNXny8ORk2Lmlbmf1qsbiLXo,14799
+ pypeline/extensions.py,sha256=BzOTnXhNxap3N7uIUUh_hO6dDwx08Vc_RJDE93_K0Lo,610
+ pypeline/middleware.py,sha256=6vWNCoRVqnASJ40CAOEpe0JcYRpB6BTkLz8E51q4z2Y,2756
+ pypeline/pipeline_config_schema.py,sha256=DQ_RMucnA0AyrndlW6lkb0orGromcO6C9GgLHyG6lJ0,8013
+ pypeline/pypeline_yaml.py,sha256=Og08sUKwOjq7JYPnkg-NIcGbHravYCkC5Arz22rZEtA,16981
+ pypeline/schedule_config_schema.py,sha256=vtZV-5wpGcAiYcXxdBPRkrjsbR6x_9E-1PC2elrKKbE,3611
+ pypeline/flask/__init__.py,sha256=AdljRh0lMiS8ExgDmgzObwVs8jW7hqQuf83Ml8kn8GQ,491
+ pypeline/flask/decorators.py,sha256=ki6jkjZwbDbCWuj7ET7N-ncZwrASp4Fy7257WIYiAAQ,1102
+ pypeline/flask/flask_pypeline.py,sha256=Uqyu3PnSP3DoVZUJPqV9chjT4xdRgvcL3OMXxkbdTEg,5490
+ pypeline/flask/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ pypeline/flask/api/pipelines.py,sha256=sPvEoNwmnJPSA96lZHYS2fwKqZlVyE2OSjUmOPFi91o,7267
+ pypeline/flask/api/schedules.py,sha256=31lwoFlGv-S-2ahGUCnD5YbmKws8yddj6_PEzzdBi9s,1321
+ pypeline/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ pypeline/utils/config_utils.py,sha256=rAIATyoW7kGETZ_Z2DqiXtGd7bJp5uPfcLtfNPOYsNs,2167
+ pypeline/utils/module_utils.py,sha256=boEP9IYr4p_ick7HlVUfIxOYHQlEmo7dgvDBCQc-C28,2914
+ pypeline/utils/pipeline_utils.py,sha256=dewzkMajs7uyPHyHjJfISA9pc2-1J5A99Hm4XqNw5qM,6031
+ tests/fixtures/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scalable_pypeline-2.0.1.dist-info/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
+ scalable_pypeline-2.0.1.dist-info/METADATA,sha256=4ozytitQKStqd6jqYr1ALmorfspjit-OTgi1ye_jvpk,6239
+ scalable_pypeline-2.0.1.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
+ scalable_pypeline-2.0.1.dist-info/entry_points.txt,sha256=uWs10ODfHSBKo2Cx_QaUjPHQTpZ3e77j9VlAdRRmMyg,119
+ scalable_pypeline-2.0.1.dist-info/top_level.txt,sha256=C7dpkEOc_-nnsAQb28BfQknjD6XHRyS9ZrvVeoIbV7s,15
+ scalable_pypeline-2.0.1.dist-info/RECORD,,
scalable_pypeline-2.0.1.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
+ [flask.commands]
+ cron-scheduler = pypeline.dramatiq:cron_scheduler
+ pypeline-worker = pypeline.dramatiq:pypeline_worker
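These `[flask.commands]` entry points are exposed as Flask CLI subcommands, so with `FLASK_APP` pointing at your application they can be invoked roughly as follows (a sketch; exact invocation depends on your setup):

```commandline
export FLASK_APP=app:create_app
flask pypeline-worker     # start a worker for registeredTasks / pipelines
flask cron-scheduler      # start the scheduler for scheduledTasks
```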
tests/fixtures/__init__.py CHANGED
@@ -1 +0,0 @@
- from tests.fixtures.s3_fixtures import *
pypeline/celery.py DELETED
@@ -1,206 +0,0 @@
- """ Configure and instantiate Celery
- """
- import os
-
- if os.environ.get('USE_GEVENT', "False").lower() == 'true':
-     from gevent import monkey
-     monkey.patch_all()
-
- import sys
- import logging
- from pypeline.pipeline.chained_task import ChainedTask
- from celery_dyrygent.tasks import register_workflow_processor
- from typing import List
- from celery import Celery
- from pypeline.logging_config import setup_logging
- from pypeline.utils.module_utils import SermosModuleLoader
- from pypeline.utils.task_utils import PipelineResult, \
-     get_service_config_for_worker
- from pypeline.extensions import sermos_config, sermos_client_version
- from pypeline import __version__
-
- logger = logging.getLogger('celery')
- ENABLE_TOOLS = str(os.environ.get('ENABLE_TOOLS', 'false')).lower() == 'true'
- CELERY_TASKS_ACK_LATE = str(os.environ.get('CELERY_TASKS_ACK_LATE', 'false')).lower() == 'true'
- LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
- OVERLOAD_ES = os.environ.get('ENV', 'production').lower() == 'production'
- PIPELINE_CHORD_COMPRESSION = os.environ.get('PIPELINE_CHORD_COMPRESSION', None)
-
- setup_logging(app_version=__version__,
-               client_version=sermos_client_version,
-               default_level=LOG_LEVEL,
-               overload_elasticsearch=OVERLOAD_ES,
-               establish_logging_config=True)
-
- def task_chain_regulator(*args, **kwargs):
-     """ Utility task to ensure celery properly waits between groups in a chain.
-
-     For a chain(), if each element is a group() then celery does not
-     properly adhere to the chain elements occurring sequentially. If you
-     insert a task that is not a group() in between, though, then the
-     chain operates as expected.
-     """
-     return True
-
-
- def pipeline_success(event: dict):
-     """ Utility task to mark a pipeline execution as successful.
-
-     Loads the PipelineResult for the event's `execution_id` and saves it
-     with a 'success' status. Intended to run as the final step of a
-     pipeline so downstream consumers can see that the pipeline
-     completed.
-     """
-     pr = PipelineResult(event['execution_id'])
-     pr.load()
-     pr.save(status='success')
-
-
- class GenerateCeleryTasks(SermosModuleLoader):
-     """ Use the sermos.yaml configuration to turn customer methods into
-     decorated celery tasks that are available for work/pipelines
-     """
-     def __init__(self, config: dict, celery_instance: Celery):
-         super(GenerateCeleryTasks, self).__init__()
-         self.config = config if config else {}
-         self.celery = celery_instance
-
-     def _get_default_tasks(self) -> List[dict]:
-         """ Sermos provides default tasks that all workers should know about.
-         """
-         return [{
-             'handler': 'pypeline.celery.task_chain_regulator'
-         }, {
-             'handler': 'pypeline.celery.pipeline_success'
-         }]
-
-     def generate(self):
-         """ Loads methods based on sermos config file and decorates them as
-         celery tasks.
-
-         Customer's methods:
-         --------------------------------
-         def demo_task(*args, **kwargs):
-             return True
-
-         Turns into the equivalent of:
-         --------------------------------
-         @celery.task(queue='queue-name')
-         def demo_task(*args, **kwargs):
-             return True
-         """
-         # Set in k8s deployment as an environment variable when Sermos Cloud
-         # generates the final secrets.yaml file. The name comes from the user's
-         # sermos.yaml file based on serviceConfig[].name. Each 'worker' will
-         # have a single name and each individually registers tasks through its
-         # registeredTasks list. This allows each worker to only attempt
-         # bootstrapping those tasks that are relevant to the worker and not, for
-         # example, attempt to import a package that's not used by this worker.
-         service = get_service_config_for_worker(self.config)
-         if not service:
-             return
-         for task in service.get('registeredTasks', []):
-             pipeline_meta = None
-             for pipeline_key, pipeline in sermos_config['pipelines'].items():
-                 pipeline_config = pipeline["config"]
-                 pipeline_tasks = [t["handler"] for t in pipeline_config["taskDefinitions"].values()]
-                 if task["handler"] in pipeline_tasks:
-                     pipeline_meta = pipeline_config["metadata"]
-                     break
-
-             try:
-                 worker_path = task['handler']  # Required, no default
-
-                 tmp_handler = self.get_callable(worker_path)
-
-                 # Decorate the method as a celery task along with a default
-                 # queue if provided in config. Set ChainedTask as the base
-                 # which allows chained tasks to pass kwargs correctly.
-                 if pipeline_meta and pipeline_meta["maxRetry"] > 0:
-                     tmp_handler = self.celery.task(
-                         tmp_handler,
-                         autoretry_for=(Exception,),
-                         max_retries=pipeline_meta["maxRetry"],
-                         retry_backoff=pipeline_meta["retryBackoff"],
-                         retry_jitter=pipeline_meta["retryJitter"],
-                         retry_backoff_max=pipeline_meta["retryBackoffMax"]
-                     )
-                 else:
-                     tmp_handler = self.celery.task(tmp_handler)
-             except Exception as e:
-                 logger.warning(f"Unable to add a task to celery: {e}")
-         # Sermos provides default tasks that all workers should know about, add
-         # them here.
-         for task in self._get_default_tasks():
-             tmp_handler = self.get_callable(task['handler'])
-             tmp_handler = self.celery.task(tmp_handler)
-
-
- def configure_celery(celery: Celery):
-     """ Configure a Sermos-compatible Celery instance. Primarily this means
-     compatibility with Pipelines and Scheduled Tasks through injecting the
-     event kwarg. Also sets prebaked defaults that can be overridden by the user.
-     """
-     REDIS_URL = os.environ.get('REDIS_URL', 'redis://localhost:6379/0')
-     CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_URL', REDIS_URL)
-     CELERY_RESULT_BACKEND = os.environ.get('CELERY_RESULT_BACKEND', REDIS_URL)
-
-     celery.Task = ChainedTask
-
-     # Configure the broker and tasks
-     celery.conf.broker_url = CELERY_BROKER_URL
-
-     # Use our custom database scheduler for dynamic celery beat updates.
-     celery.conf.beat_scheduler =\
-         'pypeline.celery_beat:SermosScheduler'
-
-     # Reasonable defaults, override as necessary
-     celery.conf.worker_redirect_stdouts = True
-     celery.conf.worker_redirect_stdouts_level = LOG_LEVEL
-     celery.conf.worker_hijack_root_logger = False
-
-     if PIPELINE_CHORD_COMPRESSION:
-         celery.conf.task_compression = PIPELINE_CHORD_COMPRESSION
-
-     # NOTE: The broker URL may not be the best result backend. For example,
-     # when using Rabbit as the broker (recommended), you should use Redis
-     # as the result backend, as Rabbit has poor support as a result backend.
-     celery.conf.result_backend = CELERY_RESULT_BACKEND
-     celery.conf.task_ignore_result = False  # Must not ignore for Chords
-     celery.conf.result_expires = int(
-         os.environ.get('CELERY_RESULT_EXPIRES', 10800))  # 3 hours by default
-     celery.conf.broker_pool_limit = int(os.environ.get('BROKER_POOL_LIMIT',
-                                                        10))
-     celery.conf.worker_max_tasks_per_child = int(
-         os.environ.get('MAX_TASKS_PER_CHILD', 100))
-     celery.conf.task_soft_time_limit =\
-         int(os.environ.get('TASK_TIMEOUT_SECONDS', 3600))
-     celery.conf.task_time_limit =\
-         int(os.environ.get('TASK_TIMEOUT_SECONDS', 3600)) + 10  # Cleanup buffer
-     celery.conf.task_acks_late = CELERY_TASKS_ACK_LATE
-     celery.conf.task_serializer = 'json'
-     celery.conf.result_serializer = 'json'
-     celery.conf.accept_content = ['json']
-     # Required config options for some brokers we use frequently.
-     transport_options = {}
-     celery.conf.broker_transport_options = transport_options
-
-     # Sermos generally has long-running tasks (relatively speaking), so
-     # limit number of jobs a worker can reserve. This may not be true for
-     # all tasks, so configure this on a per application basis. In the event
-     # multiple task kinds exist in an application (short and long), see
-     # http://docs.celeryproject.org/en/latest/userguide/optimizing.html#optimizing-prefetch-limit
-     # for some guidance on combining multiple workers and routing tasks.
-     # TODO make configurable from env
-     celery.conf.worker_prefetch_multiplier = 1
-
-     # Add our application's workers & any other tasks to be made
-     # available
-     register_workflow_processor(celery)
-     try:
-         GenerateCeleryTasks(sermos_config, celery).generate()
-     except Exception as e:
-         logger.error(f"Unable to dynamically generate celery tasks: {e}")
-         sys.exit(1)
-
-     return celery
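For reference, the removed module was typically wired in by passing an application's Celery instance through `configure_celery`; a rough sketch of that legacy (1.x) usage, based on the function signature above rather than a documented recipe:

```python
from celery import Celery

from pypeline.celery import configure_celery  # module removed in 2.0.x

# Build a Celery app and let pypeline apply its defaults and register tasks.
celery_app = configure_celery(Celery("pypeline_demo"))
```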