scalable-pypeline 1.1.5__tar.gz → 1.2.1__tar.gz
- {scalable-pypeline-1.1.5/scalable_pypeline.egg-info → scalable-pypeline-1.2.1}/PKG-INFO +1 -1
- scalable-pypeline-1.2.1/pypeline/__init__.py +1 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/celery.py +22 -86
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/constants.py +3 -3
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/api/pipelines.py +16 -3
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/api/utils.py +2 -3
- scalable-pypeline-1.2.1/pypeline/pipeline/chained_task.py +70 -0
- scalable-pypeline-1.2.1/pypeline/pipeline/generator.py +254 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/pipeline_config_schema.py +46 -12
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/sermos_yaml.py +8 -303
- scalable-pypeline-1.2.1/pypeline/utils/__init__.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/utils/task_utils.py +22 -273
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1/scalable_pypeline.egg-info}/PKG-INFO +1 -1
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/scalable_pypeline.egg-info/SOURCES.txt +3 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/scalable_pypeline.egg-info/requires.txt +6 -4
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/setup.py +8 -4
- scalable-pypeline-1.1.5/pypeline/__init__.py +0 -1
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/LICENSE +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/MANIFEST.in +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/README.md +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/celery_beat.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/extensions.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/__init__.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/api/__init__.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/api/schedules.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/decorators.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/flask_sermos.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/generators.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/logging_config.py +0 -0
- {scalable-pypeline-1.1.5/pypeline/utils → scalable-pypeline-1.2.1/pypeline/pipeline}/__init__.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/schedule_config_schema.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/utils/config_utils.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/utils/graph_utils.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/utils/module_utils.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/requirements.txt +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/scalable_pypeline.egg-info/dependency_links.txt +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/scalable_pypeline.egg-info/entry_points.txt +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/scalable_pypeline.egg-info/top_level.txt +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/setup.cfg +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/tests/fixtures/__init__.py +0 -0
- {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/tests/fixtures/s3_fixtures.py +0 -0
pypeline/__init__.py (new file)
@@ -0,0 +1 @@
+__version__ = '1.2.1'
pypeline/celery.py
@@ -2,24 +2,19 @@
 """
 import os
 
-from pypeline.constants import DEFAULT_RETRY_TASK_MAX_TTL, DEFAULT_MAX_RETRY
-
 if os.environ.get('USE_GEVENT', "False").lower() == 'true':
     from gevent import monkey
     monkey.patch_all()
 
-import random
-import time
-
-from celery_dyrygent.tasks import register_workflow_processor
-
 import sys
 import logging
+from pypeline.pipeline.chained_task import ChainedTask
+from celery_dyrygent.tasks import register_workflow_processor
 from typing import List
 from celery import Celery
 from pypeline.logging_config import setup_logging
 from pypeline.utils.module_utils import SermosModuleLoader
-from pypeline.utils.task_utils import
+from pypeline.utils.task_utils import PipelineResult, \
     get_service_config_for_worker
 from pypeline.extensions import sermos_config, sermos_client_version
 from pypeline import __version__
@@ -37,46 +32,6 @@ setup_logging(app_version=__version__,
               overload_elasticsearch=OVERLOAD_ES,
               establish_logging_config=True)
 
-
-def pipeline_retry(event: dict):
-    """ Handle pipeline retry and deadletter logic.
-    """
-    access_key = event.get('access_key', None)
-    pipeline_id = event.get('pipeline_id', None)
-    execution_id = event.get('execution_id', None)
-    if pipeline_id is None or execution_id is None:
-        logger.error(f"Unable to retry pipeline {pipeline_id} / "
-                     f"execution {execution_id}.")
-        return False
-
-    # generate_chain() will return `None` if the pipeline has exceeded
-    # max retry count or other erorrs happen.
-    gen = PipelineGenerator(pipeline_id=pipeline_id,
-                            access_key=access_key,
-                            execution_id=execution_id,
-                            queue=event.get('queue', None),
-                            default_task_ttl=event.get('default_task_ttl',
-                                                       None),
-                            add_retry=event.get('add_retry', False),
-                            chain_payload=event.get('chain_payload', None))
-
-    if gen.good_to_go:
-        chain = gen.generate_chain()
-        if chain is not None:
-            # Exponential backoff
-            exponential_backoff = min((3 ** gen.pipeline_wrapper.retry_count) +
-                                      (random.randint(0, 1000) / 1000),
-                                      DEFAULT_RETRY_TASK_MAX_TTL)
-            logger.debug(f"Exponential backoff sleep {exponential_backoff}")
-            time.sleep(exponential_backoff)
-            # Kick it off again.
-            chain.apply_async()
-
-    logger.warning(f"Pipeline retry was invoked for {pipeline_id} "
-                   f"({execution_id})")
-    return True
-
-
 def task_chain_regulator(*args, **kwargs):
     """ Utility task to ensure celery properly waits between groups in a chain.
 
@@ -114,8 +69,6 @@ class GenerateCeleryTasks(SermosModuleLoader):
         """ Sermos provides default tasks that all workers should know about.
         """
         return [{
-            'handler': 'pypeline.celery.pipeline_retry'
-        }, {
             'handler': 'pypeline.celery.task_chain_regulator'
         }, {
             'handler': 'pypeline.celery.pipeline_success'
@@ -147,6 +100,14 @@ class GenerateCeleryTasks(SermosModuleLoader):
         if not service:
             return
         for task in service.get('registeredTasks', []):
+            pipeline_meta = None
+            for pipeline_key, pipeline in sermos_config['pipelines'].items():
+                pipeline_config = pipeline["config"]
+                pipeline_tasks = [t["handler"] for t in pipeline_config["taskDefinitions"].values()]
+                if task["handler"] in pipeline_tasks:
+                    pipeline_meta = pipeline_config["metadata"]
+                    break
+
             try:
                 worker_path = task['handler'] # Required, no default
 
@@ -155,7 +116,17 @@ class GenerateCeleryTasks(SermosModuleLoader):
                 # Decorate the method as a celery task along with a default
                 # queue if provided in config. Set ChainedTask as the base
                 # which allows chained tasks to pass kwargs correctly.
-
+                if pipeline_meta and pipeline_meta["maxRetry"] > 0:
+                    tmp_handler = self.celery.task(
+                        tmp_handler,
+                        autoretry_for=(Exception,),
+                        max_retries=pipeline_meta["maxRetry"],
+                        retry_backoff=pipeline_meta["retryBackoff"],
+                        retry_jitter=pipeline_meta["retryJitter"],
+                        retry_backoff_max=pipeline_meta["retryBackoffMax"]
+                    )
+                else:
+                    tmp_handler = self.celery.task(tmp_handler)
             except Exception as e:
                 logger.warning(f"Unable to add a task to celery: {e}")
         # Sermos provides default tasks that all workers should know about, add
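Note (not part of the diff): with this change, per-task retry behavior comes from each pipeline's own metadata instead of the removed pipeline_retry task. A minimal sketch of a sermos_config entry that would trigger the autoretry decoration above; the pipeline key, handler path, and values are hypothetical, while the key names (pipelines, config, metadata, taskDefinitions, maxRetry, retryBackoff, retryJitter, retryBackoffMax) come from the code in this hunk:

sermos_config = {
    'pipelines': {
        'demo-pipeline': {                      # hypothetical pipeline key
            'config': {
                'metadata': {
                    'queue': 'default',
                    'maxRetry': 3,              # > 0 switches on autoretry_for=(Exception,)
                    'retryBackoff': 3,
                    'retryJitter': True,
                    'retryBackoffMax': 600
                },
                'taskDefinitions': {
                    'step_one': {'handler': 'demo.tasks.step_one'}  # hypothetical handler path
                }
            }
        }
    }
}

Any registered task whose handler appears in a pipeline's taskDefinitions is decorated with that pipeline's retry settings; every other task falls back to a plain self.celery.task(tmp_handler) registration.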
@@ -173,40 +144,6 @@ def configure_celery(celery: Celery):
     REDIS_URL = os.environ.get('REDIS_URL', 'redis://localhost:6379/0')
     CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_URL', REDIS_URL)
     CELERY_RESULT_BACKEND = os.environ.get('CELERY_RESULT_BACKEND', REDIS_URL)
-    TaskBase = celery.Task
-
-    class ChainedTask(TaskBase):
-        """ A Celery Task that is used as the _base_ for all dynamically
-        generated tasks (by GenerateCeleryTasks().generate()). This injects
-        `event` into every task's signature, which allows pipelines to pass
-        event information easily through a chain.
-        """
-        abstract = True
-        autoretry_for = (Exception,)
-        max_retries = DEFAULT_MAX_RETRY
-        retry_backoff = True
-        retry_jitter = True
-
-        def __call__(self, *args, **kwargs):
-            """ Allow the return value of one task to update the kwargs of a
-            subsequent task if it's a dictionary. Important to the function
-            of a pipeline to allow event information to flow easily.
-            """
-            # Inject app context
-            if len(args) == 1 and isinstance(args[0], dict):
-                kwargs.update(args[0])
-                args = ()
-
-            # Event holds information used in PipelineRunWrapper and
-            # other areas.
-            if 'event' not in kwargs.keys():
-                kwargs['event'] = {}
-            # This is a special worker from dyrygent that orchestrates our
-            # pipelines. It provides a patch in fix for celery's poor
-            # implementation of Canvas work-flows
-            if self.__name__ == 'workflow_processor':
-                kwargs.pop('event', None)
-            return super(ChainedTask, self).__call__(*args, **kwargs)
 
     celery.Task = ChainedTask
 
@@ -230,7 +167,6 @@ def configure_celery(celery: Celery):
     # as the result backend, as Rabbit has horrible support as backend.
     celery.conf.result_backend = CELERY_RESULT_BACKEND
     celery.conf.task_ignore_result = False # Must not ignore for Chords
-    celery.conf.task_acks_late = False # Check per worker
     celery.conf.result_expires = int(
         os.environ.get('CELERY_RESULT_EXPIRES', 10800)) # 3 hours by default
     celery.conf.broker_pool_limit = int(os.environ.get('BROKER_POOL_LIMIT',
pypeline/constants.py
@@ -7,11 +7,9 @@ API_PATH_V1 = '/api/v1'
 
 DEFAULT_RESULT_TTL = 86400 # seconds (1 day)
 DEFAULT_TASK_TTL = 60 # seconds (1 minute)
-
-DEFAULT_MAX_RETRY = 10
+DEFAULT_MAX_RETRY = 3
 DEFAULT_REGULATOR_TASK = 'pypeline.celery.task_chain_regulator'
 DEFAULT_SUCCESS_TASK = 'pypeline.celery.pipeline_success'
-DEFAULT_RETRY_TASK = 'pypeline.celery.pipeline_retry'
 
 CHAIN_SUCCESS_MSG = 'Chain built successfully ...'
 CHAIN_FAILURE_MSG = 'Chain failed to build ...'
@@ -52,6 +50,8 @@ DEPLOYMENTS_SERVICE_URL = "{}deployments/{}/services/{}"
 DEFAULT_AUTH_URL = urljoin(DEFAULT_BASE_URL, 'auth')
 USING_SERMOS_CLOUD = DEFAULT_BASE_URL != LOCAL_DEPLOYMENT_VALUE
 DEFAULT_CONFIG_RETRIEVAL_PAGE_SIZE = 25
+WORKFLOW_PROCESSOR_QUEUE = os.environ.get('WORKFLOW_PROCESSOR_QUEUE', 'default')
+
 # Default 'responses' dictionary when decorating endpoints with @api.doc()
 # Extend as necessary.
 API_DOC_RESPONSES = {
pypeline/flask/api/pipelines.py
@@ -9,7 +9,8 @@ from flask_smorest import Blueprint
 from flask.views import MethodView
 from marshmallow import Schema, fields
 from marshmallow.exceptions import ValidationError
-from pypeline.constants import API_DOC_RESPONSES, API_DOC_PARAMS, API_PATH_V1
+from pypeline.constants import API_DOC_RESPONSES, API_DOC_PARAMS, API_PATH_V1,\
+    WORKFLOW_PROCESSOR_QUEUE
 from pypeline.flask.decorators import require_accesskey
 from pypeline.flask.api.utils import chain_helper
 from pypeline.utils.task_utils import PipelineResult
@@ -140,7 +141,6 @@ class PipelineInvoke(MethodView):
     def post(self, payload: dict, pipeline_id: str):
         """ Invoke a pipeline by it's ID; optionally provide pipeline arguments.
         """
-
         access_key = request.headers.get('accesskey')
         pipeline_config = retrieve_latest_pipeline_config(
             pipeline_id=pipeline_id, access_key=access_key)
@@ -164,9 +164,22 @@
             abort(400, message=gen.loading_message)
 
         chain: _chain = gen.chain
-        wf: Workflow = Workflow()
+        wf: Workflow = Workflow({"queue": WORKFLOW_PROCESSOR_QUEUE})
         wf.add_celery_canvas(chain)
         wf.apply_async()
+
+        celery_task_status = []
+        for node in wf.nodes:
+            celery_task = dict(
+                name=wf.nodes[node].signature.name,
+                task_id=node,
+                status="RUNNING",
+                retries=0
+            )
+            celery_task_status.append(celery_task)
+
+        gen.pipeline_wrapper.celery_task_status = celery_task_status
+        gen.pipeline_wrapper.save_to_cache()
         retval['status'] = 'success'
         retval['execution_id'] = gen.execution_id
         # Initialize the cached result
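Note (not part of the diff): the list cached here is what the new ChainedTask base class (added below in pypeline/pipeline/chained_task.py) updates as tasks finish or retry. A sketch of the cached shape, with hypothetical task names and ids:

celery_task_status = [
    {"name": "demo.tasks.step_one", "task_id": "3f2c0c7e-...", "status": "RUNNING", "retries": 0},
    {"name": "demo.tasks.step_two", "task_id": "9a41b2d0-...", "status": "RUNNING", "retries": 0},
]

ChainedTask.after_return() later replaces "status" with the task's terminal celery state, and ChainedTask.on_retry() increments "retries" for the matching task_id.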
pypeline/flask/api/utils.py
@@ -2,7 +2,8 @@
 """
 import logging
 from typing import Union
-
+
+from pypeline.pipeline.generator import PipelineGenerator
 
 logger = logging.getLogger(__name__)
 
@@ -10,7 +11,6 @@ logger = logging.getLogger(__name__)
 def chain_helper(pipeline_id: str,
                  access_key: Union[str, None] = None,
                  chain_payload: Union[dict, None] = None,
-                 add_retry: bool = True,
                  queue: Union[str, None] = None,
                  default_task_ttl: int = None):
     """ Helper method to generate a pipeline chain *with* error handling.
@@ -25,7 +25,6 @@ def chain_helper(pipeline_id: str,
                             access_key=access_key,
                             queue=queue,
                             default_task_ttl=default_task_ttl,
-                            add_retry=add_retry,
                             chain_payload=chain_payload)
     if gen.good_to_go:
         # Generate our 'chain', which is the grouping of celery constructs that
pypeline/pipeline/chained_task.py (new file)
@@ -0,0 +1,70 @@
+import logging
+from celery import Task
+from pypeline.utils.task_utils import PipelineRunWrapper
+
+
+logger = logging.getLogger(__name__)
+
+
+class ChainedTask(Task):
+    """ A Celery Task that is used as the _base_ for all dynamically
+    generated tasks (by GenerateCeleryTasks().generate()). This injects
+    `event` into every task's signature, which allows pipelines to pass
+    event information easily through a chain.
+    """
+    abstract = True
+
+    def __call__(self, *args, **kwargs):
+        """ Allow the return value of one task to update the kwargs of a
+        subsequent task if it's a dictionary. Important to the function
+        of a pipeline to allow event information to flow easily.
+        """
+        # Inject app context
+        if len(args) == 1 and isinstance(args[0], dict):
+            kwargs.update(args[0])
+            args = ()
+
+        # Event holds information used in PipelineRunWrapper and
+        # other areas.
+        if 'event' not in kwargs.keys():
+            kwargs['event'] = {}
+        # This is a special worker from dyrygent that orchestrates our
+        # pipelines. It provides a patch in fix for celery's poor
+        # implementation of Canvas work-flows
+        if self.__name__ == 'workflow_processor':
+            kwargs.pop('event', None)
+        return super(ChainedTask, self).__call__(*args, **kwargs)
+
+    def after_return(self, status, retval, task_id, args, kwargs, einfo):
+        if "event" in kwargs and "pipeline_id" in kwargs["event"]:
+            try:
+                pipeline_run_wrapper: PipelineRunWrapper = \
+                    PipelineRunWrapper.from_event(kwargs["event"])
+                current_task_status = pipeline_run_wrapper.get_task_celery_status(task_id)
+            except Exception:
+                logger.exception("Unable to retreive Pipeline Run Wrapper")
+                return
+
+            if current_task_status:
+                current_task_status["status"] = status
+                try:
+                    pipeline_run_wrapper.save_to_cache()
+                except Exception:
+                    logger.exception(f"Failed to update celery task status for task {task_id}")
+
+    def on_retry(self, exc, task_id, args, kwargs, einfo):
+        if "event" in kwargs and "pipeline_id" in kwargs["event"]:
+            try:
+                pipeline_run_wrapper: PipelineRunWrapper = \
+                    PipelineRunWrapper.from_event(kwargs["event"])
+                current_task_status = pipeline_run_wrapper.get_task_celery_status(task_id)
+            except Exception:
+                logger.exception("Unable to retreive Pipeline Run Wrapper")
+                return
+
+            if current_task_status:
+                current_task_status["retries"] = current_task_status["retries"] + 1
+                try:
+                    pipeline_run_wrapper.save_to_cache()
+                except Exception:
+                    logger.exception(f"Failed to update celery task status for task {task_id}")
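Usage note (not part of the diff): configure_celery() still assigns celery.Task = ChainedTask (see the celery.py hunk above), so handlers registered through GenerateCeleryTasks can rely on an `event` kwarg being present. A minimal sketch with a hypothetical handler:

def demo_handler(event=None, **kwargs):
    # `event` is injected by ChainedTask.__call__; when the task runs inside a
    # pipeline it carries identifiers such as pipeline_id / execution_id, which
    # after_return() / on_retry() use to look up and update the cached status.
    return {"processed": True}  # a dict return can feed the kwargs of a later task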
pypeline/pipeline/generator.py (new file)
@@ -0,0 +1,254 @@
+""" Utilities for running and managing tasks inside pipelines.
+"""
+import logging
+
+from celery import signature, chord, chain
+from pypeline.utils.graph_utils import get_chainable_tasks
+from pypeline.utils.config_utils import retrieve_latest_pipeline_config
+from pypeline.utils.task_utils import PipelineRunWrapper, get_task_signature
+from pypeline.constants import DEFAULT_TASK_TTL, DEFAULT_MAX_RETRY, \
+    DEFAULT_REGULATOR_TASK, CHAIN_FAILURE_MSG, CHAIN_SUCCESS_MSG, \
+    DEFAULT_SUCCESS_TASK
+from pypeline.pipeline_config_schema import PipelineConfigValidator
+
+
+logger = logging.getLogger(__name__)
+
+
+class PipelineGenerator(object):
+    """ Allows an API endpoint to generate a functional pipeline based on the
+    requested pipeline id. Allows API to then issue the tasks asynchronously
+    to initiate the pipeline. Thereafter, celery will monitor status and
+    handle success/failure modes so the API web worker can return
+    immediately.
+
+    The primary purpose is to unpack the pipeline config, create the
+    requisite cached entities to track pipeline progress, and apply the
+    chained pipeline tasks asynchronously so Celery can take over.
+
+    Usage:
+        gen = PipelineGenerator(pipeline_id)
+        chain = gen.generate_chain()
+        chain.on_error(custom_error_task.s())  # Optional add error handling
+        chain.delay()
+    """
+    def __init__(self,
+                 pipeline_id: str,
+                 access_key: str = None,
+                 execution_id: str = None,
+                 queue: str = None,
+                 default_task_ttl: int = None,
+                 regulator_queue: str = None,
+                 regulator_task: str = None,
+                 success_queue: str = None,
+                 success_task: str = None,
+                 default_max_retry: int = None,
+                 retry_backoff: int = None,
+                 retry_jitter: bool = None,
+                 retry_backoff_max: int = None,
+                 chain_payload: dict = None):
+        super().__init__()
+        self.pipeline_id = pipeline_id
+        self.access_key = access_key
+
+        pipeline_config_api_resp = retrieve_latest_pipeline_config(
+            pipeline_id=self.pipeline_id, access_key=self.access_key)
+
+        if pipeline_config_api_resp is None:
+            raise ValueError("Unable to load Pipeline Configuration for "
+                             f"pipeline id: {self.pipeline_id} ...")
+
+        # The only part of the API response used for any 'pipeline config'
+        # is the `config` key. The API nests it under `config` to preserve
+        # ability to add additional detail at a later date.
+        self.pipeline_config = pipeline_config_api_resp.get('config', {})
+        schema_version = pipeline_config_api_resp.get('schemaVersion')
+        PipelineConfigValidator(config_dict=self.pipeline_config,
+                                schema_version=schema_version)
+
+        self.execution_id = execution_id  # UUID string
+        self.good_to_go = False  # Indicates initialization/loading success
+        self.loading_message = None  # Allows access to success/error messages
+        self.is_retry = False if self.execution_id is None else True
+
+        self.default_max_retry = default_max_retry \
+            if default_max_retry is not None else \
+            self.pipeline_config['metadata'].get('maxRetry', DEFAULT_MAX_RETRY)
+        self.retry_backoff = retry_backoff \
+            if retry_backoff is not None else \
+            self.pipeline_config['metadata'].get('retryBackoff', 3)
+        self.retry_backoff_max = retry_backoff \
+            if retry_backoff_max is not None else \
+            self.pipeline_config['metadata'].get('retryBackoffMax', 600)
+        self.retry_jitter = retry_jitter \
+            if retry_jitter is not None else \
+            self.pipeline_config['metadata'].get('retryJitter', False)
+
+        # Queue on which to place tasks by default and default TTL per task
+        # These can be overridden in PipelineConfig.config['taskDefinitions']
+        self.queue = queue \
+            if queue is not None \
+            else self.pipeline_config['metadata']['queue']
+        self.default_task_ttl = default_task_ttl \
+            if default_task_ttl is not None else \
+            self.pipeline_config['metadata'].get('maxTtl', DEFAULT_TASK_TTL)
+
+        # See docstring in self._get_regulator()
+        self.regulator_queue = regulator_queue \
+            if regulator_queue is not None \
+            else self.pipeline_config['metadata']['queue']
+        self.regulator_task = regulator_task\
+            if regulator_task is not None else DEFAULT_REGULATOR_TASK
+
+        # See docstring in self._get_success_task()
+        self.success_queue = success_queue \
+            if success_queue is not None \
+            else self.pipeline_config['metadata']['queue']
+        self.success_task = success_task\
+            if success_task is not None else DEFAULT_SUCCESS_TASK
+
+        # Optional data to pass to each step in chain
+        self.chain_payload = chain_payload\
+            if chain_payload is not None else {}
+
+        self.pipeline_wrapper = None  # Allows access to the PipelineRunWrapper
+        self.chain = None  # Must be intentionally built with generate_chain()
+
+        try:
+            # Generate our wrapper for this pipeline_id / execution_id
+            self.pipeline_wrapper = PipelineRunWrapper(
+                pipeline_id=self.pipeline_id,
+                pipeline_config=self.pipeline_config,
+                execution_id=self.execution_id,
+                max_ttl=self.default_task_ttl,
+                max_retry=self.default_max_retry,
+                chain_payload=self.chain_payload)
+
+            # Loads pipeline config from remote or cache if it's already there
+            # `is_retry` will be True for any PipelineGenerator instantiated
+            # with an execution_id. This flag helps the wrapper increment the
+            # retry count and determine if this should be deadlettered.
+            # This step also saves the valid/initialized run wrapper to cache.
+            self.pipeline_wrapper.load(is_retry=self.is_retry)
+
+            # Set all variables that were established from the run wrapper
+            # initialization. Notably, default_task_ttl can be overloaded
+            # if the pipeline config has an explicit maxTtl set in metadata.
+            self.good_to_go = self.pipeline_wrapper.good_to_go
+            self.loading_message = self.pipeline_wrapper.loading_message
+            self.execution_id = self.pipeline_wrapper.execution_id
+
+        except Exception as e:
+            fail_msg = "Failed to load Pipeline for id {} ... {}".format(
+                self.pipeline_id, e)
+            self.loading_message = fail_msg
+            logger.error(fail_msg)
+            raise e
+
+    def _get_regulator(self):
+        """ Create a chain regulator celery task signature.
+
+        For a chain(), if each element is a group() then celery does not
+        properly adhere to the chain elements occurring sequentially. If you
+        insert a task that is not a group() in between, though, then the
+        chain operates as expected.
+        """
+        return signature(self.regulator_task,
+                         queue=self.regulator_queue,
+                         immutable=True)
+
+    def _get_success_task(self):
+        """ A final 'success' task that's added to the end of every pipeline.
+
+        This stores the 'success' state in the cached result. Users can
+        set other values by using TaskRunner().save_result()
+        """
+        return get_task_signature(task_path=self.success_task,
+                                  queue=self.success_queue,
+                                  pipeline_id=self.pipeline_id,
+                                  execution_id=self.execution_id)
+
+    def _get_signature(self, node):
+        """ Create a celery task signature based on a graph node.
+        """
+        metadata = self.pipeline_config['metadata']
+        node_config = self.pipeline_config['taskDefinitions'][node]
+
+        # Node config takes precedence, pipeline metadata as default
+        queue = node_config.get('queue', metadata['queue'])
+        max_ttl = node_config.get('maxTtl', metadata.get('maxTtl', None))
+
+        # Ensures task signatures include requisite information to retrieve
+        # PipelineRunWrapper from cache using the pipeline id, and execution id.
+        # We set immutable=True to ensure each client task can be defined
+        # with this specific signature (event)
+        # http://docs.celeryproject.org/en/master/userguide/canvas.html#immutability
+        return get_task_signature(task_path=node_config.get('handler'),
+                                  queue=queue,
+                                  access_key=self.access_key,
+                                  pipeline_id=self.pipeline_id,
+                                  execution_id=self.execution_id,
+                                  max_ttl=max_ttl,
+                                  immutable=True,
+                                  task_config=node_config)
+
+    def generate_chain(self):
+        """ Generate the full pipeline chain.
+        """
+        logger.debug(f'Starting Pipeline {self.pipeline_id}')
+
+        if not self.good_to_go:
+            logger.info("Chain deemed to be not good to go.")
+            if self.loading_message is None:
+                self.loading_message = CHAIN_FAILURE_MSG
+            return None
+
+        try:
+            # Create the task chain such that all concurrent tasks are grouped
+            # and all high level node groups are run serially
+            G = self.pipeline_wrapper.execution_graph
+
+            total_tasks = 0
+            pipeline_chain = []
+            chainable_tasks = get_chainable_tasks(G, None, [])
+
+            # Current chord+chain solution based on
+            # https://stackoverflow.com/questions/15123772/celery-chaining-groups-and-subtasks-out-of-order-execution
+            # Look also at last comment from Nov 7, 2017 here
+            # https://github.com/celery/celery/issues/3597
+            # Big outstanding bug in Celery related to failures in chords that
+            # results in really nasty log output. See
+            # https://github.com/celery/celery/issues/4834
+            for i, node_group in enumerate(chainable_tasks):
+                total_tasks += len(node_group)
+                this_group = []
+                for node in node_group:
+                    node_signature = self._get_signature(node)
+                    this_group.append(node_signature)
+
+                if len(this_group) <= 1:
+                    this_group.append(self._get_regulator())
+
+                the_chord = chord(header=this_group,
+                                  body=self._get_regulator())
+
+                pipeline_chain.append(the_chord)
+
+            # Add a 'finished/success' task to the end of all pipelines
+            pipeline_chain.append(
+                chord(header=self._get_success_task(),
+                      body=self._get_regulator()))
+
+            the_chain = chain(*pipeline_chain)
+
+            self.loading_message = CHAIN_SUCCESS_MSG
+
+            self.chain = the_chain
+        except Exception as e:
+            self.loading_message = CHAIN_FAILURE_MSG + " {}".format(e)
+            logger.exception(e)
+            the_chain = None
+
+        self.chain = the_chain
+
+        return the_chain
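Note (not part of the diff): to make the chord/chain layout concrete, this is roughly the canvas generate_chain() builds for a graph whose chainable task groups are [[a, b], [c]]. The task paths a/b/c are hypothetical; the regulator and success paths are the defaults from constants.py, and the real code attaches extra kwargs (pipeline_id, execution_id, etc.) via get_task_signature, which this sketch omits:

from celery import chain, chord, signature

a = signature('demo.tasks.a', immutable=True)  # hypothetical task paths
b = signature('demo.tasks.b', immutable=True)
c = signature('demo.tasks.c', immutable=True)
regulator = signature('pypeline.celery.task_chain_regulator', immutable=True)
success = signature('pypeline.celery.pipeline_success', immutable=True)

equivalent_canvas = chain(
    chord(header=[a, b], body=regulator),          # concurrent group of tasks
    chord(header=[c, regulator], body=regulator),  # single-task group padded with a regulator
    chord(header=success, body=regulator),         # final 'success' marker appended to every pipeline
)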