scalable-pypeline 1.1.5__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {scalable-pypeline-1.1.5/scalable_pypeline.egg-info → scalable-pypeline-1.2.1}/PKG-INFO +1 -1
  2. scalable-pypeline-1.2.1/pypeline/__init__.py +1 -0
  3. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/celery.py +22 -86
  4. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/constants.py +3 -3
  5. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/api/pipelines.py +16 -3
  6. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/api/utils.py +2 -3
  7. scalable-pypeline-1.2.1/pypeline/pipeline/chained_task.py +70 -0
  8. scalable-pypeline-1.2.1/pypeline/pipeline/generator.py +254 -0
  9. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/pipeline_config_schema.py +46 -12
  10. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/sermos_yaml.py +8 -303
  11. scalable-pypeline-1.2.1/pypeline/utils/__init__.py +0 -0
  12. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/utils/task_utils.py +22 -273
  13. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1/scalable_pypeline.egg-info}/PKG-INFO +1 -1
  14. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/scalable_pypeline.egg-info/SOURCES.txt +3 -0
  15. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/scalable_pypeline.egg-info/requires.txt +6 -4
  16. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/setup.py +8 -4
  17. scalable-pypeline-1.1.5/pypeline/__init__.py +0 -1
  18. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/LICENSE +0 -0
  19. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/MANIFEST.in +0 -0
  20. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/README.md +0 -0
  21. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/celery_beat.py +0 -0
  22. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/extensions.py +0 -0
  23. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/__init__.py +0 -0
  24. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/api/__init__.py +0 -0
  25. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/api/schedules.py +0 -0
  26. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/decorators.py +0 -0
  27. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/flask/flask_sermos.py +0 -0
  28. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/generators.py +0 -0
  29. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/logging_config.py +0 -0
  30. {scalable-pypeline-1.1.5/pypeline/utils → scalable-pypeline-1.2.1/pypeline/pipeline}/__init__.py +0 -0
  31. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/schedule_config_schema.py +0 -0
  32. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/utils/config_utils.py +0 -0
  33. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/utils/graph_utils.py +0 -0
  34. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/pypeline/utils/module_utils.py +0 -0
  35. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/requirements.txt +0 -0
  36. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/scalable_pypeline.egg-info/dependency_links.txt +0 -0
  37. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/scalable_pypeline.egg-info/entry_points.txt +0 -0
  38. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/scalable_pypeline.egg-info/top_level.txt +0 -0
  39. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/setup.cfg +0 -0
  40. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/tests/fixtures/__init__.py +0 -0
  41. {scalable-pypeline-1.1.5 → scalable-pypeline-1.2.1}/tests/fixtures/s3_fixtures.py +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scalable-pypeline
- Version: 1.1.5
+ Version: 1.2.1
  Summary: PypeLine - Python pipelines for the Real World
  Home-page: https://gitlab.com/bravos2/pypeline
  Author: Bravos Power Corporation

pypeline/__init__.py (new file)
@@ -0,0 +1 @@
+ __version__ = '1.2.1'

pypeline/celery.py
@@ -2,24 +2,19 @@
  """
  import os

- from pypeline.constants import DEFAULT_RETRY_TASK_MAX_TTL, DEFAULT_MAX_RETRY
-
  if os.environ.get('USE_GEVENT', "False").lower() == 'true':
  from gevent import monkey
  monkey.patch_all()

- import random
- import time
-
- from celery_dyrygent.tasks import register_workflow_processor
-
  import sys
  import logging
+ from pypeline.pipeline.chained_task import ChainedTask
+ from celery_dyrygent.tasks import register_workflow_processor
  from typing import List
  from celery import Celery
  from pypeline.logging_config import setup_logging
  from pypeline.utils.module_utils import SermosModuleLoader
- from pypeline.utils.task_utils import PipelineGenerator, PipelineResult, \
+ from pypeline.utils.task_utils import PipelineResult, \
  get_service_config_for_worker
  from pypeline.extensions import sermos_config, sermos_client_version
  from pypeline import __version__
@@ -37,46 +32,6 @@ setup_logging(app_version=__version__,
  overload_elasticsearch=OVERLOAD_ES,
  establish_logging_config=True)

-
- def pipeline_retry(event: dict):
- """ Handle pipeline retry and deadletter logic.
- """
- access_key = event.get('access_key', None)
- pipeline_id = event.get('pipeline_id', None)
- execution_id = event.get('execution_id', None)
- if pipeline_id is None or execution_id is None:
- logger.error(f"Unable to retry pipeline {pipeline_id} / "
- f"execution {execution_id}.")
- return False
-
- # generate_chain() will return `None` if the pipeline has exceeded
- # max retry count or other erorrs happen.
- gen = PipelineGenerator(pipeline_id=pipeline_id,
- access_key=access_key,
- execution_id=execution_id,
- queue=event.get('queue', None),
- default_task_ttl=event.get('default_task_ttl',
- None),
- add_retry=event.get('add_retry', False),
- chain_payload=event.get('chain_payload', None))
-
- if gen.good_to_go:
- chain = gen.generate_chain()
- if chain is not None:
- # Exponential backoff
- exponential_backoff = min((3 ** gen.pipeline_wrapper.retry_count) +
- (random.randint(0, 1000) / 1000),
- DEFAULT_RETRY_TASK_MAX_TTL)
- logger.debug(f"Exponential backoff sleep {exponential_backoff}")
- time.sleep(exponential_backoff)
- # Kick it off again.
- chain.apply_async()
-
- logger.warning(f"Pipeline retry was invoked for {pipeline_id} "
- f"({execution_id})")
- return True
-
-
  def task_chain_regulator(*args, **kwargs):
  """ Utility task to ensure celery properly waits between groups in a chain.

@@ -114,8 +69,6 @@ class GenerateCeleryTasks(SermosModuleLoader):
  """ Sermos provides default tasks that all workers should know about.
  """
  return [{
- 'handler': 'pypeline.celery.pipeline_retry'
- }, {
  'handler': 'pypeline.celery.task_chain_regulator'
  }, {
  'handler': 'pypeline.celery.pipeline_success'
@@ -147,6 +100,14 @@ class GenerateCeleryTasks(SermosModuleLoader):
  if not service:
  return
  for task in service.get('registeredTasks', []):
+ pipeline_meta = None
+ for pipeline_key, pipeline in sermos_config['pipelines'].items():
+ pipeline_config = pipeline["config"]
+ pipeline_tasks = [t["handler"] for t in pipeline_config["taskDefinitions"].values()]
+ if task["handler"] in pipeline_tasks:
+ pipeline_meta = pipeline_config["metadata"]
+ break
+
  try:
  worker_path = task['handler'] # Required, no default

@@ -155,7 +116,17 @@ class GenerateCeleryTasks(SermosModuleLoader):
  # Decorate the method as a celery task along with a default
  # queue if provided in config. Set ChainedTask as the base
  # which allows chained tasks to pass kwargs correctly.
- tmp_handler = self.celery.task(tmp_handler)
+ if pipeline_meta and pipeline_meta["maxRetry"] > 0:
+ tmp_handler = self.celery.task(
+ tmp_handler,
+ autoretry_for=(Exception,),
+ max_retries=pipeline_meta["maxRetry"],
+ retry_backoff=pipeline_meta["retryBackoff"],
+ retry_jitter=pipeline_meta["retryJitter"],
+ retry_backoff_max=pipeline_meta["retryBackoffMax"]
+ )
+ else:
+ tmp_handler = self.celery.task(tmp_handler)
  except Exception as e:
  logger.warning(f"Unable to add a task to celery: {e}")
  # Sermos provides default tasks that all workers should know about, add
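
In other words, when a registered task's handler appears in a pipeline whose metadata sets `maxRetry > 0`, the handler is now registered with Celery's standard automatic-retry options; the metadata keys map directly onto the decorator arguments. A minimal sketch of the assumed metadata shape and the roughly equivalent static decoration (the app instance and handler name are illustrative placeholders, not part of the package):

```python
from celery import Celery

app = Celery("example")  # illustrative app; the package configures its own instance

# Assumed shape of one pipeline's metadata block; only keys shown in the diff are used.
pipeline_meta = {
    "queue": "default",
    "maxRetry": 3,           # -> max_retries
    "retryBackoff": 2,       # -> retry_backoff (seconds, doubled each retry)
    "retryJitter": True,     # -> retry_jitter
    "retryBackoffMax": 600,  # -> retry_backoff_max (cap in seconds)
}

# Roughly what GenerateCeleryTasks.generate() now produces for such a handler.
@app.task(
    autoretry_for=(Exception,),
    max_retries=pipeline_meta["maxRetry"],
    retry_backoff=pipeline_meta["retryBackoff"],
    retry_jitter=pipeline_meta["retryJitter"],
    retry_backoff_max=pipeline_meta["retryBackoffMax"],
)
def example_handler(event=None, **kwargs):
    """Illustrative handler; real handlers are loaded from the pipeline config."""
    return {"event": event or {}}
```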
@@ -173,40 +144,6 @@ def configure_celery(celery: Celery):
  REDIS_URL = os.environ.get('REDIS_URL', 'redis://localhost:6379/0')
  CELERY_BROKER_URL = os.environ.get('CELERY_BROKER_URL', REDIS_URL)
  CELERY_RESULT_BACKEND = os.environ.get('CELERY_RESULT_BACKEND', REDIS_URL)
- TaskBase = celery.Task
-
- class ChainedTask(TaskBase):
- """ A Celery Task that is used as the _base_ for all dynamically
- generated tasks (by GenerateCeleryTasks().generate()). This injects
- `event` into every task's signature, which allows pipelines to pass
- event information easily through a chain.
- """
- abstract = True
- autoretry_for = (Exception,)
- max_retries = DEFAULT_MAX_RETRY
- retry_backoff = True
- retry_jitter = True
-
- def __call__(self, *args, **kwargs):
- """ Allow the return value of one task to update the kwargs of a
- subsequent task if it's a dictionary. Important to the function
- of a pipeline to allow event information to flow easily.
- """
- # Inject app context
- if len(args) == 1 and isinstance(args[0], dict):
- kwargs.update(args[0])
- args = ()
-
- # Event holds information used in PipelineRunWrapper and
- # other areas.
- if 'event' not in kwargs.keys():
- kwargs['event'] = {}
- # This is a special worker from dyrygent that orchestrates our
- # pipelines. It provides a patch in fix for celery's poor
- # implementation of Canvas work-flows
- if self.__name__ == 'workflow_processor':
- kwargs.pop('event', None)
- return super(ChainedTask, self).__call__(*args, **kwargs)

  celery.Task = ChainedTask

@@ -230,7 +167,6 @@ def configure_celery(celery: Celery):
  # as the result backend, as Rabbit has horrible support as backend.
  celery.conf.result_backend = CELERY_RESULT_BACKEND
  celery.conf.task_ignore_result = False # Must not ignore for Chords
- celery.conf.task_acks_late = False # Check per worker
  celery.conf.result_expires = int(
  os.environ.get('CELERY_RESULT_EXPIRES', 10800)) # 3 hours by default
  celery.conf.broker_pool_limit = int(os.environ.get('BROKER_POOL_LIMIT',

pypeline/constants.py
@@ -7,11 +7,9 @@ API_PATH_V1 = '/api/v1'

  DEFAULT_RESULT_TTL = 86400 # seconds (1 day)
  DEFAULT_TASK_TTL = 60 # seconds (1 minute)
- DEFAULT_RETRY_TASK_MAX_TTL = 300
- DEFAULT_MAX_RETRY = 10
+ DEFAULT_MAX_RETRY = 3
  DEFAULT_REGULATOR_TASK = 'pypeline.celery.task_chain_regulator'
  DEFAULT_SUCCESS_TASK = 'pypeline.celery.pipeline_success'
- DEFAULT_RETRY_TASK = 'pypeline.celery.pipeline_retry'

  CHAIN_SUCCESS_MSG = 'Chain built successfully ...'
  CHAIN_FAILURE_MSG = 'Chain failed to build ...'
@@ -52,6 +50,8 @@ DEPLOYMENTS_SERVICE_URL = "{}deployments/{}/services/{}"
  DEFAULT_AUTH_URL = urljoin(DEFAULT_BASE_URL, 'auth')
  USING_SERMOS_CLOUD = DEFAULT_BASE_URL != LOCAL_DEPLOYMENT_VALUE
  DEFAULT_CONFIG_RETRIEVAL_PAGE_SIZE = 25
+ WORKFLOW_PROCESSOR_QUEUE = os.environ.get('WORKFLOW_PROCESSOR_QUEUE', 'default')
+
  # Default 'responses' dictionary when decorating endpoints with @api.doc()
  # Extend as necessary.
  API_DOC_RESPONSES = {

pypeline/flask/api/pipelines.py
@@ -9,7 +9,8 @@ from flask_smorest import Blueprint
  from flask.views import MethodView
  from marshmallow import Schema, fields
  from marshmallow.exceptions import ValidationError
- from pypeline.constants import API_DOC_RESPONSES, API_DOC_PARAMS, API_PATH_V1
+ from pypeline.constants import API_DOC_RESPONSES, API_DOC_PARAMS, API_PATH_V1,\
+ WORKFLOW_PROCESSOR_QUEUE
  from pypeline.flask.decorators import require_accesskey
  from pypeline.flask.api.utils import chain_helper
  from pypeline.utils.task_utils import PipelineResult
@@ -140,7 +141,6 @@ class PipelineInvoke(MethodView):
  def post(self, payload: dict, pipeline_id: str):
  """ Invoke a pipeline by it's ID; optionally provide pipeline arguments.
  """
-
  access_key = request.headers.get('accesskey')
  pipeline_config = retrieve_latest_pipeline_config(
  pipeline_id=pipeline_id, access_key=access_key)
@@ -164,9 +164,22 @@ class PipelineInvoke(MethodView):
  abort(400, message=gen.loading_message)

  chain: _chain = gen.chain
- wf: Workflow = Workflow()
+ wf: Workflow = Workflow({"queue": WORKFLOW_PROCESSOR_QUEUE})
  wf.add_celery_canvas(chain)
  wf.apply_async()
+
+ celery_task_status = []
+ for node in wf.nodes:
+ celery_task = dict(
+ name=wf.nodes[node].signature.name,
+ task_id=node,
+ status="RUNNING",
+ retries=0
+ )
+ celery_task_status.append(celery_task)
+
+ gen.pipeline_wrapper.celery_task_status = celery_task_status
+ gen.pipeline_wrapper.save_to_cache()
  retval['status'] = 'success'
  retval['execution_id'] = gen.execution_id
  # Initialize the cached result
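
The per-task status entries written here are what the new `ChainedTask` hooks later update: `after_return` overwrites `status` with the task's terminal state and `on_retry` increments `retries`. A small sketch of one cached entry's lifecycle (the handler name and task id are illustrative values, not real output):

```python
# Hypothetical snapshot of one celery_task_status entry right after invocation ...
entry = {
    "name": "my_app.tasks.extract",                     # illustrative signature name
    "task_id": "0f1e2d3c-4b5a-6789-abcd-ef0123456789",  # illustrative node id
    "status": "RUNNING",
    "retries": 0,
}

# ... and roughly what it would look like after one retry and a successful finish,
# once ChainedTask.on_retry and ChainedTask.after_return have run.
entry_after_run = {**entry, "retries": 1, "status": "SUCCESS"}
```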

pypeline/flask/api/utils.py
@@ -2,7 +2,8 @@
  """
  import logging
  from typing import Union
- from pypeline.utils.task_utils import PipelineGenerator
+
+ from pypeline.pipeline.generator import PipelineGenerator

  logger = logging.getLogger(__name__)

@@ -10,7 +11,6 @@ logger = logging.getLogger(__name__)
  def chain_helper(pipeline_id: str,
  access_key: Union[str, None] = None,
  chain_payload: Union[dict, None] = None,
- add_retry: bool = True,
  queue: Union[str, None] = None,
  default_task_ttl: int = None):
  """ Helper method to generate a pipeline chain *with* error handling.
@@ -25,7 +25,6 @@ def chain_helper(pipeline_id: str,
  access_key=access_key,
  queue=queue,
  default_task_ttl=default_task_ttl,
- add_retry=add_retry,
  chain_payload=chain_payload)
  if gen.good_to_go:
  # Generate our 'chain', which is the grouping of celery constructs that

pypeline/pipeline/chained_task.py (new file)
@@ -0,0 +1,70 @@
+ import logging
+ from celery import Task
+ from pypeline.utils.task_utils import PipelineRunWrapper
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class ChainedTask(Task):
+ """ A Celery Task that is used as the _base_ for all dynamically
+ generated tasks (by GenerateCeleryTasks().generate()). This injects
+ `event` into every task's signature, which allows pipelines to pass
+ event information easily through a chain.
+ """
+ abstract = True
+
+ def __call__(self, *args, **kwargs):
+ """ Allow the return value of one task to update the kwargs of a
+ subsequent task if it's a dictionary. Important to the function
+ of a pipeline to allow event information to flow easily.
+ """
+ # Inject app context
+ if len(args) == 1 and isinstance(args[0], dict):
+ kwargs.update(args[0])
+ args = ()
+
+ # Event holds information used in PipelineRunWrapper and
+ # other areas.
+ if 'event' not in kwargs.keys():
+ kwargs['event'] = {}
+ # This is a special worker from dyrygent that orchestrates our
+ # pipelines. It provides a patch in fix for celery's poor
+ # implementation of Canvas work-flows
+ if self.__name__ == 'workflow_processor':
+ kwargs.pop('event', None)
+ return super(ChainedTask, self).__call__(*args, **kwargs)
+
+ def after_return(self, status, retval, task_id, args, kwargs, einfo):
+ if "event" in kwargs and "pipeline_id" in kwargs["event"]:
+ try:
+ pipeline_run_wrapper: PipelineRunWrapper = \
+ PipelineRunWrapper.from_event(kwargs["event"])
+ current_task_status = pipeline_run_wrapper.get_task_celery_status(task_id)
+ except Exception:
+ logger.exception("Unable to retreive Pipeline Run Wrapper")
+ return
+
+ if current_task_status:
+ current_task_status["status"] = status
+ try:
+ pipeline_run_wrapper.save_to_cache()
+ except Exception:
+ logger.exception(f"Failed to update celery task status for task {task_id}")
+
+ def on_retry(self, exc, task_id, args, kwargs, einfo):
+ if "event" in kwargs and "pipeline_id" in kwargs["event"]:
+ try:
+ pipeline_run_wrapper: PipelineRunWrapper = \
+ PipelineRunWrapper.from_event(kwargs["event"])
+ current_task_status = pipeline_run_wrapper.get_task_celery_status(task_id)
+ except Exception:
+ logger.exception("Unable to retreive Pipeline Run Wrapper")
+ return
+
+ if current_task_status:
+ current_task_status["retries"] = current_task_status["retries"] + 1
+ try:
+ pipeline_run_wrapper.save_to_cache()
+ except Exception:
+ logger.exception(f"Failed to update celery task status for task {task_id}")
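
As a usage illustration of the kwarg-passing behavior `ChainedTask.__call__` provides, a dict returned by one task in a chain is merged into the next task's keyword arguments. A minimal sketch, assuming an app configured the same way `configure_celery()` does (`celery.Task = ChainedTask`); the app name, task bodies, and invocation are placeholders:

```python
from celery import Celery, chain
from pypeline.pipeline.chained_task import ChainedTask

app = Celery("example")   # illustrative app instance
app.Task = ChainedTask    # mirrors configure_celery() in pypeline/celery.py

@app.task
def fetch(event=None, **kwargs):
    # Returning a dict lets ChainedTask.__call__ fold it into the next
    # task's kwargs instead of passing it as a single positional argument.
    return {"event": event or {}, "record_id": 42}

@app.task
def process(record_id=None, event=None, **kwargs):
    # Receives record_id=42 plus the propagated event dict from fetch().
    return {"event": event, "processed_record": record_id}

# Illustrative invocation (requires a running broker/worker):
# chain(fetch.s(), process.s()).apply_async()
```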

pypeline/pipeline/generator.py (new file)
@@ -0,0 +1,254 @@
+ """ Utilities for running and managing tasks inside pipelines.
+ """
+ import logging
+
+ from celery import signature, chord, chain
+ from pypeline.utils.graph_utils import get_chainable_tasks
+ from pypeline.utils.config_utils import retrieve_latest_pipeline_config
+ from pypeline.utils.task_utils import PipelineRunWrapper, get_task_signature
+ from pypeline.constants import DEFAULT_TASK_TTL, DEFAULT_MAX_RETRY, \
+ DEFAULT_REGULATOR_TASK, CHAIN_FAILURE_MSG, CHAIN_SUCCESS_MSG, \
+ DEFAULT_SUCCESS_TASK
+ from pypeline.pipeline_config_schema import PipelineConfigValidator
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class PipelineGenerator(object):
+ """ Allows an API endpoint to generate a functional pipeline based on the
+ requested pipeline id. Allows API to then issue the tasks asynchronously
+ to initiate the pipeline. Thereafter, celery will monitor status and
+ handle success/failure modes so the API web worker can return
+ immediately.
+
+ The primary purpose is to unpack the pipeline config, create the
+ requisite cached entities to track pipeline progress, and apply the
+ chained pipeline tasks asynchronously so Celery can take over.
+
+ Usage:
+ gen = PipelineGenerator(pipeline_id)
+ chain = gen.generate_chain()
+ chain.on_error(custom_error_task.s()) # Optional add error handling
+ chain.delay()
+ """
+ def __init__(self,
+ pipeline_id: str,
+ access_key: str = None,
+ execution_id: str = None,
+ queue: str = None,
+ default_task_ttl: int = None,
+ regulator_queue: str = None,
+ regulator_task: str = None,
+ success_queue: str = None,
+ success_task: str = None,
+ default_max_retry: int = None,
+ retry_backoff: int = None,
+ retry_jitter: bool = None,
+ retry_backoff_max: int = None,
+ chain_payload: dict = None):
+ super().__init__()
+ self.pipeline_id = pipeline_id
+ self.access_key = access_key
+
+ pipeline_config_api_resp = retrieve_latest_pipeline_config(
+ pipeline_id=self.pipeline_id, access_key=self.access_key)
+
+ if pipeline_config_api_resp is None:
+ raise ValueError("Unable to load Pipeline Configuration for "
+ f"pipeline id: {self.pipeline_id} ...")
+
+ # The only part of the API response used for any 'pipeline config'
+ # is the `config` key. The API nests it under `config` to preserve
+ # ability to add additional detail at a later date.
+ self.pipeline_config = pipeline_config_api_resp.get('config', {})
+ schema_version = pipeline_config_api_resp.get('schemaVersion')
+ PipelineConfigValidator(config_dict=self.pipeline_config,
+ schema_version=schema_version)
+
+ self.execution_id = execution_id # UUID string
+ self.good_to_go = False # Indicates initialization/loading success
+ self.loading_message = None # Allows access to success/error messages
+ self.is_retry = False if self.execution_id is None else True
+
+ self.default_max_retry = default_max_retry \
+ if default_max_retry is not None else \
+ self.pipeline_config['metadata'].get('maxRetry', DEFAULT_MAX_RETRY)
+ self.retry_backoff = retry_backoff \
+ if retry_backoff is not None else \
+ self.pipeline_config['metadata'].get('retryBackoff', 3)
+ self.retry_backoff_max = retry_backoff \
+ if retry_backoff_max is not None else \
+ self.pipeline_config['metadata'].get('retryBackoffMax', 600)
+ self.retry_jitter = retry_jitter \
+ if retry_jitter is not None else \
+ self.pipeline_config['metadata'].get('retryJitter', False)
+
+ # Queue on which to place tasks by default and default TTL per task
+ # These can be overridden in PipelineConfig.config['taskDefinitions']
+ self.queue = queue \
+ if queue is not None \
+ else self.pipeline_config['metadata']['queue']
+ self.default_task_ttl = default_task_ttl \
+ if default_task_ttl is not None else \
+ self.pipeline_config['metadata'].get('maxTtl', DEFAULT_TASK_TTL)
+
+ # See docstring in self._get_regulator()
+ self.regulator_queue = regulator_queue \
+ if regulator_queue is not None \
+ else self.pipeline_config['metadata']['queue']
+ self.regulator_task = regulator_task\
+ if regulator_task is not None else DEFAULT_REGULATOR_TASK
+
+ # See docstring in self._get_success_task()
+ self.success_queue = success_queue \
+ if success_queue is not None \
+ else self.pipeline_config['metadata']['queue']
+ self.success_task = success_task\
+ if success_task is not None else DEFAULT_SUCCESS_TASK
+
+ # Optional data to pass to each step in chain
+ self.chain_payload = chain_payload\
+ if chain_payload is not None else {}
+
+ self.pipeline_wrapper = None # Allows access to the PipelineRunWrapper
+ self.chain = None # Must be intentionally built with generate_chain()
+
+ try:
+ # Generate our wrapper for this pipeline_id / execution_id
+ self.pipeline_wrapper = PipelineRunWrapper(
+ pipeline_id=self.pipeline_id,
+ pipeline_config=self.pipeline_config,
+ execution_id=self.execution_id,
+ max_ttl=self.default_task_ttl,
+ max_retry=self.default_max_retry,
+ chain_payload=self.chain_payload)
+
+ # Loads pipeline config from remote or cache if it's already there
+ # `is_retry` will be True for any PipelineGenerator instantiated
+ # with an execution_id. This flag helps the wrapper increment the
+ # retry count and determine if this should be deadlettered.
+ # This step also saves the valid/initialized run wrapper to cache.
+ self.pipeline_wrapper.load(is_retry=self.is_retry)
+
+ # Set all variables that were established from the run wrapper
+ # initialization. Notably, default_task_ttl can be overloaded
+ # if the pipeline config has an explicit maxTtl set in metadata.
+ self.good_to_go = self.pipeline_wrapper.good_to_go
+ self.loading_message = self.pipeline_wrapper.loading_message
+ self.execution_id = self.pipeline_wrapper.execution_id
+
+ except Exception as e:
+ fail_msg = "Failed to load Pipeline for id {} ... {}".format(
+ self.pipeline_id, e)
+ self.loading_message = fail_msg
+ logger.error(fail_msg)
+ raise e
+
+ def _get_regulator(self):
+ """ Create a chain regulator celery task signature.
+
+ For a chain(), if each element is a group() then celery does not
+ properly adhere to the chain elements occurring sequentially. If you
+ insert a task that is not a group() in between, though, then the
+ chain operates as expected.
+ """
+ return signature(self.regulator_task,
+ queue=self.regulator_queue,
+ immutable=True)
+
+ def _get_success_task(self):
+ """ A final 'success' task that's added to the end of every pipeline.
+
+ This stores the 'success' state in the cached result. Users can
+ set other values by using TaskRunner().save_result()
+ """
+ return get_task_signature(task_path=self.success_task,
+ queue=self.success_queue,
+ pipeline_id=self.pipeline_id,
+ execution_id=self.execution_id)
+
+ def _get_signature(self, node):
+ """ Create a celery task signature based on a graph node.
+ """
+ metadata = self.pipeline_config['metadata']
+ node_config = self.pipeline_config['taskDefinitions'][node]
+
+ # Node config takes precedence, pipeline metadata as default
+ queue = node_config.get('queue', metadata['queue'])
+ max_ttl = node_config.get('maxTtl', metadata.get('maxTtl', None))
+
+ # Ensures task signatures include requisite information to retrieve
+ # PipelineRunWrapper from cache using the pipeline id, and execution id.
+ # We set immutable=True to ensure each client task can be defined
+ # with this specific signature (event)
+ # http://docs.celeryproject.org/en/master/userguide/canvas.html#immutability
+ return get_task_signature(task_path=node_config.get('handler'),
+ queue=queue,
+ access_key=self.access_key,
+ pipeline_id=self.pipeline_id,
+ execution_id=self.execution_id,
+ max_ttl=max_ttl,
+ immutable=True,
+ task_config=node_config)
+
+ def generate_chain(self):
+ """ Generate the full pipeline chain.
+ """
+ logger.debug(f'Starting Pipeline {self.pipeline_id}')
+
+ if not self.good_to_go:
+ logger.info("Chain deemed to be not good to go.")
+ if self.loading_message is None:
+ self.loading_message = CHAIN_FAILURE_MSG
+ return None
+
+ try:
+ # Create the task chain such that all concurrent tasks are grouped
+ # and all high level node groups are run serially
+ G = self.pipeline_wrapper.execution_graph
+
+ total_tasks = 0
+ pipeline_chain = []
+ chainable_tasks = get_chainable_tasks(G, None, [])
+
+ # Current chord+chain solution based on
+ # https://stackoverflow.com/questions/15123772/celery-chaining-groups-and-subtasks-out-of-order-execution
+ # Look also at last comment from Nov 7, 2017 here
+ # https://github.com/celery/celery/issues/3597
+ # Big outstanding bug in Celery related to failures in chords that
+ # results in really nasty log output. See
+ # https://github.com/celery/celery/issues/4834
+ for i, node_group in enumerate(chainable_tasks):
+ total_tasks += len(node_group)
+ this_group = []
+ for node in node_group:
+ node_signature = self._get_signature(node)
+ this_group.append(node_signature)
+
+ if len(this_group) <= 1:
+ this_group.append(self._get_regulator())
+
+ the_chord = chord(header=this_group,
+ body=self._get_regulator())
+
+ pipeline_chain.append(the_chord)
+
+ # Add a 'finished/success' task to the end of all pipelines
+ pipeline_chain.append(
+ chord(header=self._get_success_task(),
+ body=self._get_regulator()))
+
+ the_chain = chain(*pipeline_chain)
+
+ self.loading_message = CHAIN_SUCCESS_MSG
+
+ self.chain = the_chain
+ except Exception as e:
+ self.loading_message = CHAIN_FAILURE_MSG + " {}".format(e)
+ logger.exception(e)
+ the_chain = None
+
+ self.chain = the_chain
+
+ return the_chain
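
For a concrete picture of the canvas `generate_chain()` builds, suppose `get_chainable_tasks()` resolves an illustrative DAG into the node groups `[['extract'], ['transform_a', 'transform_b'], ['load']]`. A hedged sketch of the resulting structure, using plain `signature()` calls in place of the package's `get_task_signature()` (which additionally embeds the pipeline and execution ids into each task's `event`); all handler names and queues below are illustrative:

```python
from celery import chain, chord, signature

def regulator():
    # Fresh regulator signature each time, as _get_regulator() does.
    return signature('pypeline.celery.task_chain_regulator',
                     queue='default', immutable=True)

def task(name):
    # Simplified stand-in for _get_signature(); handler names are made up.
    return signature(name, queue='default', immutable=True)

illustrative_chain = chain(
    # Single-node groups get a regulator appended to the chord header.
    chord(header=[task('my_app.tasks.extract'), regulator()], body=regulator()),
    # Concurrent nodes share one chord header and run in parallel.
    chord(header=[task('my_app.tasks.transform_a'),
                  task('my_app.tasks.transform_b')], body=regulator()),
    chord(header=[task('my_app.tasks.load'), regulator()], body=regulator()),
    # Every pipeline ends with the default 'success' task wrapped the same way.
    chord(header=task('pypeline.celery.pipeline_success'), body=regulator()),
)

# In pipelines.py this chain is not applied directly; it is handed to
# celery_dyrygent's Workflow:
#   wf = Workflow({"queue": WORKFLOW_PROCESSOR_QUEUE})
#   wf.add_celery_canvas(illustrative_chain)
#   wf.apply_async()
```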