scalable-pypeline 1.2.3__py2.py3-none-any.whl → 2.0.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. pypeline/__init__.py +1 -1
  2. pypeline/barrier.py +34 -0
  3. pypeline/composition.py +349 -0
  4. pypeline/constants.py +51 -84
  5. pypeline/dramatiq.py +470 -0
  6. pypeline/extensions.py +9 -8
  7. pypeline/flask/__init__.py +3 -5
  8. pypeline/flask/api/pipelines.py +109 -148
  9. pypeline/flask/api/schedules.py +14 -39
  10. pypeline/flask/decorators.py +18 -53
  11. pypeline/flask/flask_pypeline.py +156 -0
  12. pypeline/middleware.py +61 -0
  13. pypeline/pipeline_config_schema.py +105 -92
  14. pypeline/pypeline_yaml.py +458 -0
  15. pypeline/schedule_config_schema.py +35 -120
  16. pypeline/utils/config_utils.py +52 -310
  17. pypeline/utils/module_utils.py +35 -71
  18. pypeline/utils/pipeline_utils.py +161 -0
  19. scalable_pypeline-2.0.2.dist-info/METADATA +217 -0
  20. scalable_pypeline-2.0.2.dist-info/RECORD +27 -0
  21. scalable_pypeline-2.0.2.dist-info/entry_points.txt +3 -0
  22. tests/fixtures/__init__.py +0 -1
  23. pypeline/celery.py +0 -206
  24. pypeline/celery_beat.py +0 -254
  25. pypeline/flask/api/utils.py +0 -35
  26. pypeline/flask/flask_sermos.py +0 -156
  27. pypeline/generators.py +0 -196
  28. pypeline/logging_config.py +0 -171
  29. pypeline/pipeline/__init__.py +0 -0
  30. pypeline/pipeline/chained_task.py +0 -70
  31. pypeline/pipeline/generator.py +0 -254
  32. pypeline/sermos_yaml.py +0 -442
  33. pypeline/utils/graph_utils.py +0 -144
  34. pypeline/utils/task_utils.py +0 -552
  35. scalable_pypeline-1.2.3.dist-info/METADATA +0 -163
  36. scalable_pypeline-1.2.3.dist-info/RECORD +0 -33
  37. scalable_pypeline-1.2.3.dist-info/entry_points.txt +0 -2
  38. tests/fixtures/s3_fixtures.py +0 -52
  39. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/LICENSE +0 -0
  40. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/WHEEL +0 -0
  41. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/top_level.txt +0 -0
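The headline change in this diff is the task backend: the Celery-era modules (pypeline/celery.py, pypeline/celery_beat.py, pypeline/pipeline/chained_task.py, pypeline/pipeline/generator.py, pypeline/sermos_yaml.py) are removed in 2.0.2, while Dramatiq-oriented modules (pypeline/dramatiq.py, pypeline/composition.py, pypeline/middleware.py, pypeline/barrier.py, pypeline/pypeline_yaml.py) are added. For orientation only, a minimal plain-Dramatiq sketch follows; it uses Dramatiq's public API directly rather than pypeline's own wrappers, and the broker choice and actor are illustrative:

import dramatiq
from dramatiq.brokers.stub import StubBroker

# An in-memory broker keeps the sketch self-contained; a real deployment
# would configure a RabbitMQ or Redis broker instead.
broker = StubBroker()
dramatiq.set_broker(broker)


@dramatiq.actor
def process_record(record_id):
    # Hypothetical work unit; the 2.x pipeline modules presumably build on
    # actors like this.
    print(f"processing {record_id}")


process_record.send("record-123")  # enqueue a message for a worker to consume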
@@ -1,171 +0,0 @@
1
- import os
2
- import logging
3
- import logging.config
4
- from pypeline import __version__
5
- from logging import StreamHandler
6
-
7
- logging_set = False
8
-
9
-
10
- def get_log_level(level: str = None) -> int:
11
- """ Attempt to get the log level from the environment, otherwise use the
12
- default INFO level. The environment variable LOG_LEVEL should be e.g.,
13
- 'DEBUG'
14
- """
15
- if level is not None:
16
- level_str = str(level)
17
- else:
18
- level_str = os.environ.get('LOG_LEVEL', 'INFO')
19
- return getattr(logging, level_str)
20
-
21
-
22
- def get_log_format(type: str = 'standard',
23
- app_version: str = None,
24
- client_version: str = None):
25
- """ Standard log format. Supports `standard` and `simple`
26
- """
27
- if app_version is None:
28
- app_version = "?"
29
- if client_version is None:
30
- client_version = "?"
31
-
32
- format = '%(message)s'
33
- if type == 'standard':
34
- format = '%(process)d - %(levelname)s - %(asctime)s - '\
35
- + '%(filename)s (%(lineno)d) - '\
36
- + 'sermos v{} - client v{} - %(message)s'\
37
- .format(app_version, client_version)
38
- elif type == 'simple':
39
- format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
40
-
41
- return format
42
-
43
-
44
- def get_date_format():
45
- """ Standard date format
46
- """
47
- return '%Y-%m-%dT%H:%M:%S'
48
-
49
-
50
- def setup_logging(app_version: str = None,
51
- client_version: str = None,
52
- default_level: str = None,
53
- overload_elasticsearch: bool = False,
54
- establish_logging_config: bool = True):
55
- """ Setup logging configuration for standard streaming output + optional
56
- log aggregator.
57
-
58
- Standard usage is to invoke this at application bootstrapping time
59
- to establish default log handling. e.g.
60
-
61
- def create_app():
62
- setup_logging()
63
-
64
- Individual application modules should load a logger like normal:
65
- import logging
66
- logger = logging.getLogger(__name__)
67
-
68
- elasticsearch-py is overly verbose with it's 'info' logging. This
69
- will set that logger to `warning` if `overload_elasticsearch` is True
70
-
71
- `establish_logging_config` is intended to be used by something invoking
72
- setup_logging() explicitly with the intention of setting the final
73
- configuration, which is the default behavior. Set this to `False` in the
74
- case where you might not be sure if logging has been set up yet.
75
- """
76
- global logging_set
77
-
78
- if logging_set and not establish_logging_config:
79
- return
80
-
81
- if establish_logging_config or not logging_set:
82
- logging_set = True
83
-
84
- # Set our application version values, which can be passed to this method.
85
- # By default, we report the app versions for sermos and the client
86
- A_VERSION = __version__ # sermos version
87
- CA_VERSION = None # application version of client app using sermos
88
- if app_version is not None:
89
- A_VERSION = app_version
90
- if client_version is not None:
91
- CA_VERSION = client_version
92
-
93
- log_level = get_log_level(default_level)
94
-
95
- config = {
96
- 'disable_existing_loggers': False,
97
- 'version': 1,
98
- 'formatters': {
99
- 'simple': {
100
- 'format':
101
- get_log_format(type='simple',
102
- app_version=A_VERSION,
103
- client_version=CA_VERSION),
104
- 'datefmt':
105
- get_date_format()
106
- },
107
- 'standard': {
108
- 'format':
109
- get_log_format(type='standard',
110
- app_version=A_VERSION,
111
- client_version=CA_VERSION),
112
- 'datefmt':
113
- get_date_format()
114
- },
115
- },
116
- 'handlers': {
117
- 'consoleFull': {
118
- 'level': 'DEBUG',
119
- 'formatter': 'standard',
120
- 'class': 'logging.StreamHandler',
121
- 'stream': 'ext://sys.stdout'
122
- },
123
- },
124
- 'loggers': {
125
- '': {
126
- 'handlers': ['consoleFull'],
127
- 'level': 'ERROR',
128
- },
129
- 'sermos': {
130
- 'handlers': ['consoleFull'],
131
- 'level': 'DEBUG',
132
- 'propagate': False
133
- },
134
- 'timing': {
135
- 'handlers': ['consoleFull'],
136
- 'level': 'DEBUG',
137
- 'propagate': False
138
- },
139
- 'celery': {
140
- 'handlers': ['consoleFull'],
141
- 'level': 'DEBUG',
142
- 'propagate': False
143
- },
144
- 'bin': {
145
- 'handlers': ['consoleFull'],
146
- 'level': 'DEBUG',
147
- 'propagate': False
148
- },
149
- },
150
- 'root': {
151
- 'level': 'DEBUG',
152
- 'handlers': ['consoleFull']
153
- }
154
- }
155
-
156
- for handler, handler_config in config['handlers'].items():
157
- # Override this handler's level to the level passed to this method
158
- handler_config['level'] = log_level
159
- config['handlers'][handler] = handler_config
160
-
161
- # Set the root handler's level
162
- config['root']['level'] = log_level
163
-
164
- logging.config.dictConfig(config)
165
-
166
- es_logger = logging.getLogger('elasticsearch')
167
- if overload_elasticsearch is True:
168
- es_logger.setLevel(logging.WARNING)
169
- else:
170
- # Ensure to set to baseline in the event this is invoked multiple times.
171
- es_logger.setLevel(logging.INFO)
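The removed pypeline/logging_config.py above is a wrapper around the standard library's logging.config.dictConfig. A stripped-down sketch of that pattern (stdlib only; the 'myapp' logger name is illustrative, not from the package) shows the core idea: a single stdout handler whose level is set at configuration time, a named logger that does not propagate, and a root fallback level:

import logging
import logging.config

config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'standard': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            'datefmt': '%Y-%m-%dT%H:%M:%S',
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'stream': 'ext://sys.stdout',
            'formatter': 'standard',
            'level': 'INFO',
        },
    },
    'loggers': {
        # Named logger for application code; does not propagate to root.
        'myapp': {'handlers': ['console'], 'level': 'DEBUG', 'propagate': False},
    },
    # Everything else falls back to the root logger at ERROR.
    'root': {'handlers': ['console'], 'level': 'ERROR'},
}

logging.config.dictConfig(config)
logging.getLogger('myapp').info('logging configured')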
pypeline/pipeline/__init__.py
File without changes
pypeline/pipeline/chained_task.py
@@ -1,70 +0,0 @@
-import logging
-from celery import Task
-from pypeline.utils.task_utils import PipelineRunWrapper
-
-
-logger = logging.getLogger(__name__)
-
-
-class ChainedTask(Task):
-    """ A Celery Task that is used as the _base_ for all dynamically
-    generated tasks (by GenerateCeleryTasks().generate()). This injects
-    `event` into every task's signature, which allows pipelines to pass
-    event information easily through a chain.
-    """
-    abstract = True
-
-    def __call__(self, *args, **kwargs):
-        """ Allow the return value of one task to update the kwargs of a
-        subsequent task if it's a dictionary. Important to the function
-        of a pipeline to allow event information to flow easily.
-        """
-        # Inject app context
-        if len(args) == 1 and isinstance(args[0], dict):
-            kwargs.update(args[0])
-            args = ()
-
-        # Event holds information used in PipelineRunWrapper and
-        # other areas.
-        if 'event' not in kwargs.keys():
-            kwargs['event'] = {}
-        # This is a special worker from dyrygent that orchestrates our
-        # pipelines. It provides a patch in fix for celery's poor
-        # implementation of Canvas work-flows
-        if self.__name__ == 'workflow_processor':
-            kwargs.pop('event', None)
-        return super(ChainedTask, self).__call__(*args, **kwargs)
-
-    def after_return(self, status, retval, task_id, args, kwargs, einfo):
-        if "event" in kwargs and "pipeline_id" in kwargs["event"]:
-            try:
-                pipeline_run_wrapper: PipelineRunWrapper = \
-                    PipelineRunWrapper.from_event(kwargs["event"])
-                current_task_status = pipeline_run_wrapper.get_task_celery_status(task_id)
-            except Exception:
-                logger.exception("Unable to retreive Pipeline Run Wrapper")
-                return
-
-            if current_task_status:
-                current_task_status["status"] = status
-                try:
-                    pipeline_run_wrapper.save_to_cache()
-                except Exception:
-                    logger.exception(f"Failed to update celery task status for task {task_id}")
-
-    def on_retry(self, exc, task_id, args, kwargs, einfo):
-        if "event" in kwargs and "pipeline_id" in kwargs["event"]:
-            try:
-                pipeline_run_wrapper: PipelineRunWrapper = \
-                    PipelineRunWrapper.from_event(kwargs["event"])
-                current_task_status = pipeline_run_wrapper.get_task_celery_status(task_id)
-            except Exception:
-                logger.exception("Unable to retreive Pipeline Run Wrapper")
-                return
-
-            if current_task_status:
-                current_task_status["retries"] = current_task_status["retries"] + 1
-                try:
-                    pipeline_run_wrapper.save_to_cache()
-                except Exception:
-                    logger.exception(f"Failed to update celery task status for task {task_id}")
pypeline/pipeline/generator.py
@@ -1,254 +0,0 @@
-""" Utilities for running and managing tasks inside pipelines.
-"""
-import logging
-
-from celery import signature, chord, chain
-from pypeline.utils.graph_utils import get_chainable_tasks
-from pypeline.utils.config_utils import retrieve_latest_pipeline_config
-from pypeline.utils.task_utils import PipelineRunWrapper, get_task_signature
-from pypeline.constants import DEFAULT_TASK_TTL, DEFAULT_MAX_RETRY, \
-    DEFAULT_REGULATOR_TASK, CHAIN_FAILURE_MSG, CHAIN_SUCCESS_MSG, \
-    DEFAULT_SUCCESS_TASK
-from pypeline.pipeline_config_schema import PipelineConfigValidator
-
-
-logger = logging.getLogger(__name__)
-
-
-class PipelineGenerator(object):
-    """ Allows an API endpoint to generate a functional pipeline based on the
-    requested pipeline id. Allows API to then issue the tasks asynchronously
-    to initiate the pipeline. Thereafter, celery will monitor status and
-    handle success/failure modes so the API web worker can return
-    immediately.
-
-    The primary purpose is to unpack the pipeline config, create the
-    requisite cached entities to track pipeline progress, and apply the
-    chained pipeline tasks asynchronously so Celery can take over.
-
-    Usage:
-        gen = PipelineGenerator(pipeline_id)
-        chain = gen.generate_chain()
-        chain.on_error(custom_error_task.s())  # Optional add error handling
-        chain.delay()
-    """
-    def __init__(self,
-                 pipeline_id: str,
-                 access_key: str = None,
-                 execution_id: str = None,
-                 queue: str = None,
-                 default_task_ttl: int = None,
-                 regulator_queue: str = None,
-                 regulator_task: str = None,
-                 success_queue: str = None,
-                 success_task: str = None,
-                 default_max_retry: int = None,
-                 retry_backoff: int = None,
-                 retry_jitter: bool = None,
-                 retry_backoff_max: int = None,
-                 chain_payload: dict = None):
-        super().__init__()
-        self.pipeline_id = pipeline_id
-        self.access_key = access_key
-
-        pipeline_config_api_resp = retrieve_latest_pipeline_config(
-            pipeline_id=self.pipeline_id, access_key=self.access_key)
-
-        if pipeline_config_api_resp is None:
-            raise ValueError("Unable to load Pipeline Configuration for "
-                             f"pipeline id: {self.pipeline_id} ...")
-
-        # The only part of the API response used for any 'pipeline config'
-        # is the `config` key. The API nests it under `config` to preserve
-        # ability to add additional detail at a later date.
-        self.pipeline_config = pipeline_config_api_resp.get('config', {})
-        schema_version = pipeline_config_api_resp.get('schemaVersion')
-        PipelineConfigValidator(config_dict=self.pipeline_config,
-                                schema_version=schema_version)
-
-        self.execution_id = execution_id  # UUID string
-        self.good_to_go = False  # Indicates initialization/loading success
-        self.loading_message = None  # Allows access to success/error messages
-        self.is_retry = False if self.execution_id is None else True
-
-        self.default_max_retry = default_max_retry \
-            if default_max_retry is not None else \
-            self.pipeline_config['metadata'].get('maxRetry', DEFAULT_MAX_RETRY)
-        self.retry_backoff = retry_backoff \
-            if retry_backoff is not None else \
-            self.pipeline_config['metadata'].get('retryBackoff', 3)
-        self.retry_backoff_max = retry_backoff \
-            if retry_backoff_max is not None else \
-            self.pipeline_config['metadata'].get('retryBackoffMax', 600)
-        self.retry_jitter = retry_jitter \
-            if retry_jitter is not None else \
-            self.pipeline_config['metadata'].get('retryJitter', False)
-
-        # Queue on which to place tasks by default and default TTL per task
-        # These can be overridden in PipelineConfig.config['taskDefinitions']
-        self.queue = queue \
-            if queue is not None \
-            else self.pipeline_config['metadata']['queue']
-        self.default_task_ttl = default_task_ttl \
-            if default_task_ttl is not None else \
-            self.pipeline_config['metadata'].get('maxTtl', DEFAULT_TASK_TTL)
-
-        # See docstring in self._get_regulator()
-        self.regulator_queue = regulator_queue \
-            if regulator_queue is not None \
-            else self.pipeline_config['metadata']['queue']
-        self.regulator_task = regulator_task\
-            if regulator_task is not None else DEFAULT_REGULATOR_TASK
-
-        # See docstring in self._get_success_task()
-        self.success_queue = success_queue \
-            if success_queue is not None \
-            else self.pipeline_config['metadata']['queue']
-        self.success_task = success_task\
-            if success_task is not None else DEFAULT_SUCCESS_TASK
-
-        # Optional data to pass to each step in chain
-        self.chain_payload = chain_payload\
-            if chain_payload is not None else {}
-
-        self.pipeline_wrapper = None  # Allows access to the PipelineRunWrapper
-        self.chain = None  # Must be intentionally built with generate_chain()
-
-        try:
-            # Generate our wrapper for this pipeline_id / execution_id
-            self.pipeline_wrapper = PipelineRunWrapper(
-                pipeline_id=self.pipeline_id,
-                pipeline_config=self.pipeline_config,
-                execution_id=self.execution_id,
-                max_ttl=self.default_task_ttl,
-                max_retry=self.default_max_retry,
-                chain_payload=self.chain_payload)
-
-            # Loads pipeline config from remote or cache if it's already there
-            # `is_retry` will be True for any PipelineGenerator instantiated
-            # with an execution_id. This flag helps the wrapper increment the
-            # retry count and determine if this should be deadlettered.
-            # This step also saves the valid/initialized run wrapper to cache.
-            self.pipeline_wrapper.load(is_retry=self.is_retry)
-
-            # Set all variables that were established from the run wrapper
-            # initialization. Notably, default_task_ttl can be overloaded
-            # if the pipeline config has an explicit maxTtl set in metadata.
-            self.good_to_go = self.pipeline_wrapper.good_to_go
-            self.loading_message = self.pipeline_wrapper.loading_message
-            self.execution_id = self.pipeline_wrapper.execution_id
-
-        except Exception as e:
-            fail_msg = "Failed to load Pipeline for id {} ... {}".format(
-                self.pipeline_id, e)
-            self.loading_message = fail_msg
-            logger.error(fail_msg)
-            raise e
-
-    def _get_regulator(self):
-        """ Create a chain regulator celery task signature.
-
-        For a chain(), if each element is a group() then celery does not
-        properly adhere to the chain elements occurring sequentially. If you
-        insert a task that is not a group() in between, though, then the
-        chain operates as expected.
-        """
-        return signature(self.regulator_task,
-                         queue=self.regulator_queue,
-                         immutable=True)
-
-    def _get_success_task(self):
-        """ A final 'success' task that's added to the end of every pipeline.
-
-        This stores the 'success' state in the cached result. Users can
-        set other values by using TaskRunner().save_result()
-        """
-        return get_task_signature(task_path=self.success_task,
-                                  queue=self.success_queue,
-                                  pipeline_id=self.pipeline_id,
-                                  execution_id=self.execution_id)
-
-    def _get_signature(self, node):
-        """ Create a celery task signature based on a graph node.
-        """
-        metadata = self.pipeline_config['metadata']
-        node_config = self.pipeline_config['taskDefinitions'][node]
-
-        # Node config takes precedence, pipeline metadata as default
-        queue = node_config.get('queue', metadata['queue'])
-        max_ttl = node_config.get('maxTtl', metadata.get('maxTtl', None))
-
-        # Ensures task signatures include requisite information to retrieve
-        # PipelineRunWrapper from cache using the pipeline id, and execution id.
-        # We set immutable=True to ensure each client task can be defined
-        # with this specific signature (event)
-        # http://docs.celeryproject.org/en/master/userguide/canvas.html#immutability
-        return get_task_signature(task_path=node_config.get('handler'),
-                                  queue=queue,
-                                  access_key=self.access_key,
-                                  pipeline_id=self.pipeline_id,
-                                  execution_id=self.execution_id,
-                                  max_ttl=max_ttl,
-                                  immutable=True,
-                                  task_config=node_config)
-
-    def generate_chain(self):
-        """ Generate the full pipeline chain.
-        """
-        logger.debug(f'Starting Pipeline {self.pipeline_id}')
-
-        if not self.good_to_go:
-            logger.info("Chain deemed to be not good to go.")
-            if self.loading_message is None:
-                self.loading_message = CHAIN_FAILURE_MSG
-            return None
-
-        try:
-            # Create the task chain such that all concurrent tasks are grouped
-            # and all high level node groups are run serially
-            G = self.pipeline_wrapper.execution_graph
-
-            total_tasks = 0
-            pipeline_chain = []
-            chainable_tasks = get_chainable_tasks(G, None, [])
-
-            # Current chord+chain solution based on
-            # https://stackoverflow.com/questions/15123772/celery-chaining-groups-and-subtasks-out-of-order-execution
-            # Look also at last comment from Nov 7, 2017 here
-            # https://github.com/celery/celery/issues/3597
-            # Big outstanding bug in Celery related to failures in chords that
-            # results in really nasty log output. See
-            # https://github.com/celery/celery/issues/4834
-            for i, node_group in enumerate(chainable_tasks):
-                total_tasks += len(node_group)
-                this_group = []
-                for node in node_group:
-                    node_signature = self._get_signature(node)
-                    this_group.append(node_signature)
-
-                if len(this_group) <= 1:
-                    this_group.append(self._get_regulator())
-
-                the_chord = chord(header=this_group,
-                                  body=self._get_regulator())
-
-                pipeline_chain.append(the_chord)
-
-            # Add a 'finished/success' task to the end of all pipelines
-            pipeline_chain.append(
-                chord(header=self._get_success_task(),
-                      body=self._get_regulator()))
-
-            the_chain = chain(*pipeline_chain)
-
-            self.loading_message = CHAIN_SUCCESS_MSG
-
-            self.chain = the_chain
-        except Exception as e:
-            self.loading_message = CHAIN_FAILURE_MSG + " {}".format(e)
-            logger.exception(e)
-            the_chain = None
-
-        self.chain = the_chain
-
-        return the_chain
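The generate_chain() method above serializes groups of parallel tasks by wrapping each group in a chord whose body is a no-op "regulator" task, then chaining the chords so the groups execute strictly in order. A condensed illustration of that composition pattern with plain Celery primitives (the app, broker URL, and tasks below are hypothetical, not from the package):

from celery import Celery, chain, chord

app = Celery('example', broker='redis://localhost:6379/0')


@app.task
def work(x):
    return x


@app.task
def regulator(*args, **kwargs):
    # A no-op barrier between groups: each chord must complete before the
    # next group of parallel tasks is dispatched.
    return None


# Two parallel tasks form a chord header with the regulator as its body;
# chaining chords keeps the groups sequential even though each group's
# members run concurrently. .si() creates immutable signatures.
pipeline = chain(
    chord([work.si(1), work.si(2)], regulator.si()),
    chord([work.si(3)], regulator.si()),
)
# pipeline.delay()  # enqueue with a running broker and worker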