scalable-pypeline 1.2.2__py2.py3-none-any.whl → 2.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. pypeline/__init__.py +1 -1
  2. pypeline/barrier.py +34 -0
  3. pypeline/composition.py +348 -0
  4. pypeline/constants.py +51 -84
  5. pypeline/dramatiq.py +470 -0
  6. pypeline/extensions.py +9 -8
  7. pypeline/flask/__init__.py +3 -5
  8. pypeline/flask/api/pipelines.py +109 -148
  9. pypeline/flask/api/schedules.py +14 -39
  10. pypeline/flask/decorators.py +18 -53
  11. pypeline/flask/flask_pypeline.py +156 -0
  12. pypeline/middleware.py +61 -0
  13. pypeline/pipeline_config_schema.py +104 -91
  14. pypeline/pypeline_yaml.py +458 -0
  15. pypeline/schedule_config_schema.py +35 -120
  16. pypeline/utils/config_utils.py +52 -310
  17. pypeline/utils/module_utils.py +35 -71
  18. pypeline/utils/pipeline_utils.py +161 -0
  19. scalable_pypeline-2.0.1.dist-info/METADATA +217 -0
  20. scalable_pypeline-2.0.1.dist-info/RECORD +27 -0
  21. scalable_pypeline-2.0.1.dist-info/entry_points.txt +3 -0
  22. tests/fixtures/__init__.py +0 -1
  23. pypeline/celery.py +0 -206
  24. pypeline/celery_beat.py +0 -254
  25. pypeline/flask/api/utils.py +0 -35
  26. pypeline/flask/flask_sermos.py +0 -156
  27. pypeline/generators.py +0 -196
  28. pypeline/logging_config.py +0 -171
  29. pypeline/pipeline/__init__.py +0 -0
  30. pypeline/pipeline/chained_task.py +0 -70
  31. pypeline/pipeline/generator.py +0 -254
  32. pypeline/sermos_yaml.py +0 -442
  33. pypeline/utils/graph_utils.py +0 -144
  34. pypeline/utils/task_utils.py +0 -552
  35. scalable_pypeline-1.2.2.dist-info/METADATA +0 -163
  36. scalable_pypeline-1.2.2.dist-info/RECORD +0 -33
  37. scalable_pypeline-1.2.2.dist-info/entry_points.txt +0 -2
  38. tests/fixtures/s3_fixtures.py +0 -52
  39. {scalable_pypeline-1.2.2.dist-info → scalable_pypeline-2.0.1.dist-info}/LICENSE +0 -0
  40. {scalable_pypeline-1.2.2.dist-info → scalable_pypeline-2.0.1.dist-info}/WHEEL +0 -0
  41. {scalable_pypeline-1.2.2.dist-info → scalable_pypeline-2.0.1.dist-info}/top_level.txt +0 -0
pypeline/utils/task_utils.py (deleted)
@@ -1,552 +0,0 @@
-""" Utilities for running and managing tasks inside pipelines.
-"""
-import os
-import logging
-import typing
-import uuid
-from typing import List, Any, Union
-from networkx.classes.digraph import DiGraph
-from celery import signature
-
-from pypeline.constants import DEFAULT_TASK_TTL, \
-    PIPELINE_RUN_WRAPPER_CACHE_KEY, DEFAULT_RESULT_TTL, \
-    PIPELINE_RESULT_CACHE_KEY
-from pypeline.utils.graph_utils import get_execution_graph
-from pypeline.utils.config_utils import load_json_config_from_redis, set_json_config_to_redis
-from pypeline.pipeline_config_schema import PipelineConfigValidator
-
-
-logger = logging.getLogger(__name__)
-WORKER_NAME = os.environ.get('WORKER_NAME', None)
-
-
-def get_service_config_for_worker(sermos_config: dict,
-                                  worker_name: str = None
-                                  ) -> Union[dict, None]:
-    """ For the current WORKER_NAME (which must be present in the environment
-    of this worker instance for a valid deployment), return the worker's
-    serviceConfig object.
-    """
-    if sermos_config is None:
-        raise ValueError('Sermos config was not provided')
-    if worker_name is None:
-        worker_name = WORKER_NAME
-    if worker_name is None:
-        return None
-
-    service_config = sermos_config.get('serviceConfig', [])
-    for service in service_config:
-        if service['name'] == worker_name:
-            return service
-
-    raise ValueError('Could not find a service config for worker '
-                     f'`{worker_name}`. Make sure you have added the service in'
-                     f' your sermos.yaml with `name: {worker_name}` and '
-                     '`type: celery-worker`.')
-
-
-def get_task_signature(task_path: str,
-                       queue: str,
-                       access_key: str = None,
-                       pipeline_id: str = None,
-                       execution_id: str = None,
-                       max_ttl: int = None,
-                       immutable: bool = True,
-                       task_config: dict = None,
-                       custom_event_data: dict = None) -> signature:
-    """ Generate a task signature with enforced event keyword
-    """
-    if task_config is None:
-        task_config = dict()
-    if custom_event_data is None:
-        custom_event_data = dict()
-
-    if queue is None:
-        # Look for a pipeline task configuration, if one was provided then we
-        # use queue specified on that task if it's specified.
-        queue = task_config.get('queue', None)
-
-    # If we still have None or 'default' (for backwards compability), raise
-    # because we're not requiring that a queue is specified.
-    if queue in (None, 'default'):
-        raise ValueError('Must set queue for a worker or registeredTask.')
-
-    if max_ttl is None:
-        # First look on the pipeline configuration, if a max_ttl is specified,
-        # then we're using that regardless.
-        max_ttl = task_config.get('maxTtl', None)
-
-    # If we still have None or 'default', set the default queue!
-    if max_ttl in (None, 'default'):
-        max_ttl = DEFAULT_TASK_TTL
-    task_id = str(uuid.uuid4())
-    kwargs = {
-        'event': {
-            'access_key': access_key,
-            'pipeline_id': pipeline_id,
-            'execution_id': execution_id,
-            'task_id': task_id
-        }
-    }
-    if custom_event_data is not None:
-        kwargs['event'] = {**kwargs['event'], **custom_event_data}
-
-    sig = signature(
-        task_path,
-        args=(),
-        kwargs=kwargs,
-        immutable=immutable,
-        task_id=task_id,
-        options={
-            'queue': queue,
-            'expires': 86400,  # Expire after 1 day. TODO make tunable.
-            'soft_time_limit': max_ttl,
-            'time_limit': max_ttl + 10,  # Add 10s buffer for cleanup
-        }
-    )
-    return sig
-
-
-class PipelineRunWrapper:
-    """ A wrapper for a single "run" of a Pipeline.
-
-    A 'run' is defined as a single execution of a pipeline, a pipeline
-    consisting of one or more steps in a chain.
-
-    When a pipeline's run is first executed, the execution id is generated
-    as a uuid. Subsequent retries of this 'run' will be able to look up
-    using that execution id.
-
-    The primary purpose for the PipelineRunWrapper is to provide a cached
-    representation of the full 'run' including retry count and any payload
-    that should be accessible to any step in the chain. Remember, a pipeline
-    is running asynchronously and, as such, each node in the graph operates
-    independent the others, this allows for consistent coordination.
-    """
-    pipeline_id: str = None
-    pipeline_config: dict = None  # Pipeline configuration in dictionary format
-    celery_task_status: dict = None  # This tracks the state of tasks within the pipeline
-    dag_config: dict = None
-    execution_id: str = None
-    current_event: dict = None  # For single task when from_event(). NOT cached.
-    cache_key: str = None  # Set on init
-    max_ttl: int = 60  # Overloaded when pipeline_config provided and it's set
-    max_retry: int = 3  # Overloaded when pipeline_config provided and it's set
-    retry_count: int = 0
-    chain_payload: dict = None  # Optional data to pass to each step in chain
-    execution_graph: DiGraph = None
-    good_to_go = False
-    loading_message = None
-
-    def __init__(self,
-                 pipeline_id: str,
-                 pipeline_config: dict = None,
-                 execution_id: str = None,
-                 max_ttl: int = 60,
-                 max_retry: int = 3,
-                 chain_payload: dict = None,
-                 current_event: dict = None):
-        super().__init__()
-        self.pipeline_id = pipeline_id
-        self.pipeline_config = pipeline_config
-
-        self.max_ttl = max_ttl
-        self.max_retry = max_retry
-
-        # Execution IDs uniquely identify a single run of a given pipeline.
-        # If None is provided, a random id is generated, which will be cached
-        # and used downstream in the event of a retry. Initial invocations
-        # should generally not set this value manually.
-        self.execution_id = execution_id
-        if self.execution_id is None:
-            self.execution_id = str(uuid.uuid4())
-
-        self.chain_payload = chain_payload\
-            if chain_payload is not None else {}
-
-        self.current_event = current_event\
-            if current_event is not None else {}
-
-        self.cache_key = PIPELINE_RUN_WRAPPER_CACHE_KEY.format(
-            self.pipeline_id, self.execution_id)
-
-        self.good_to_go = True
-
-    @property
-    def _cachable_keys(self):
-        """ For caching purposes, only store json serializable values that are
-        required for caching / loading from cache.
-
-        Note: Several keys are pulled from the pipeline_config where they are
-        camelCase and set on this as snake_case. This is done for convenience
-        in the wrapper. Style convention switching is to keep with the naming
-        convention of all yaml files following camelCase to conform with k8s
-        and all local python variables being snake_case. This extraction of
-        the yaml file variables to place onto the wrapper object is done
-        during the .load() stage.
-        """
-        return ('pipeline_config', 'max_ttl', 'max_retry', 'retry_count',
-                'chain_payload', 'pipeline_id', 'celery_task_status')
-
-    def _load_from_cache(self, is_retry=False):
-        """ Attempt to load this PipelineRunWrapper from cache.
-        """
-        logger.debug(f"Attempting to load {self.cache_key} from cache")
-        try:
-            cached_wrapper = load_json_config_from_redis(self.cache_key)
-            if cached_wrapper is not None:
-                for key in self._cachable_keys:
-                    setattr(self, key, cached_wrapper[key])
-
-                msg = f"{self.cache_key} found in cache ..."
-                self.loading_message = msg
-                logger.debug(msg)
-            else:
-                raise ValueError(f"Unable to find {self.cache_key} ...")
-        except Exception as e:
-            if not is_retry:
-                self.good_to_go = False
-                self.loading_message = e
-                logger.exception(e)
-
-            if self.pipeline_config is None:
-                raise ValueError("pipeline_config not set, invalid ...")
-
-        return
-
-    def get_task_celery_status(self, task_id: type[uuid.uuid4()]) -> typing.Union[dict, None]:
-        return next(filter(lambda task: task["task_id"] == task_id, self.celery_task_status), None)
-
-    def save_to_cache(self):
-        """ Save current state of PipelineRunWrapper to cache, json serialized.
-        Re-set the key's TTL
-
-        TODO: Lock this so no race condition on concurrent steps.
-        """
-        logger.debug(f"Saving {self.cache_key} to cache")
-        cached_json = {}
-        for key in self._cachable_keys:
-            cached_json[key] = getattr(self, key)
-        ttl = (self.max_ttl *
-               len(self.pipeline_config['taskDefinitions'])) + 10
-        set_json_config_to_redis(self.cache_key, cached_json, ttl)
-
-    @classmethod
-    def from_event(cls, event):
-        """ Create instance of PipelineRunWrapper from pipeline event.
-
-        Loads the cached PipelineRunWrapper instance, which is assumed to exist
-        when loading from an event (which should only occur inside a pipeline
-        node, which means the pipeline has been invoked/generated previously).
-
-        Usage::
-
-            pipeline_wrapper = PipelineRunWrapper.from_event(event)
-            # pipeline_wrapper.load()  # TODO deprecate
-        """
-        wrapper = cls(pipeline_id=event.get('pipeline_id', None),
-                      execution_id=event.get('execution_id', None),
-                      current_event=event)
-        wrapper.load()
-        return wrapper
-
-    def load(self,
-             verify_retry_count: bool = True,
-             allow_deadletter: bool = True,
-             is_retry: bool = False):
-        """ Loads PipelineRunWrapper from cache
-
-        If verify_retry_count is True, this will deadletter the task wrapper
-        immediately (if deadletter=True) if retry count is exceeded.
-        """
-        try:
-            # Pipeline config is expected to be provided when first initializing
-            # a pipeline run wrapper. On subsequent runs or when loading from
-            # an event, the run wrapper can be loaded using only the pipeline
-            # id and execution id, the pipeline config is then initialized from
-            # the wrapper
-            if self.pipeline_config is None or is_retry:
-                self._load_from_cache(is_retry=is_retry)
-            else:
-                # If the pipeline_config is set before .load(), that means
-                # this invocation is coming from an initial load, not cache.
-                # We don't want to re-set pipeline_config and the retry_count
-                # and chain_payload are not going to exist, as they are an
-                # artifact of the caching process. We also explicitly skip
-                # pipeline_id, max_retry, and max_ttl keys because those are
-                # metadata keys in the pipeline_config and are camel case
-                # (pipelineId/maxRetry/maxTtl), we set them on this wrapper
-                # object purely for convenience and to provide logical defaults.
-                for key in self._cachable_keys:
-                    if key in ('pipeline_config', 'pipeline_id', 'max_retry',
-                               'max_retry', 'max_ttl', 'retry_count',
-                               'chain_payload', 'celery_task_status'):
-                        continue
-                    setattr(self, key, self.pipeline_config[key])
-
-            # Validate pipeline config
-            PipelineConfigValidator(config_dict=self.pipeline_config)
-
-            # Initialize the actual pipeline configuration and execution graph
-            self.dag_config = self.pipeline_config['dagAdjacency']
-            self.execution_graph = get_execution_graph(self.pipeline_config)
-
-            # Overload defaults if explicitly provided
-            self.max_ttl = self.pipeline_config['metadata'].get(
-                'maxTtl', self.max_ttl)
-            self.max_retry = self.pipeline_config['metadata'].get(
-                'maxRetry', self.max_retry)
-
-            if is_retry:
-                self.increment_retry()
-
-            if verify_retry_count and self.retry_exceeded:
-                msg = "Attempted to retry {}_{}; exceeded retry count."\
-                    .format(self.pipeline_id, self.execution_id)
-                logger.warning(msg)
-                self.loading_message = msg
-                if allow_deadletter:
-                    self.deadletter()
-                return
-
-            self.save_to_cache()  # Always save back to cache
-        except Exception as e:
-            logger.exception(e)
-            self.loading_message = e
-            if allow_deadletter:
-                self.deadletter()
-            return
-
-        self.loading_message = "Loaded Successfully."
-        return
-
-    def increment_retry(self, exceed_max: bool = False):
-        """ Increment retry_count by 1
-
-        `cache` determines whether this will re-cache object after increment
-        `exceed_max` allows an instant kickout of this to deadletter.
-        """
-        if exceed_max:
-            new_count = self.max_retry + 1
-        else:
-            new_count = self.retry_count + 1
-
-        logger.debug(f"Incrementing Retry to {new_count}")
-        self.retry_count = new_count
-        self.save_to_cache()
-
-    @property
-    def retry_exceeded(self):
-        """ Determine if retry_count has been exceeded.
-        """
-        logger.debug(f"Checking retry count: {self.retry_count} / "
-                     f"{self.max_retry} / {self.retry_count > self.max_retry}")
-        if self.retry_count >= self.max_retry:
-            return True
-        return False
-
-    def deadletter(self):
-        """ Add details of this PipelineTask to a deadletter queue.
-
-        TODO:
-            - add to a system for tracking failed pipeline runs
-            - delete task wrapper and all tasks from cache
-        """
-        self.good_to_go = False
-        pr = PipelineResult(
-            self.execution_id,
-            status='failed',
-            result='Pipeline retried and failed {} times.'.format(
-                self.retry_count))
-        pr.save()
-        self.increment_retry(
-            exceed_max=True)  # Ensure this won't be retried...
-        return
-
-
-class PipelineResult:
-    """ Standard store for pipeline results.
-
-    Helps keep standard way to store/retrieve results + status messages
-    for pipelines.
-
-    Can get fancier in the future by tracking retry count, pipeline
-    execution time, etc.
-    """
-    def __init__(self,
-                 execution_id: str,
-                 status: str = None,
-                 result: Any = None,
-                 result_ttl: int = DEFAULT_RESULT_TTL):
-        super().__init__()
-        self.execution_id = execution_id
-        if self.execution_id is None:
-            raise ValueError("Must provide an execution_id!")
-        self.status = status
-        self.result = result
-        self.results = result  # TODO Deprecate in future release, keep singular
-        self.result_ttl = result_ttl
-        self.cache_key =\
-            PIPELINE_RESULT_CACHE_KEY.format(self.execution_id)
-
-        self.valid_status_types = ('pending', 'success', 'failed',
-                                   'unavailable')
-
-        # Always validate status
-        self._validate_status()
-
-    def _validate_status(self):
-        if self.status and self.status not in self.valid_status_types:
-            raise ValueError("{} is not a valid status type ({})".format(
-                self.status, self.valid_status_types))
-
-    def save(self, status: str = None, result: Any = None):
-        """ Save the result's current state.
-
-        If status and/or result are not provided, then the existing instance
-        state is used. You can override either by passing to this fn.
-
-        Typical use case would be to initialize the PipelineResult with only
-        the execution ID, then 'save_result()' and pass status/result.
-        """
-        if status is not None:
-            self.status = status
-        if result is not None:
-            self.result = result
-            self.results = result  # TODO Deprecate in future release
-        set_json_config_to_redis(self.cache_key, self.to_dict(),
-                                 self.result_ttl)
-
-    def load(self):
-        """ Load a pipeline result from cache.
-        """
-        results = load_json_config_from_redis(self.cache_key)
-        if results is not None:
-            for k in results:
-                setattr(self, k, results[k])
-        else:
-            self.status = 'unavailable'
-            self.result = None
-            self.results = None  # TODO Deprecate in future release
-
-    @classmethod
-    def from_event(cls, event):
-        """ Create initialized instance of PipelineResult from a pipeline event.
-
-        Usage::
-
-            pipeline_result = PipelineResult.from_event(event)
-            pipeline_result.save(
-                result='my result value'
-            )
-        """
-        pr = cls(execution_id=event.get('execution_id', None))
-        pr.load()
-        return pr
-
-    def to_dict(self):
-        """ Return serializable version of result for storage/retrieval.
-        """
-        return {
-            'execution_id': self.execution_id,
-            'status': self.status,
-            'result': self.result,
-            'results': self.result,  # TODO Deprecate in future release
-            'result_ttl': self.result_ttl
-        }
-
-
-class TaskRunner:
-    """ Run tasks in Sermos
-    """
-    @classmethod
-    def save_result(cls):
-        """ Save a task result
-        """
-        # TODO Implement
-
-    @classmethod
-    def publish_work(cls,
-                     task_path: str,
-                     task_payload: dict,
-                     queue: str = None,
-                     max_ttl: int = None):
-        """ Uniform way to issue a task to another celery worker.
-
-        Args:
-            task_path (str): Full path to task intended to run. e.g.
-                sermos_company_client.workers.my_work.task_name
-            task_payload (dict): A dictionary containing whatever payload
-                the receiving task expects. This is merged into the `event`
-                argument for the receiving task such that any top level
-                keys in your `task_payload` are found at event['the_key']
-            queue (str): The queue on which to place this task.
-                Ensure there are workers available to accept work on
-                that queue.
-            max_ttl (int): Optional. Max time to live for the issued task.
-                If not specified, system default is used.
-        """
-        try:
-            worker = get_task_signature(task_path=task_path,
-                                        queue=queue,
-                                        max_ttl=max_ttl,
-                                        custom_event_data=task_payload)
-            worker.delay()
-        except Exception as e:
-            logger.error(f"Failed to publish work ... {e}")
-            return False
-
-        return True
-
-    @classmethod
-    def publish_work_in_batches(cls,
-                                task_path: str,
-                                task_payload_list: List[dict],
-                                queue: str,
-                                grouping_key: str = 'tasks',
-                                max_per_task: int = 5,
-                                max_ttl: int = None):
-        """ Uniform way to issue tasks to celery in 'batches'.
-
-        This allows work to be spread over multiple workers, each worker is
-        able to consume one or more messages in a single task.
-
-        Args:
-            task_path (str): Full path to task intended to run. e.g.
-                sermos_company_client.workers.my_work.task_name
-            task_payload_list (list): A list of dictionaries containing
-                whatever payload the receiving task expects. This is broken
-                into batches according to `max_per_task` and nested under
-                the `grouping_key` in the `event` argument for the receiving
-                task such that payload dicts are found at event['grouping_key']
-            queue (str): The queue on which to place this task.
-                Ensure there are workers available to accept work on
-                that queue.
-            grouping_key (str): Default: tasks. Sets the key name under the
-                receiving task's `event` where the payload items are found.
-            max_per_task (int): Default: 5. Maximum number of tasks from the
-                `task_payload_list` that will be bundled under the `grouping_key`
-                and issued as a single task to the receiving worker.
-            max_ttl (int): Optional. Max time to live for the issued task.
-                If not specified, system default is used.
-        """
-        try:
-            if len(task_payload_list) > 0:
-                for idx in range(len(task_payload_list)):
-                    if idx % max_per_task == 0:
-                        custom_event_data = {
-                            grouping_key:
-                            task_payload_list[idx:idx + max_per_task]
-                        }
-
-                        worker = get_task_signature(
-                            task_path=task_path,
-                            queue=queue,
-                            max_ttl=max_ttl,
-                            custom_event_data=custom_event_data)
-                        worker.delay()
-
-        except Exception as e:
-            logger.error(f"Failed to publish work in batches ... {e}")
-            return False
-        return True
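
For orientation, the removed `TaskRunner` helpers above were the 1.2.2 entry points for handing work to Celery workers. A minimal usage sketch based on their docstrings follows; the task path, queue name, and payloads are hypothetical, and a deployed worker plus Redis-backed configuration are assumed.

    from pypeline.utils.task_utils import TaskRunner

    # Hypothetical task path and queue; the payload keys end up as top-level
    # keys of the receiving task's `event` kwarg (see publish_work docstring).
    TaskRunner.publish_work(
        task_path='my_client.workers.documents.process_document',
        task_payload={'document_id': 'abc-123'},
        queue='document-processing')

    # Batched variant: payload dicts are grouped under event['tasks'] (the
    # default grouping_key), at most max_per_task entries per issued task.
    TaskRunner.publish_work_in_batches(
        task_path='my_client.workers.documents.process_document',
        task_payload_list=[{'document_id': str(i)} for i in range(20)],
        queue='document-processing',
        max_per_task=5)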
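
On the receiving side, `get_task_signature()` builds the `event` kwarg from the standard keys (`access_key`, `pipeline_id`, `execution_id`, `task_id`) merged with any custom payload, and `PipelineResult.from_event()` is the documented way to record an outcome. The following is a sketch of a hypothetical receiving task body, with Celery task registration omitted.

    from pypeline.utils.task_utils import PipelineResult

    def process_document(event: dict):
        # Keys injected by get_task_signature(), plus merged payload keys.
        execution_id = event['execution_id']
        document_id = event['document_id']  # from the task_payload published above

        # Record a status/result for this execution, per the
        # PipelineResult.from_event docstring.
        pipeline_result = PipelineResult.from_event(event)
        pipeline_result.save(status='success',
                             result={'execution_id': execution_id,
                                     'document_id': document_id})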