scalable-pypeline 1.2.3__py2.py3-none-any.whl → 2.0.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pypeline/__init__.py +1 -1
- pypeline/barrier.py +34 -0
- pypeline/composition.py +348 -0
- pypeline/constants.py +51 -84
- pypeline/dramatiq.py +470 -0
- pypeline/extensions.py +9 -8
- pypeline/flask/__init__.py +3 -5
- pypeline/flask/api/pipelines.py +109 -148
- pypeline/flask/api/schedules.py +14 -39
- pypeline/flask/decorators.py +18 -53
- pypeline/flask/flask_pypeline.py +156 -0
- pypeline/middleware.py +61 -0
- pypeline/pipeline_config_schema.py +105 -92
- pypeline/pypeline_yaml.py +458 -0
- pypeline/schedule_config_schema.py +35 -120
- pypeline/utils/config_utils.py +52 -310
- pypeline/utils/module_utils.py +35 -71
- pypeline/utils/pipeline_utils.py +161 -0
- scalable_pypeline-2.0.1.dist-info/METADATA +217 -0
- scalable_pypeline-2.0.1.dist-info/RECORD +27 -0
- scalable_pypeline-2.0.1.dist-info/entry_points.txt +3 -0
- tests/fixtures/__init__.py +0 -1
- pypeline/celery.py +0 -206
- pypeline/celery_beat.py +0 -254
- pypeline/flask/api/utils.py +0 -35
- pypeline/flask/flask_sermos.py +0 -156
- pypeline/generators.py +0 -196
- pypeline/logging_config.py +0 -171
- pypeline/pipeline/__init__.py +0 -0
- pypeline/pipeline/chained_task.py +0 -70
- pypeline/pipeline/generator.py +0 -254
- pypeline/sermos_yaml.py +0 -442
- pypeline/utils/graph_utils.py +0 -144
- pypeline/utils/task_utils.py +0 -552
- scalable_pypeline-1.2.3.dist-info/METADATA +0 -163
- scalable_pypeline-1.2.3.dist-info/RECORD +0 -33
- scalable_pypeline-1.2.3.dist-info/entry_points.txt +0 -2
- tests/fixtures/s3_fixtures.py +0 -52
- {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.1.dist-info}/LICENSE +0 -0
- {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.1.dist-info}/WHEEL +0 -0
- {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.1.dist-info}/top_level.txt +0 -0
pypeline/logging_config.py
DELETED
@@ -1,171 +0,0 @@
-import os
-import logging
-import logging.config
-from pypeline import __version__
-from logging import StreamHandler
-
-logging_set = False
-
-
-def get_log_level(level: str = None) -> int:
-    """ Attempt to get the log level from the environment, otherwise use the
-    default INFO level. The environment variable LOG_LEVEL should be e.g.,
-    'DEBUG'
-    """
-    if level is not None:
-        level_str = str(level)
-    else:
-        level_str = os.environ.get('LOG_LEVEL', 'INFO')
-    return getattr(logging, level_str)
-
-
-def get_log_format(type: str = 'standard',
-                   app_version: str = None,
-                   client_version: str = None):
-    """ Standard log format. Supports `standard` and `simple`
-    """
-    if app_version is None:
-        app_version = "?"
-    if client_version is None:
-        client_version = "?"
-
-    format = '%(message)s'
-    if type == 'standard':
-        format = '%(process)d - %(levelname)s - %(asctime)s - '\
-            + '%(filename)s (%(lineno)d) - '\
-            + 'sermos v{} - client v{} - %(message)s'\
-            .format(app_version, client_version)
-    elif type == 'simple':
-        format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-
-    return format
-
-
-def get_date_format():
-    """ Standard date format
-    """
-    return '%Y-%m-%dT%H:%M:%S'
-
-
-def setup_logging(app_version: str = None,
-                  client_version: str = None,
-                  default_level: str = None,
-                  overload_elasticsearch: bool = False,
-                  establish_logging_config: bool = True):
-    """ Setup logging configuration for standard streaming output + optional
-    log aggregator.
-
-    Standard usage is to invoke this at application bootstrapping time
-    to establish default log handling. e.g.
-
-        def create_app():
-            setup_logging()
-
-    Individual application modules should load a logger like normal:
-        import logging
-        logger = logging.getLogger(__name__)
-
-    elasticsearch-py is overly verbose with it's 'info' logging. This
-    will set that logger to `warning` if `overload_elasticsearch` is True
-
-    `establish_logging_config` is intended to be used by something invoking
-    setup_logging() explicitly with the intention of setting the final
-    configuration, which is the default behavior. Set this to `False` in the
-    case where you might not be sure if logging has been set up yet.
-    """
-    global logging_set
-
-    if logging_set and not establish_logging_config:
-        return
-
-    if establish_logging_config or not logging_set:
-        logging_set = True
-
-    # Set our application version values, which can be passed to this method.
-    # By default, we report the app versions for sermos and the client
-    A_VERSION = __version__  # sermos version
-    CA_VERSION = None  # application version of client app using sermos
-    if app_version is not None:
-        A_VERSION = app_version
-    if client_version is not None:
-        CA_VERSION = client_version
-
-    log_level = get_log_level(default_level)
-
-    config = {
-        'disable_existing_loggers': False,
-        'version': 1,
-        'formatters': {
-            'simple': {
-                'format':
-                get_log_format(type='simple',
-                               app_version=A_VERSION,
-                               client_version=CA_VERSION),
-                'datefmt':
-                get_date_format()
-            },
-            'standard': {
-                'format':
-                get_log_format(type='standard',
-                               app_version=A_VERSION,
-                               client_version=CA_VERSION),
-                'datefmt':
-                get_date_format()
-            },
-        },
-        'handlers': {
-            'consoleFull': {
-                'level': 'DEBUG',
-                'formatter': 'standard',
-                'class': 'logging.StreamHandler',
-                'stream': 'ext://sys.stdout'
-            },
-        },
-        'loggers': {
-            '': {
-                'handlers': ['consoleFull'],
-                'level': 'ERROR',
-            },
-            'sermos': {
-                'handlers': ['consoleFull'],
-                'level': 'DEBUG',
-                'propagate': False
-            },
-            'timing': {
-                'handlers': ['consoleFull'],
-                'level': 'DEBUG',
-                'propagate': False
-            },
-            'celery': {
-                'handlers': ['consoleFull'],
-                'level': 'DEBUG',
-                'propagate': False
-            },
-            'bin': {
-                'handlers': ['consoleFull'],
-                'level': 'DEBUG',
-                'propagate': False
-            },
-        },
-        'root': {
-            'level': 'DEBUG',
-            'handlers': ['consoleFull']
-        }
-    }
-
-    for handler, handler_config in config['handlers'].items():
-        # Override this handler's level to the level passed to this method
-        handler_config['level'] = log_level
-        config['handlers'][handler] = handler_config
-
-    # Set the root handler's level
-    config['root']['level'] = log_level
-
-    logging.config.dictConfig(config)
-
-    es_logger = logging.getLogger('elasticsearch')
-    if overload_elasticsearch is True:
-        es_logger.setLevel(logging.WARNING)
-    else:
-        # Ensure to set to baseline in the event this is invoked multiple times.
-        es_logger.setLevel(logging.INFO)
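Note: this module is gone in 2.0.1, so 1.2.x applications that configured logging through it need a replacement. For reference, its own docstring describes the intended pattern: call setup_logging() once at application bootstrap, then use ordinary module-level loggers everywhere else. A minimal sketch against the 1.2.3 API (the create_app factory name and the client version string are illustrative, not part of the package):

import logging

from pypeline.logging_config import setup_logging  # 1.2.3 only; removed in 2.0.1

logger = logging.getLogger(__name__)


def create_app():
    # Configure handlers/formatters once at bootstrap. LOG_LEVEL (e.g. 'DEBUG')
    # is read from the environment when default_level is not passed, and the
    # noisy elasticsearch-py logger is raised to WARNING.
    setup_logging(client_version="0.1.0", overload_elasticsearch=True)
    logger.info("logging configured")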
pypeline/pipeline/__init__.py
DELETED
File without changes
pypeline/pipeline/chained_task.py
DELETED
@@ -1,70 +0,0 @@
-import logging
-from celery import Task
-from pypeline.utils.task_utils import PipelineRunWrapper
-
-
-logger = logging.getLogger(__name__)
-
-
-class ChainedTask(Task):
-    """ A Celery Task that is used as the _base_ for all dynamically
-    generated tasks (by GenerateCeleryTasks().generate()). This injects
-    `event` into every task's signature, which allows pipelines to pass
-    event information easily through a chain.
-    """
-    abstract = True
-
-    def __call__(self, *args, **kwargs):
-        """ Allow the return value of one task to update the kwargs of a
-        subsequent task if it's a dictionary. Important to the function
-        of a pipeline to allow event information to flow easily.
-        """
-        # Inject app context
-        if len(args) == 1 and isinstance(args[0], dict):
-            kwargs.update(args[0])
-            args = ()
-
-        # Event holds information used in PipelineRunWrapper and
-        # other areas.
-        if 'event' not in kwargs.keys():
-            kwargs['event'] = {}
-        # This is a special worker from dyrygent that orchestrates our
-        # pipelines. It provides a patch in fix for celery's poor
-        # implementation of Canvas work-flows
-        if self.__name__ == 'workflow_processor':
-            kwargs.pop('event', None)
-        return super(ChainedTask, self).__call__(*args, **kwargs)
-
-    def after_return(self, status, retval, task_id, args, kwargs, einfo):
-        if "event" in kwargs and "pipeline_id" in kwargs["event"]:
-            try:
-                pipeline_run_wrapper: PipelineRunWrapper = \
-                    PipelineRunWrapper.from_event(kwargs["event"])
-                current_task_status = pipeline_run_wrapper.get_task_celery_status(task_id)
-            except Exception:
-                logger.exception("Unable to retreive Pipeline Run Wrapper")
-                return
-
-            if current_task_status:
-                current_task_status["status"] = status
-                try:
-                    pipeline_run_wrapper.save_to_cache()
-                except Exception:
-                    logger.exception(f"Failed to update celery task status for task {task_id}")
-
-    def on_retry(self, exc, task_id, args, kwargs, einfo):
-        if "event" in kwargs and "pipeline_id" in kwargs["event"]:
-            try:
-                pipeline_run_wrapper: PipelineRunWrapper = \
-                    PipelineRunWrapper.from_event(kwargs["event"])
-                current_task_status = pipeline_run_wrapper.get_task_celery_status(task_id)
-            except Exception:
-                logger.exception("Unable to retreive Pipeline Run Wrapper")
-                return
-
-            if current_task_status:
-                current_task_status["retries"] = current_task_status["retries"] + 1
-                try:
-                    pipeline_run_wrapper.save_to_cache()
-                except Exception:
-                    logger.exception(f"Failed to update celery task status for task {task_id}")
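Per its docstring, ChainedTask was the base class for every Celery task that 1.2.3 generated dynamically; its __call__ merges a dict returned by the previous task into the next task's kwargs and guarantees an `event` key, which is how pipeline state flowed through a chain. A minimal sketch of a task registered on that base, assuming a Celery app and broker URL of your own (both illustrative, as is the my_handler name):

from celery import Celery

from pypeline.pipeline.chained_task import ChainedTask  # 1.2.3 only; removed in 2.0.1

app = Celery("worker", broker="redis://localhost:6379/0")  # assumed broker


@app.task(base=ChainedTask, bind=True)
def my_handler(self, event=None, **kwargs):
    # ChainedTask.__call__ ensures `event` is present and folds the dict
    # returned by the previous task in the chain into this task's kwargs.
    event["my_handler_done"] = True
    return {"event": event}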
pypeline/pipeline/generator.py
DELETED
@@ -1,254 +0,0 @@
-""" Utilities for running and managing tasks inside pipelines.
-"""
-import logging
-
-from celery import signature, chord, chain
-from pypeline.utils.graph_utils import get_chainable_tasks
-from pypeline.utils.config_utils import retrieve_latest_pipeline_config
-from pypeline.utils.task_utils import PipelineRunWrapper, get_task_signature
-from pypeline.constants import DEFAULT_TASK_TTL, DEFAULT_MAX_RETRY, \
-    DEFAULT_REGULATOR_TASK, CHAIN_FAILURE_MSG, CHAIN_SUCCESS_MSG, \
-    DEFAULT_SUCCESS_TASK
-from pypeline.pipeline_config_schema import PipelineConfigValidator
-
-
-logger = logging.getLogger(__name__)
-
-
-class PipelineGenerator(object):
-    """ Allows an API endpoint to generate a functional pipeline based on the
-    requested pipeline id. Allows API to then issue the tasks asynchronously
-    to initiate the pipeline. Thereafter, celery will monitor status and
-    handle success/failure modes so the API web worker can return
-    immediately.
-
-    The primary purpose is to unpack the pipeline config, create the
-    requisite cached entities to track pipeline progress, and apply the
-    chained pipeline tasks asynchronously so Celery can take over.
-
-    Usage:
-        gen = PipelineGenerator(pipeline_id)
-        chain = gen.generate_chain()
-        chain.on_error(custom_error_task.s())  # Optional add error handling
-        chain.delay()
-    """
-    def __init__(self,
-                 pipeline_id: str,
-                 access_key: str = None,
-                 execution_id: str = None,
-                 queue: str = None,
-                 default_task_ttl: int = None,
-                 regulator_queue: str = None,
-                 regulator_task: str = None,
-                 success_queue: str = None,
-                 success_task: str = None,
-                 default_max_retry: int = None,
-                 retry_backoff: int = None,
-                 retry_jitter: bool = None,
-                 retry_backoff_max: int = None,
-                 chain_payload: dict = None):
-        super().__init__()
-        self.pipeline_id = pipeline_id
-        self.access_key = access_key
-
-        pipeline_config_api_resp = retrieve_latest_pipeline_config(
-            pipeline_id=self.pipeline_id, access_key=self.access_key)
-
-        if pipeline_config_api_resp is None:
-            raise ValueError("Unable to load Pipeline Configuration for "
-                             f"pipeline id: {self.pipeline_id} ...")
-
-        # The only part of the API response used for any 'pipeline config'
-        # is the `config` key. The API nests it under `config` to preserve
-        # ability to add additional detail at a later date.
-        self.pipeline_config = pipeline_config_api_resp.get('config', {})
-        schema_version = pipeline_config_api_resp.get('schemaVersion')
-        PipelineConfigValidator(config_dict=self.pipeline_config,
-                                schema_version=schema_version)
-
-        self.execution_id = execution_id  # UUID string
-        self.good_to_go = False  # Indicates initialization/loading success
-        self.loading_message = None  # Allows access to success/error messages
-        self.is_retry = False if self.execution_id is None else True
-
-        self.default_max_retry = default_max_retry \
-            if default_max_retry is not None else \
-            self.pipeline_config['metadata'].get('maxRetry', DEFAULT_MAX_RETRY)
-        self.retry_backoff = retry_backoff \
-            if retry_backoff is not None else \
-            self.pipeline_config['metadata'].get('retryBackoff', 3)
-        self.retry_backoff_max = retry_backoff \
-            if retry_backoff_max is not None else \
-            self.pipeline_config['metadata'].get('retryBackoffMax', 600)
-        self.retry_jitter = retry_jitter \
-            if retry_jitter is not None else \
-            self.pipeline_config['metadata'].get('retryJitter', False)
-
-        # Queue on which to place tasks by default and default TTL per task
-        # These can be overridden in PipelineConfig.config['taskDefinitions']
-        self.queue = queue \
-            if queue is not None \
-            else self.pipeline_config['metadata']['queue']
-        self.default_task_ttl = default_task_ttl \
-            if default_task_ttl is not None else \
-            self.pipeline_config['metadata'].get('maxTtl', DEFAULT_TASK_TTL)
-
-        # See docstring in self._get_regulator()
-        self.regulator_queue = regulator_queue \
-            if regulator_queue is not None \
-            else self.pipeline_config['metadata']['queue']
-        self.regulator_task = regulator_task\
-            if regulator_task is not None else DEFAULT_REGULATOR_TASK
-
-        # See docstring in self._get_success_task()
-        self.success_queue = success_queue \
-            if success_queue is not None \
-            else self.pipeline_config['metadata']['queue']
-        self.success_task = success_task\
-            if success_task is not None else DEFAULT_SUCCESS_TASK
-
-        # Optional data to pass to each step in chain
-        self.chain_payload = chain_payload\
-            if chain_payload is not None else {}
-
-        self.pipeline_wrapper = None  # Allows access to the PipelineRunWrapper
-        self.chain = None  # Must be intentionally built with generate_chain()
-
-        try:
-            # Generate our wrapper for this pipeline_id / execution_id
-            self.pipeline_wrapper = PipelineRunWrapper(
-                pipeline_id=self.pipeline_id,
-                pipeline_config=self.pipeline_config,
-                execution_id=self.execution_id,
-                max_ttl=self.default_task_ttl,
-                max_retry=self.default_max_retry,
-                chain_payload=self.chain_payload)
-
-            # Loads pipeline config from remote or cache if it's already there
-            # `is_retry` will be True for any PipelineGenerator instantiated
-            # with an execution_id. This flag helps the wrapper increment the
-            # retry count and determine if this should be deadlettered.
-            # This step also saves the valid/initialized run wrapper to cache.
-            self.pipeline_wrapper.load(is_retry=self.is_retry)
-
-            # Set all variables that were established from the run wrapper
-            # initialization. Notably, default_task_ttl can be overloaded
-            # if the pipeline config has an explicit maxTtl set in metadata.
-            self.good_to_go = self.pipeline_wrapper.good_to_go
-            self.loading_message = self.pipeline_wrapper.loading_message
-            self.execution_id = self.pipeline_wrapper.execution_id
-
-        except Exception as e:
-            fail_msg = "Failed to load Pipeline for id {} ... {}".format(
-                self.pipeline_id, e)
-            self.loading_message = fail_msg
-            logger.error(fail_msg)
-            raise e
-
-    def _get_regulator(self):
-        """ Create a chain regulator celery task signature.
-
-        For a chain(), if each element is a group() then celery does not
-        properly adhere to the chain elements occurring sequentially. If you
-        insert a task that is not a group() in between, though, then the
-        chain operates as expected.
-        """
-        return signature(self.regulator_task,
-                         queue=self.regulator_queue,
-                         immutable=True)
-
-    def _get_success_task(self):
-        """ A final 'success' task that's added to the end of every pipeline.
-
-        This stores the 'success' state in the cached result. Users can
-        set other values by using TaskRunner().save_result()
-        """
-        return get_task_signature(task_path=self.success_task,
-                                  queue=self.success_queue,
-                                  pipeline_id=self.pipeline_id,
-                                  execution_id=self.execution_id)
-
-    def _get_signature(self, node):
-        """ Create a celery task signature based on a graph node.
-        """
-        metadata = self.pipeline_config['metadata']
-        node_config = self.pipeline_config['taskDefinitions'][node]
-
-        # Node config takes precedence, pipeline metadata as default
-        queue = node_config.get('queue', metadata['queue'])
-        max_ttl = node_config.get('maxTtl', metadata.get('maxTtl', None))
-
-        # Ensures task signatures include requisite information to retrieve
-        # PipelineRunWrapper from cache using the pipeline id, and execution id.
-        # We set immutable=True to ensure each client task can be defined
-        # with this specific signature (event)
-        # http://docs.celeryproject.org/en/master/userguide/canvas.html#immutability
-        return get_task_signature(task_path=node_config.get('handler'),
-                                  queue=queue,
-                                  access_key=self.access_key,
-                                  pipeline_id=self.pipeline_id,
-                                  execution_id=self.execution_id,
-                                  max_ttl=max_ttl,
-                                  immutable=True,
-                                  task_config=node_config)
-
-    def generate_chain(self):
-        """ Generate the full pipeline chain.
-        """
-        logger.debug(f'Starting Pipeline {self.pipeline_id}')
-
-        if not self.good_to_go:
-            logger.info("Chain deemed to be not good to go.")
-            if self.loading_message is None:
-                self.loading_message = CHAIN_FAILURE_MSG
-            return None
-
-        try:
-            # Create the task chain such that all concurrent tasks are grouped
-            # and all high level node groups are run serially
-            G = self.pipeline_wrapper.execution_graph
-
-            total_tasks = 0
-            pipeline_chain = []
-            chainable_tasks = get_chainable_tasks(G, None, [])
-
-            # Current chord+chain solution based on
-            # https://stackoverflow.com/questions/15123772/celery-chaining-groups-and-subtasks-out-of-order-execution
-            # Look also at last comment from Nov 7, 2017 here
-            # https://github.com/celery/celery/issues/3597
-            # Big outstanding bug in Celery related to failures in chords that
-            # results in really nasty log output. See
-            # https://github.com/celery/celery/issues/4834
-            for i, node_group in enumerate(chainable_tasks):
-                total_tasks += len(node_group)
-                this_group = []
-                for node in node_group:
-                    node_signature = self._get_signature(node)
-                    this_group.append(node_signature)
-
-                if len(this_group) <= 1:
-                    this_group.append(self._get_regulator())
-
-                the_chord = chord(header=this_group,
-                                  body=self._get_regulator())
-
-                pipeline_chain.append(the_chord)
-
-            # Add a 'finished/success' task to the end of all pipelines
-            pipeline_chain.append(
-                chord(header=self._get_success_task(),
-                      body=self._get_regulator()))
-
-            the_chain = chain(*pipeline_chain)
-
-            self.loading_message = CHAIN_SUCCESS_MSG
-
-            self.chain = the_chain
-        except Exception as e:
-            self.loading_message = CHAIN_FAILURE_MSG + " {}".format(e)
-            logger.exception(e)
-            the_chain = None
-
-        self.chain = the_chain
-
-        return the_chain
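The Usage block in the PipelineGenerator docstring above summarizes how 1.2.3 launched a pipeline from an API endpoint: build the Celery chain for a pipeline id, optionally attach an error handler, then dispatch it with delay(). A sketch of that flow (the pipeline id, chain_payload contents, and custom_error_task are placeholders):

from pypeline.pipeline.generator import PipelineGenerator  # 1.2.3 only; removed in 2.0.1

gen = PipelineGenerator(pipeline_id="my-pipeline-id",
                        chain_payload={"document_id": "123"})

celery_chain = gen.generate_chain()  # returns None if the config failed to load
if celery_chain is None:
    raise RuntimeError(gen.loading_message)

# celery_chain.on_error(custom_error_task.s())  # optional error handling, per the docstring
celery_chain.delay()  # Celery workers execute the chord/chain from here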