scalable-pypeline 1.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pypeline/generators.py ADDED
@@ -0,0 +1,196 @@
+ import os
+ from boto3 import Session
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ class KeyGenerator(object):
+     """ Common functions for key generators.
+     """
+     def __init__(self):
+         super(KeyGenerator, self).__init__()
+         self.hidden_files = ('.DS_Store', '.git', 'Icon', '.Dropbox')
+
+     def get_file_key(self, file_obj):
+         """ Required for each specific generator - how to extract key
+         """
+         return file_obj
+
+     def get_file_name(self, file_obj):
+         """ Required for each specific generator - how to extract file name
+         """
+         return file_obj
+
+     def get_file_size(self, base_path, file_obj):
+         """ Required for each specific generator - how to find file size (BYTES)
+         """
+         return 0
+
+     def get_final_path(self, base_path, file_name, return_full_path):
+         """ Required for each specific generator - create final file path that
+         is added to list.
+         """
+         if return_full_path:
+             return os.path.normpath(base_path + '/' + file_name)
+
+         return file_name
+
+     def list_iterator(self, all_files, base_path, limit=None, offset=None,
+                       size_limit=None, return_full_path=True,
+                       skip_common_hidden=True):
+         """ Apply offset/limit/size-limit filtering to an iterable of files.
+         """
+         filtered_files = []
+         try:
+             # Compile list of all files within limit/offset if those exist
+             idx = -1
+             listed_files = 0
+             offset_reached = False
+             for f in all_files:
+                 this_key = self.get_file_key(f)
+                 this_filename = self.get_file_name(f)
+
+                 if skip_common_hidden and this_filename in self.hidden_files:
+                     continue
+
+                 idx += 1
+                 if offset and idx >= int(offset):
+                     offset_reached = True
+
+                 if (limit and listed_files >= int(limit))\
+                         or (offset and not offset_reached):
+                     continue
+
+                 # Verify file size. Having some issues with large PDFs (the
+                 # process is simply killed), so allow the option of skipping
+                 # files above a certain size in bytes.
+                 if size_limit is not None:
+                     size_in_bytes = self.get_file_size(base_path, f)
+                     if size_in_bytes > size_limit:
+                         continue
+
+                 filtered_files.append(
+                     self.get_final_path(base_path, this_key, return_full_path)
+                 )
+                 listed_files += 1
+         except Exception as e:
+             logger.error("Unable to list objects: {0}".format(e))
+
+         return filtered_files
+
+
+ class S3KeyGenerator(KeyGenerator):
+     """ Produce a list of object keys from S3.
+     """
+     def __init__(self, aws_access_key_id, aws_secret_access_key,
+                  aws_region='us-east-1'):
+         super(S3KeyGenerator, self).__init__()
+
+         session = Session(
+             aws_access_key_id=aws_access_key_id,
+             aws_secret_access_key=aws_secret_access_key,
+             region_name=aws_region
+         )
+         self.s3 = session.client('s3')
+
+     def get_file_key(self, file_obj):
+         """ Get file key from s3 object """
+         return file_obj.get('Key', None)
+
+     def get_file_name(self, file_obj):
+         """ Get file name from s3 object """
+         if file_obj is not None:
+             key = file_obj.get('Key', None)
+             if key is not None:
+                 return key.split('/')[-1]
+         return None
+
+     def get_file_size(self, base_path, file_obj):
+         """ Return file size of s3 object """
+         return file_obj.get('Size', 0)
+
+     # All files in bucket
+     # Range of files with an offset
+     def list_files(self, bucket, folder='', limit=None, offset=None,
+                    size_limit=None, return_full_path=True,
+                    skip_common_hidden=True):
+         """ Lists files inside an S3 bucket+folder.
+
+         Note: This does not guarantee any sort of order. Boto+S3 does not
+               provide an interface for sorting results, so that would need
+               to happen in memory.
+
+             limit will include a maximum of 'limit' values
+             offset will start including values only after 'offset' keys
+             size_limit will not include files over a specific size (in bytes)
+             skip_common_hidden will exclude common hidden files
+             return_full_path will include 'bucket/' in key.
+         """
+         files = []
+
+         try:
+             file_data = self.s3.list_objects_v2(
+                 Bucket=bucket, Delimiter='/', Prefix=folder)
+             files = self.list_iterator(
+                 file_data['Contents'],
+                 bucket,
+                 limit=limit,
+                 offset=offset,
+                 size_limit=size_limit,
+                 return_full_path=return_full_path,
+                 skip_common_hidden=skip_common_hidden
+             )
+
+         except Exception as e:
+             logger.error("Unable to list objects: {0}".format(e))
+         return files
+
+
+ class LocalKeyGenerator(KeyGenerator):
+     """ Generic generator to produce a list of file names from filesystem.
+     """
+     def __init__(self):
+         super(LocalKeyGenerator, self).__init__()
+
+     def get_file_key(self, file_obj):
+         """ Get file key from local object """
+         return file_obj
+
+     def get_file_name(self, file_obj):
+         """ Get file name from local object """
+         return file_obj
+
+     def get_file_size(self, base_path, file_obj):
+         """ Get file size from local object """
+         full_path = os.path.normpath(base_path + '/' + file_obj)
+         try:
+             return os.stat(full_path).st_size
+         except Exception:
+             logger.error("File {0} not found ...".format(full_path))
+             return 0
+
+     def list_files(self, folder_path, limit=None, offset=None,
+                    size_limit=None, return_full_path=True,
+                    skip_common_hidden=True):
+         """ Lists all file names inside a path.
+
+             skip_common_hidden will exclude common hidden files
+             return_full_path will include path in addition to filename
+         """
+         files = []
+         try:
+             file_data = os.listdir(folder_path)
+             files = self.list_iterator(
+                 file_data,
+                 folder_path,
+                 limit=limit,
+                 offset=offset,
+                 size_limit=size_limit,
+                 return_full_path=return_full_path,
+                 skip_common_hidden=skip_common_hidden
+             )
+
+         except Exception as e:
+             logger.error("Unable to list objects: {0}".format(e))
+         return files
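
For reference, a minimal usage sketch of the generators above (the bucket name,
prefix, credentials, and local path are placeholders for illustration, not values
shipped with the package):

    from pypeline.generators import S3KeyGenerator, LocalKeyGenerator

    # List up to 10 keys under a prefix, skipping objects larger than 5 MB.
    s3_gen = S3KeyGenerator('MY_ACCESS_KEY_ID', 'MY_SECRET_ACCESS_KEY',
                            aws_region='us-east-1')
    keys = s3_gen.list_files('my-bucket', folder='reports/', limit=10,
                             size_limit=5 * 1024 * 1024)

    # The same offset/limit filtering applied to a local directory.
    local_gen = LocalKeyGenerator()
    paths = local_gen.list_files('/tmp/reports', offset=5, limit=10)
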
File without changes
@@ -0,0 +1,159 @@
+ """ A *local* configuration server to serve local pipeline and schedule configs.
+
+ This should be used for development purposes only.
+
+ Note: `deployment_id` is unused in all endpoints but is included in this
+ development server for full compatibility with Sermos managed deployments.
+ """
+ import json
+ import yaml
+ import logging
+ from typing import Union
+ from marshmallow.exceptions import ValidationError
+ from flask import Flask, request, jsonify
+ from rho_web.response import abort
+ from pypeline.schedule_config_schema import BaseScheduleSchema
+
+ logger = logging.getLogger(__name__)
+ api = Flask(__name__)
+ PREFIX = '/api/v1'
+
+
+ def set_api_config(base_dir: str = None,
+                    pipelines_yaml: str = None,
+                    schedules_json: str = None):
+     """ Establish baseline api configuration (where to find config files)
+     """
+     api.config.update(
+         BASE_DIR=base_dir if base_dir else 'dev',
+         PIPELINES_YAML=pipelines_yaml if pipelines_yaml else 'pipelines.yaml',
+         SCHEDULES_JSON=schedules_json if schedules_json else 'schedules.json')
+
+
+ set_api_config()  # Set by default, can overload manually before starting
+
+
+ def _retrieve_schedules() -> Union[dict, None]:
+     """ Load local schedules.json file
+     """
+     filename = api.config['BASE_DIR'] + '/' + api.config['SCHEDULES_JSON']
+     with open(filename, 'r') as f:
+         schedules = json.loads(f.read())
+     return schedules
+
+
+ def _save_schedules(schedules: dict) -> None:
+     """ Save local schedules.json file
+     """
+     filename = api.config['BASE_DIR'] + '/' + api.config['SCHEDULES_JSON']
+     with open(filename, 'w') as f:
+         f.write(json.dumps(schedules))
+
+
+ def _retrieve_pipelines() -> Union[dict, None]:
+     """ Load the local pipelines.yaml file with all pipeline configurations.
+     """
+     filename = api.config['BASE_DIR'] + '/' + api.config['PIPELINES_YAML']
+     with open(filename, 'r') as f:
+         pipelines = yaml.safe_load(f.read())
+     return pipelines
+
+
+ def _retrieve_pipeline(pipeline_id: str) -> Union[dict, None]:
+     """ Load local pipelines.yaml and load a specific pipeline configuration.
+     """
+     pipelines = _retrieve_pipelines()
+     for pipeline in pipelines['pipelines']:
+         if pipeline['metadata']['pipelineId'] == pipeline_id:
+             return pipeline
+     return None
+
+
+ @api.route(PREFIX + '/deployments/<string:deployment_id>/schedule_tasks',
+            methods=['GET'])
+ def get_schedules(deployment_id: str):
+     """ Load local schedules.json file.
+     """
+     logger.debug(f"Retrieving schedules for {deployment_id}")
+
+     schedules = _retrieve_schedules()
+     try:
+         BaseScheduleSchema().load(schedules)
+     except Exception as e:
+         logger.error(f"Error retrieving schedules: {e}")
+         abort(400, message=str(e))
+     return jsonify(schedules)
+
+
+ @api.route(PREFIX + '/deployments/<string:deployment_id>/schedule_tasks/'
+            '<string:task_id>',
+            methods=['POST'])
+ def update_schedules(deployment_id: str, task_id: str):
+     """ Update local schedules.json file with values from provided schedule.
+
+     Primarily this is intended to keep the last_run_at and total_run_count
+     values up to date.
+     """
+     logger.debug(f"Updating schedules for {deployment_id}")
+
+     new_schedules = json.loads(request.data)  # Schedules with updates
+     schedules = _retrieve_schedules()  # Schedules known to Sermos
+     update_vars = ('last_run_at', 'total_run_count')
+     for s in schedules['schedules']:
+         for new_s in new_schedules['schedules']:
+             if s['name'] == new_s['name']:
+                 for var in update_vars:
+                     s[var] = new_s[var]
+     try:
+         scs = BaseScheduleSchema()
+         scs.load(schedules)  # Validate new schedule
+     except ValidationError:
+         abort(400, message="Invalid new schedule ...")
+
+     _save_schedules(schedules)
+
+     return jsonify({'message': 'Schedules updated ...'})
+
+
+ @api.route(PREFIX + '/deployments/<string:deployment_id>/pipelines',
+            methods=['GET'])
+ def get_pipelines(deployment_id: str):
+     """ Load local pipelines.yaml file
+     """
+     logger.debug(f"Retrieving pipelines for {deployment_id}")
+
+     pipelines = _retrieve_pipelines()
+
+     # Transform into what we expect from the Cloud API server. The local
+     # pipelines.yaml file format is for your own development and reference
+     # if you choose to deploy independently.
+     retval = []
+     for pipeline in pipelines['pipelines']:
+         retval.append(pipeline)
+     return jsonify({'data': {'results': retval}})
+
+
+ @api.route(PREFIX + '/pipelines/<string:deployment_id>/<string:pipeline_id>',
+            methods=['GET'])
+ def get_pipeline(deployment_id: str, pipeline_id: str):
+     """ Load local pipelines.yaml and retrieve a specific pipeline.
+     """
+     logger.debug(f"Retrieving pipeline for {deployment_id} / {pipeline_id}")
+
+     pipeline = _retrieve_pipeline(pipeline_id)
+
+     if pipeline is not None:
+         return jsonify({'data': pipeline})
+     return jsonify({}), 404
+
+
+ @api.route(PREFIX + '/auth', methods=['POST'])
+ def validate_access_key():
+     """ Validate a provided API key.
+
+     NOTE: This is a *mock* endpoint, no actual validation occurs.
+     """
+     access_key = request.headers.get('accesskey', None)
+     if access_key is None:
+         abort(401)
+     return jsonify({})
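
For local development, this module can be run as a small Flask app. A minimal
sketch follows; the import path is an assumption, since this diff does not show
the file's name within the package, so adjust it to wherever the module lives:

    from pypeline.pipeline_config_api import api, set_api_config  # path assumed

    # Point the server at local config files before starting it.
    set_api_config(base_dir='dev',
                   pipelines_yaml='pipelines.yaml',
                   schedules_json='schedules.json')
    api.run(port=5000, debug=True)

    # e.g. GET http://localhost:5000/api/v1/deployments/local/pipelines
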
@@ -0,0 +1,171 @@
+ import os
+ import logging
+ import logging.config
+ from pypeline import __version__
+ from logging import StreamHandler
+
+ logging_set = False
+
+
+ def get_log_level(level: str = None) -> int:
+     """ Attempt to get the log level from the environment, otherwise use the
+     default INFO level. The environment variable LOG_LEVEL should be e.g.
+     'DEBUG'
+     """
+     if level is not None:
+         level_str = str(level)
+     else:
+         level_str = os.environ.get('LOG_LEVEL', 'INFO')
+     return getattr(logging, level_str.upper(), logging.INFO)
+
+
+ def get_log_format(type: str = 'standard',
+                    app_version: str = None,
+                    client_version: str = None):
+     """ Standard log format. Supports `standard` and `simple`
+     """
+     if app_version is None:
+         app_version = "?"
+     if client_version is None:
+         client_version = "?"
+
+     format = '%(message)s'
+     if type == 'standard':
+         format = '%(process)d - %(levelname)s - %(asctime)s - '\
+             + '%(filename)s (%(lineno)d) - '\
+             + 'sermos v{} - client v{} - %(message)s'\
+             .format(app_version, client_version)
+     elif type == 'simple':
+         format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+     return format
+
+
+ def get_date_format():
+     """ Standard date format
+     """
+     return '%Y-%m-%dT%H:%M:%S'
+
+
+ def setup_logging(app_version: str = None,
+                   client_version: str = None,
+                   default_level: str = None,
+                   overload_elasticsearch: bool = False,
+                   establish_logging_config: bool = True):
+     """ Set up logging configuration for standard streaming output + optional
+     log aggregator.
+
+     Standard usage is to invoke this at application bootstrapping time
+     to establish default log handling, e.g.
+
+         def create_app():
+             setup_logging()
+
+     Individual application modules should load a logger like normal:
+         import logging
+         logger = logging.getLogger(__name__)
+
+     elasticsearch-py is overly verbose with its 'info' logging. This
+     will set that logger to `warning` if `overload_elasticsearch` is True.
+
+     `establish_logging_config` is intended to be used by something invoking
+     setup_logging() explicitly with the intention of setting the final
+     configuration, which is the default behavior. Set this to `False` in the
+     case where you might not be sure if logging has been set up yet.
+     """
+     global logging_set
+
+     if logging_set and not establish_logging_config:
+         return
+
+     if establish_logging_config or not logging_set:
+         logging_set = True
+
+     # Set our application version values, which can be passed to this method.
+     # By default, we report the app versions for sermos and the client.
+     A_VERSION = __version__  # sermos version
+     CA_VERSION = None  # application version of client app using sermos
+     if app_version is not None:
+         A_VERSION = app_version
+     if client_version is not None:
+         CA_VERSION = client_version
+
+     log_level = get_log_level(default_level)
+
+     config = {
+         'disable_existing_loggers': False,
+         'version': 1,
+         'formatters': {
+             'simple': {
+                 'format':
+                     get_log_format(type='simple',
+                                    app_version=A_VERSION,
+                                    client_version=CA_VERSION),
+                 'datefmt':
+                     get_date_format()
+             },
+             'standard': {
+                 'format':
+                     get_log_format(type='standard',
+                                    app_version=A_VERSION,
+                                    client_version=CA_VERSION),
+                 'datefmt':
+                     get_date_format()
+             },
+         },
+         'handlers': {
+             'consoleFull': {
+                 'level': 'DEBUG',
+                 'formatter': 'standard',
+                 'class': 'logging.StreamHandler',
+                 'stream': 'ext://sys.stdout'
+             },
+         },
+         'loggers': {
+             '': {
+                 'handlers': ['consoleFull'],
+                 'level': 'ERROR',
+             },
+             'sermos': {
+                 'handlers': ['consoleFull'],
+                 'level': 'DEBUG',
+                 'propagate': False
+             },
+             'timing': {
+                 'handlers': ['consoleFull'],
+                 'level': 'DEBUG',
+                 'propagate': False
+             },
+             'celery': {
+                 'handlers': ['consoleFull'],
+                 'level': 'DEBUG',
+                 'propagate': False
+             },
+             'bin': {
+                 'handlers': ['consoleFull'],
+                 'level': 'DEBUG',
+                 'propagate': False
+             },
+         },
+         'root': {
+             'level': 'DEBUG',
+             'handlers': ['consoleFull']
+         }
+     }
+
+     for handler, handler_config in config['handlers'].items():
+         # Override this handler's level to the level passed to this method
+         handler_config['level'] = log_level
+         config['handlers'][handler] = handler_config
+
+     # Set the root handler's level
+     config['root']['level'] = log_level
+
+     logging.config.dictConfig(config)
+
+     es_logger = logging.getLogger('elasticsearch')
+     if overload_elasticsearch is True:
+         es_logger.setLevel(logging.WARNING)
+     else:
+         # Reset to the baseline level in case this is invoked multiple times.
+         es_logger.setLevel(logging.INFO)
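
A minimal sketch of bootstrapping an application with this logging setup (the
import path is an assumption, since this diff does not show the file's name
within the package):

    import logging
    from pypeline.logging_config import setup_logging  # path assumed

    setup_logging(client_version='0.1.0', default_level='DEBUG',
                  overload_elasticsearch=True)
    logger = logging.getLogger(__name__)
    logger.info("Logging configured ...")
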