scalable-pypeline 1.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,327 @@
+ """ General utilities used frequently in configuration-related tasks.
+
+ More specifically, these are methods that help interact with Pipeline and
+ Schedule configurations that originate from your `sermos.yaml` file. These
+ utility functions make it easy to switch between `local` and `cloud` modes
+ based on the value of `DEFAULT_BASE_URL` in your environment.
+
+ - If the base url is `local`, then all config tasks will read directly from
+ your local `sermos.yaml` file. Update operations will *not* do anything (that
+ is, your sermos.yaml file will not be updated).
+
+ - If the base url is anything other than `local`, this will assume a cloud
+ API url was provided (if None is set in the environment, Sermos will default
+ to the Sermos Cloud base API, assuming this is a Sermos Cloud deployment).
+ You can provide your own cloud API endpoints if desired; see the
+ documentation for best practices.
+
+ TODO: Remove the dependency on Redis and make caching behavior optional.
+ """
+ import os
+ import logging
+ import json
+ from typing import Union, Any
+ from urllib.parse import urljoin
+ import requests
+ from rhodb.redis_conf import RedisConnector
+ from pypeline.constants import DEFAULT_BASE_URL, PIPELINE_CONFIG_CACHE_KEY, \
+     SCHEDULE_CONFIG_CACHE_KEY, CONFIG_REFRESH_RATE, USING_SERMOS_CLOUD, \
+     LOCAL_DEPLOYMENT_VALUE, DEFAULT_CONFIG_RETRIEVAL_PAGE_SIZE
+ from pypeline.sermos_yaml import load_sermos_config
+ from pypeline.pipeline_config_schema import BasePipelineSchema
+ from pypeline.schedule_config_schema import BaseScheduleSchema
+
+ logger = logging.getLogger(__name__)
+ redis_conn = RedisConnector().get_connection()
+
+
+ def get_access_key(access_key: Union[str, None] = None,
+                    env_var_name: str = 'SERMOS_ACCESS_KEY'):
+     """ Simple helper to get the admin server access key in a standard
+     fashion. If one is provided, return it back. If not, look in the
+     environment for `env_var_name`. If that doesn't exist, raise a useful
+     error.
+
+     If this is a local deployment, no access key is required/relevant,
+     so simply return 'local'.
+     """
+     if access_key is not None:
+         return access_key
+
+     if not USING_SERMOS_CLOUD:
+         return LOCAL_DEPLOYMENT_VALUE  # e.g. 'local'
+
+     try:
+         return os.environ[env_var_name]
+     except KeyError:
+         raise KeyError(
+             f"{env_var_name} not found in this environment. Find a valid "
+             "access key in your Sermos Cloud administration console.")
+
+
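A minimal usage sketch of the key-resolution order (the key values here are hypothetical):

    import os
    os.environ['SERMOS_ACCESS_KEY'] = 'abc-123'
    get_access_key()            # -> 'abc-123' (read from the environment)
    get_access_key('explicit')  # -> 'explicit' (an explicit value wins)
    # With DEFAULT_BASE_URL=local, get_access_key() -> 'local'
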
+ # TODO cast to UUID?
+ def get_deployment_id(deployment_id: Union[str, None] = None,
+                       env_var_name: str = 'SERMOS_DEPLOYMENT_ID'):
+     """ Simple helper to get the deployment id in a standard fashion. Look
+     in the environment for `env_var_name`. If that doesn't exist, raise a
+     useful error.
+
+     If this is a local deployment, no deployment id is required/relevant,
+     so this will simply return 'local' in the event the DEFAULT_BASE_URL is
+     set to the LOCAL_DEPLOYMENT_VALUE ('local' by default) in the
+     environment.
+     """
+     if deployment_id is not None:
+         return deployment_id
+
+     if not USING_SERMOS_CLOUD:
+         return LOCAL_DEPLOYMENT_VALUE  # e.g. 'local'
+
+     try:
+         return os.environ[env_var_name]
+     except KeyError:
+         raise KeyError(
+             f"{env_var_name} not found in this environment. Note: this is "
+             "required when running a Celery worker as `beat`. Find this ID "
+             "in your administration console. For local development, this "
+             "can be any arbitrary string.")
+
+
+ def load_json_config_from_redis(key: str) -> Any:
+     """ Load a JSON key from Redis. Special carve-out for keys explicitly
+     set to "none".
+     """
+     val = redis_conn.get(key)
+     if val is None or val.decode('utf-8').lower() == 'none':
+         return None
+     return json.loads(val)
+
+
+ def set_json_config_to_redis(key: str,
+                              data: Union[dict, None],
+                              refresh_rate: int = CONFIG_REFRESH_RATE):
+     """ For Admin API actions (e.g. schedules/pipelines), deployments cache
+     results. The standard method for doing this is through a refresh key,
+     which is set in Redis to expire after CONFIG_REFRESH_RATE. This will
+     set the cached key.
+
+     The rationale for manually setting a "None" key instead of simply
+     skipping it is to protect against the case of a spammed config request
+     for an unknown pipeline, for example. This still limits our requests to
+     Sermos Cloud based on the refresh rate, even in that scenario.
+     """
+     if data is None:
+         data = 'None'
+     else:
+         data = json.dumps(data)
+
+     redis_conn.setex(key, refresh_rate, data)
+
+
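A short sketch of the negative-caching round trip these two helpers implement (assuming a reachable Redis instance; the keys are hypothetical):

    set_json_config_to_redis('config:unknown', None)   # stores the literal string 'None'
    load_json_config_from_redis('config:unknown')      # -> None until the key expires
    set_json_config_to_redis('config:known', {'a': 1})
    load_json_config_from_redis('config:known')        # -> {'a': 1}
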
+ def _generate_api_url(endpoint: str = ''):
+     """ Provide a normalized url based on the base url and endpoint, and
+     add the deployment_id to the url, which is required for all default
+     pipeline/schedule endpoints if using Sermos Cloud.
+
+     The Sermos Cloud API spec bases everything on the notion of
+     `deployments`, so if you are rolling your own 'non-local' API, you will
+     need to mock this concept in order to use the built-in helper functions
+     for retrieving pipelines and schedules from an API source.
+     """
+     deployment_id = get_deployment_id()  # From env if None
+     return urljoin(DEFAULT_BASE_URL, f'deployments/{deployment_id}/{endpoint}')
+
+
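For illustration, with a hypothetical base url of `https://cloud.sermos.ai/api/v1/` and a deployment id of `abc123`:

    _generate_api_url('pipelines')
    # -> 'https://cloud.sermos.ai/api/v1/deployments/abc123/pipelines'
    # Note: urljoin() drops the final path segment of the base url unless it
    # ends with a trailing slash, so DEFAULT_BASE_URL should end in '/'.
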
+ def _retrieve_and_cache_config(key: str,
+                                admin_api_endpoint: str,
+                                access_key: str,
+                                refresh_rate: int = CONFIG_REFRESH_RATE) -> Any:
+     """ Attempt to load a configuration (pipeline/schedule) from cache. If
+     not available, retrieve the API response from the Sermos Config Server
+     and cache the response for CONFIG_REFRESH_RATE seconds in local Redis.
+     """
+     conf = load_json_config_from_redis(key)
+     if conf is not None:
+         return conf
+
+     # Ask Sermos Cloud (Note: Sermos Cloud's API expects `apikey`)
+     headers = {
+         'apikey': access_key,
+     }
+
+     params = {
+         'page_size': DEFAULT_CONFIG_RETRIEVAL_PAGE_SIZE,
+         'page': 1
+     }
+
+     r = requests.get(admin_api_endpoint, headers=headers, verify=True,
+                      params=params)
+
+     data = None
+     if r.status_code == 200:
+         data = r.json()
+     else:
+         logger.warning(f"Non-200 response retrieving {admin_api_endpoint}: "
+                        f"{r.status_code}, {r.reason}")
+
+     # There's a chance we need to request ALL schedule configs from Sermos
+     # Cloud for the scheduled tasks. Loop and grab all of them. Guard
+     # against `data` being None if the initial request failed.
+     while key == SCHEDULE_CONFIG_CACHE_KEY and data is not None and \
+             len(data['data']['results']) < data['data']['count']:
+         params['page'] += 1
+         r = requests.get(admin_api_endpoint, headers=headers, verify=True,
+                          params=params)
+         if r.status_code == 200:
+             paginated_data = r.json()
+             data['data']['results'] = data['data']['results'] + \
+                 paginated_data['data']['results']
+         else:
+             logger.warning(
+                 f"Non-200 response retrieving {admin_api_endpoint}: "
+                 f"{r.status_code}, {r.reason}")
+             break
+
+     # Cache result
+     if data is not None:
+         set_json_config_to_redis(key, data, refresh_rate)
+
+     return data
+
+
+ def retrieve_latest_pipeline_config(
+         pipeline_id: Union[str, None] = None,
+         access_key: Union[str, None] = None,
+         refresh_rate: int = CONFIG_REFRESH_RATE) -> Union[dict, list]:
+     """ Retrieve the 'latest' pipeline configuration.
+
+     Sermos can be deployed in 'local' mode by setting DEFAULT_BASE_URL=local
+     in your environment. In this case, Sermos will retrieve the latest
+     configuration from the local filesystem, specifically looking inside
+     the sermos.yaml file.
+
+     If the DEFAULT_BASE_URL is anything else, this will assume that it is a
+     valid API base url and make a request. The request will be formatted to
+     match what Sermos Cloud expects for seamless Sermos Cloud deployments.
+     However, you can provide any base url and stand up your own API if
+     desired!
+
+     This utilizes redis (required for Sermos-based pipelines/scheduled
+     tasks) to cache the result for a predetermined amount of time before
+     requesting an update. This is because pipelines/tasks can be invoked
+     rapidly but do not change frequently.
+     """
+     # If this is a LOCAL deployment, look to sermos.yaml directly
+     if not USING_SERMOS_CLOUD:
+         sermos_config = load_sermos_config()
+         if 'pipelines' in sermos_config:
+             pipelines = []
+             found_pipeline = None
+             for p_id, config in sermos_config['pipelines'].items():
+                 config['sermosPipelineId'] = p_id
+                 if pipeline_id == p_id:
+                     found_pipeline = config
+                     break
+                 pipelines.append(config)
+
+             if pipeline_id:
+                 if found_pipeline:
+                     return found_pipeline
+                 raise ValueError(f'Invalid pipeline {pipeline_id}')
+
+             return pipelines
+         return None
+
+     # If this is a CLOUD deployment, generate a valid API url and ask the
+     # API service for pipeline configuration. If this deployment is set up
+     # to cache results, do so.
+     cache_key = PIPELINE_CONFIG_CACHE_KEY.format(pipeline_id)
+     access_key = get_access_key(access_key)  # From env if None
+
+     # Generate pipeline specific API endpoint. If pipeline_id
+     # is None, then we're asking for 'all' pipelines.
+     api_url = _generate_api_url('pipelines')
+     if pipeline_id is not None:
+         api_url = urljoin(api_url + '/', pipeline_id)  # Add pipeline ID
+
+     # Retrieve (and cache) result - this will be the exact result from the
+     # API response.
+     data = _retrieve_and_cache_config(cache_key, api_url, access_key,
+                                       refresh_rate)
+     if data:
+         if pipeline_id:
+             return data['data']
+         return data['data']['results']
+     return None
+
+
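A local-mode sketch (assuming DEFAULT_BASE_URL=local and a sermos.yaml containing a hypothetical `demo-pipeline` entry under `pipelines:`):

    config = retrieve_latest_pipeline_config(pipeline_id='demo-pipeline')
    config['sermosPipelineId']                        # -> 'demo-pipeline'
    all_configs = retrieve_latest_pipeline_config()   # list of every pipeline config
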
+ def retrieve_latest_schedule_config(access_key: Union[str, None] = None,
+                                     refresh_rate: int = CONFIG_REFRESH_RATE):
+     """ Retrieve the 'latest' scheduled tasks configuration.
+
+     Sermos can be deployed in 'local' mode by setting DEFAULT_BASE_URL=local
+     in your environment. In this case, Sermos will retrieve the latest
+     configuration from the local filesystem, specifically looking inside
+     the sermos.yaml file.
+
+     If the DEFAULT_BASE_URL is anything else, this will assume that it is a
+     valid API base url and make a request. The request will be formatted to
+     match what Sermos Cloud expects for seamless Sermos Cloud deployments.
+     However, you can provide any base url and stand up your own API if
+     desired!
+
+     This utilizes redis (required for Sermos-based pipelines/scheduled
+     tasks) to cache the result for a predetermined amount of time before
+     requesting an update. This is because pipelines/tasks can be invoked
+     rapidly but do not change frequently.
+     """
+     if not USING_SERMOS_CLOUD:
+         sermos_config = load_sermos_config()
+         if 'scheduledTasks' in sermos_config:
+             tasks = []
+             for task_id, config in sermos_config['scheduledTasks'].items():
+                 config['sermosScheduledTasksId'] = task_id
+                 tasks.append(config)
+             return tasks
+         return None
+
+     cache_key = SCHEDULE_CONFIG_CACHE_KEY
+     access_key = get_access_key(access_key)  # From env if None
+
+     api_url = _generate_api_url('scheduled_tasks')
+
+     data = _retrieve_and_cache_config(cache_key, api_url, access_key,
+                                       refresh_rate)
+
+     schedules = []
+     if data is None:  # e.g. the API request failed and nothing was cached
+         return schedules
+     for schedule in data['data']['results']:
+         ScheduleSchema = \
+             BaseScheduleSchema.get_by_version(schedule['schemaVersion'])
+         schema = ScheduleSchema()
+         _schedule = schema.load(schedule)
+         _schedule['id'] = schedule['id']
+         schedules.append(_schedule)
+
+     return schedules
+
+
+ def update_schedule_config(new_schedule_config: dict,
+                            access_key: Union[str, None] = None,
+                            schedule_config_endpoint: Union[str, None] = None):
+     """ Tell Sermos to update a deployment's schedule with a new version.
+     """
+     # Don't send status to Sermos Cloud if we're running in local mode
+     if not USING_SERMOS_CLOUD:
+         return True
+
+     access_key = get_access_key(access_key)  # From env if None
+     api_url = _generate_api_url('scheduled_tasks')
+
+     # Ask Sermos Cloud (Note: Sermos Cloud's API expects `apikey`)
+     headers = {'apikey': access_key}
+
+     for scheduled_task in new_schedule_config['schedules']:
+         copy_task = dict(scheduled_task)
+         task_id = copy_task.pop('id')
+         url = f"{api_url}/{task_id}"
+         r = requests.put(url, json=copy_task, headers=headers, verify=True)
+         if r.status_code != 200:
+             logger.error("Unable to update scheduled task in Sermos Cloud")
+             logger.error(r.json())
+             return False
+
+     return True
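The expected payload shape, sketched with hypothetical field names (only `id` is required by this helper, which pops it off to build the PUT url):

    schedule_update = {
        'schedules': [
            {'id': 'task-uuid-123', 'name': 'nightly-job', 'enabled': True}
        ]
    }
    update_schedule_config(schedule_update)  # -> True on success (or in local mode)
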
@@ -0,0 +1,144 @@
+ import logging
+ import networkx as nx
+ from typing import List, Union
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_execution_graph(
+         config: dict,
+         adjacency_key: str = 'dagAdjacency',
+         task_definitions_key: str = 'taskDefinitions') -> nx.DiGraph:
+     """ Generate a directed graph based on a pipeline config's adjacency
+     list and task definitions.
+
+     `dagAdjacency` is a dictionary containing all nodes and their
+     downstream nodes.
+
+     `taskDefinitions` is a dictionary containing the metadata required for
+     each node, such as the worker, model version, etc. This metadata is
+     attached to each node so it can be retrieved directly from the graph.
+     """
+     G = nx.DiGraph()
+
+     # Get our adjacency list and task definitions
+     adjacency_dict = config.get(adjacency_key, {})
+     task_definitions = config.get(task_definitions_key, {})
+     if len(adjacency_dict.keys()) == 0:
+         logger.warning('Adjacency definition `{}` was not found ...'.format(
+             adjacency_key))
+
+     # Build the graph
+     for node in adjacency_dict.keys():
+         adjacent_nodes = adjacency_dict[node]
+
+         # If there are no adjacent nodes, then this is a terminal node
+         if len(adjacent_nodes) == 0:
+             G.add_node(node, attr_dict=task_definitions.get(node, {}))
+             continue
+
+         # Otherwise, add an edge from this node to all adjacent nodes
+         # and attach the task definition metadata to each edge
+         G.add_edges_from([(node, n, task_definitions.get(n, {}))
+                           for n in adjacent_nodes])
+     return G
+
+
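A small sketch of how a pipeline config translates into a graph (the config values here are hypothetical):

    config = {
        'dagAdjacency': {'t1': ['t3'], 't2': ['t3', 't4'], 't4': ['t5']},
        'taskDefinitions': {'t3': {'worker': 'worker-a'}}
    }
    G = get_execution_graph(config)
    list(G.edges)  # -> [('t1', 't3'), ('t2', 't3'), ('t2', 't4'), ('t4', 't5')]
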
+ def find_entry_points(G: nx.DiGraph) -> List[str]:
+     """ Find the entrypoint(s) for this graph.
+
+     An entrypoint is a node for which no predecessors exist.
+     """
+     result = []
+     for node in G.nodes:
+         if len(list(G.predecessors(node))) == 0:
+             result.append(node)
+     return result
+
+
+ def find_successors(G: nx.DiGraph,
+                     nodes: Union[List[str], str],
+                     dedup: bool = True) -> Union[List[str], List[List[str]]]:
+     """ Find the next node(s) for the given graph node(s).
+
+     If dedup is True (default), return a single list of deduplicated
+     values. This is useful when creating a task chain that is comprised
+     of groups that can execute concurrently. If two upstream tasks in the
+     chain each invoke the same downstream task later in the chain, then
+     there is no reason to run that downstream task twice.
+
+     Examples:
+         `G`:
+             t1:
+                 - t3
+             t2:
+                 - t3
+                 - t4
+             t4:
+                 - t5
+         `nodes`: [t1, t2]
+
+         Return with dedup==True: [t3, t4]
+         Return with dedup==False: [[t3], [t3, t4]]
+     """
+     if not isinstance(nodes, list):
+         nodes = [nodes]
+
+     successors = []
+     for node in nodes:
+         successors.append(list(G.successors(node)))
+
+     # Return as-is if we're not deduplicating.
+     if not dedup:
+         return successors
+
+     # Deduplicate the list of successors while preserving order.
+     deduped_successors = []
+     for group in successors:
+         for node in group:
+             if node not in deduped_successors:
+                 deduped_successors.append(node)
+     return deduped_successors
+
+
+ def get_chainable_tasks(G: nx.DiGraph,
+                         starting_nodes: Union[List[str], None] = None,
+                         graph_tasks: Union[List[List[str]], None] = None
+                         ) -> List[List[str]]:
+     """ Recursive function to get a list of grouped nodes that can be used
+     in a task chain.
+
+     On the first call, the entrypoint(s) form the initial group. Each
+     recursive call then uses the successors of the previous group as the
+     next group of starting nodes, so the return value is something like:
+         [
+             [t1, t2],
+             [t3, t4],
+             [t5]
+         ]
+
+     Note: `graph_tasks` defaults to None rather than a mutable `[]`
+     default so that repeated calls do not share state.
+     """
+     if graph_tasks is None:
+         graph_tasks = []
+     if starting_nodes is None:
+         starting_nodes = find_entry_points(G)
+         graph_tasks.append(starting_nodes)
+
+     successors = find_successors(G, starting_nodes)
+     if len(successors) == 0:
+         return graph_tasks
+
+     graph_tasks.append(successors)
+     return get_chainable_tasks(G, successors, graph_tasks)
+
+
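Continuing the hypothetical graph sketch from above, the grouped chain resolves as:

    find_entry_points(G)    # -> ['t1', 't2']
    get_chainable_tasks(G)  # -> [['t1', 't2'], ['t3', 't4'], ['t5']]
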
+ def find_all_nodes(G: nx.DiGraph) -> List[str]:
+     """ Get a list of all nodes in the graph.
+     """
+     return list(G.nodes)
+
+
+ def find_all_edges(G: nx.DiGraph) -> List[str]:
+     """ Get a list of all edges in the graph.
+     """
+     return list(G.edges)
@@ -0,0 +1,119 @@
+ """ Utilities for loading modules/callables based on strings.
+ """
+ import os
+ import re
+ import logging
+ import importlib
+ from typing import Callable
+ from pypeline.constants import SERMOS_ACCESS_KEY, SERMOS_CLIENT_PKG_NAME
+
+ logger = logging.getLogger(__name__)
+
+
+ class SermosModuleLoader(object):
+     """ Helper class to load modules / classes / methods based on a path
+     string.
+     """
+     def get_module(self, resource_dot_path: str):
+         """ Retrieve the module based on a 'resource dot path'.
+             e.g. package.subdir.feature_file.MyCallable
+         """
+         module_path = '.'.join(resource_dot_path.split('.')[:-1])
+         module = importlib.import_module(module_path)
+         return module
+
+     def get_callable_name(self, resource_dot_path: str) -> str:
+         """ Retrieve the callable's name based on a config string.
+             e.g. package.subdir.feature_file.MyCallable
+         """
+         callable_name = resource_dot_path.split('.')[-1]
+         return callable_name
+
+     def get_callable(self, resource_dot_path: str) -> Callable:
+         """ Retrieve the actual handler class based on a config string.
+             e.g. package.subdir.feature_file.MyCallable
+         """
+         module = self.get_module(resource_dot_path)
+         callable_name = self.get_callable_name(resource_dot_path)
+         return getattr(module, callable_name)
+
+
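A brief usage sketch, using a stdlib dot path purely for illustration:

    loader = SermosModuleLoader()
    fn = loader.get_callable('json.dumps')  # imports `json`, returns `json.dumps`
    fn({'a': 1})                            # -> '{"a": 1}'
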
+ def normalized_pkg_name(pkg_name: str, dashed: bool = False):
+     """ We maintain consistency by always specifying the package name as
+     the "dashed version".
+
+     Python/setuptools will replace "_" with "-" but resource_filename()
+     expects the exact directory name, essentially. In order to keep it
+     simple upstream and *always* provide the package name as the dashed
+     version, we do the replacement here to 'normalize' both versions to
+     whichever convention you need at the time.
+
+     if `dashed`:
+         my-package-name --> my-package-name
+         my_package_name --> my-package-name
+
+     else:
+         my-package-name --> my_package_name
+         my_package_name --> my_package_name
+     """
+     if dashed:
+         return str(pkg_name).replace('_', '-')
+     return str(pkg_name).replace('-', '_')
+
+
+ def get_client_pkg_name(pkg_name: str = None):
+     """ Verify the package name provided; get it from the environment if
+     None.
+
+     Raise if neither provided nor found.
+
+     Arguments:
+         pkg_name (optional): Directory name for your Python
+             package. e.g. my_package_name . If none provided, will check
+             environment for `SERMOS_CLIENT_PKG_NAME`. If not found,
+             will raise a ValueError.
+     """
+     pkg_name = pkg_name if pkg_name else SERMOS_CLIENT_PKG_NAME
+     if pkg_name is None:
+         msg = "Unable to find `pkg-name` in CLI arguments nor in "\
+               "environment under `{}`".format('SERMOS_CLIENT_PKG_NAME')
+         logger.error(msg)
+         raise ValueError(msg)
+     return pkg_name
+
+
+ def match_prefix(string: str, prefix_p: str) -> bool:
+     """ For a given string, determine whether it begins with the provided
+     prefix_p.
+     """
+     pattern = re.compile('^(' + prefix_p + ').*')
+     if pattern.match(string):
+         return True
+     return False
+
+
+ def match_suffix(string: str, suffix_p: str) -> bool:
+     """ For a given string, determine whether it ends with the provided
+     suffix_p.
+     """
+     pattern = re.compile('.*(' + suffix_p + ')$')
+     if pattern.match(string):
+         return True
+     return False
+
+
+ def match_prefix_suffix(string: str, prefix_p: str, suffix_p: str) -> bool:
+     """ For a given string, determine whether it starts with prefix_p and
+     ends with suffix_p.
+     """
+     if match_prefix(string, prefix_p) and match_suffix(string, suffix_p):
+         return True
+     return False
+
+
+ def find_from_environment(prefix_p: str, suffix_p: str) -> list:
+     """ Find all environment variables that match the prefix and suffix.
+
+     Can provide any regex compatible string as values.
+     """
+     matching_vars = []
+     environment_vars = os.environ
+     for var in environment_vars:
+         if match_prefix_suffix(var, prefix_p, suffix_p):
+             matching_vars.append(var)
+     return matching_vars
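A short sketch (the variable names are hypothetical; ordering follows the process environment):

    import os
    os.environ['MY_WORKER_NAME'] = 'w1'
    os.environ['MY_QUEUE_NAME'] = 'q1'
    find_from_environment('MY_', '_NAME')  # -> ['MY_WORKER_NAME', 'MY_QUEUE_NAME']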