scalable-pypeline 1.1.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
@@ -0,0 +1,327 @@
+ """ General utilities used frequently in configuration-related tasks.
+
+ More specifically, these are methods that help interact with Pipeline and
+ Schedule configurations that originate from your `sermos.yaml` file. These
+ utility functions make it easy to switch between `local` and `cloud` modes
+ based on the value of `DEFAULT_BASE_URL` in your environment.
+
+ - If the base url is `local`, then all config tasks will read directly from
+ your local `sermos.yaml` file. Update operations will *not* do anything (that
+ is, your sermos.yaml file will not be updated).
+
+ - If the base url is anything other than `local`, this will assume a cloud
+ API url was provided (if None is set in the environment, Sermos will default
+ to the Sermos Cloud base API, assuming this is a Sermos Cloud deployment).
+ You can provide your own cloud API endpoints if desired; see the
+ documentation for best practices.
+
+ TODO: Remove the dependency on Redis and make caching behavior optional.
+ """
+ import os
+ import logging
+ import json
+ from typing import Union, Any
+ from urllib.parse import urljoin
+ import requests
+ from rhodb.redis_conf import RedisConnector
+ from pypeline.constants import DEFAULT_BASE_URL, PIPELINE_CONFIG_CACHE_KEY, \
+     SCHEDULE_CONFIG_CACHE_KEY, CONFIG_REFRESH_RATE, USING_SERMOS_CLOUD, \
+     LOCAL_DEPLOYMENT_VALUE, DEFAULT_CONFIG_RETRIEVAL_PAGE_SIZE
+ from pypeline.sermos_yaml import load_sermos_config
+ from pypeline.pipeline_config_schema import BasePipelineSchema
+ from pypeline.schedule_config_schema import BaseScheduleSchema
+
+ logger = logging.getLogger(__name__)
+ redis_conn = RedisConnector().get_connection()
+
+
+ def get_access_key(access_key: Union[str, None] = None,
+                    env_var_name: str = 'SERMOS_ACCESS_KEY'):
+     """ Simple helper to get the admin server access key in a standard
+     fashion. If one is provided, return it back. If not, look in the
+     environment for `env_var_name`. If that doesn't exist, raise a useful
+     error.
+
+     If this is a local deployment, no access key is required/relevant,
+     so simply return 'local'.
+     """
+     if access_key is not None:
+         return access_key
+
+     if not USING_SERMOS_CLOUD:
+         return LOCAL_DEPLOYMENT_VALUE  # e.g. 'local'
+
+     try:
+         return os.environ[env_var_name]
+     except KeyError:
+         raise KeyError(
+             f"{env_var_name} not found in this environment. Find a valid "
+             "access key in your Sermos Cloud administration console.")
+
+
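A minimal usage sketch of the key-resolution order (the key values here are hypothetical):

    import os
    os.environ['SERMOS_ACCESS_KEY'] = 'abc-123'
    get_access_key()            # -> 'abc-123' (read from the environment)
    get_access_key('explicit')  # -> 'explicit' (an explicit value wins)
    # With DEFAULT_BASE_URL=local, get_access_key() -> 'local'
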
+ # TODO cast to UUID?
+ def get_deployment_id(deployment_id: Union[str, None] = None,
+                       env_var_name: str = 'SERMOS_DEPLOYMENT_ID'):
+     """ Simple helper to get the deployment id in a standard fashion. Look
+     in the environment for `env_var_name`. If that doesn't exist, raise a
+     useful error.
+
+     If this is a local deployment, no deployment id is required/relevant,
+     so this will simply return 'local' in the event the DEFAULT_BASE_URL is
+     set to the LOCAL_DEPLOYMENT_VALUE ('local' by default) in the
+     environment.
+     """
+     if deployment_id is not None:
+         return deployment_id
+
+     if not USING_SERMOS_CLOUD:
+         return LOCAL_DEPLOYMENT_VALUE  # e.g. 'local'
+
+     try:
+         return os.environ[env_var_name]
+     except KeyError:
+         raise KeyError(
+             f"{env_var_name} not found in this environment. Note: this is "
+             "required when running a Celery worker as `beat`. Find this ID "
+             "in your administration console. For local development, this "
+             "can be any arbitrary string.")
+
+
+ def load_json_config_from_redis(key: str) -> Any:
+     """ Load a JSON key from Redis. Special carve-out for keys explicitly
+     set to "none".
+     """
+     val = redis_conn.get(key)
+     if val is None or val.decode('utf-8').lower() == 'none':
+         return None
+     return json.loads(val)
+
+
+ def set_json_config_to_redis(key: str,
+                              data: Union[dict, None],
+                              refresh_rate: int = CONFIG_REFRESH_RATE):
+     """ For Admin API actions (e.g. schedules/pipelines), deployments cache
+     results. The standard method for doing this is through a refresh key,
+     which is set in Redis to expire after CONFIG_REFRESH_RATE. This will
+     set the cached key.
+
+     The rationale for manually setting a "None" key instead of simply
+     skipping it is to protect against the case of a spammed config request
+     for an unknown pipeline, for example. This still limits our requests to
+     Sermos Cloud based on the refresh rate, even in that scenario.
+     """
+     if data is None:
+         data = 'None'
+     else:
+         data = json.dumps(data)
+
+     redis_conn.setex(key, refresh_rate, data)
+
+
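A short sketch of the negative-caching round trip these two helpers implement (assuming a reachable Redis instance; the keys are hypothetical):

    set_json_config_to_redis('config:unknown', None)   # stores the literal string 'None'
    load_json_config_from_redis('config:unknown')      # -> None until the key expires
    set_json_config_to_redis('config:known', {'a': 1})
    load_json_config_from_redis('config:known')        # -> {'a': 1}
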
+ def _generate_api_url(endpoint: str = ''):
+     """ Provide a normalized url based on the base url and endpoint, and
+     add the deployment_id to the url, which is required for all default
+     pipeline/schedule endpoints if using Sermos Cloud.
+
+     The Sermos Cloud API spec bases everything on the notion of
+     `deployments`, so if you are rolling your own 'non-local' API, you will
+     need to mock this concept in order to use the built-in helper functions
+     for retrieving pipelines and schedules from an API source.
+     """
+     deployment_id = get_deployment_id()  # From env if None
+     return urljoin(DEFAULT_BASE_URL, f'deployments/{deployment_id}/{endpoint}')
+
+
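For illustration, with a hypothetical base url of `https://cloud.sermos.ai/api/v1/` and a deployment id of `abc123`:

    _generate_api_url('pipelines')
    # -> 'https://cloud.sermos.ai/api/v1/deployments/abc123/pipelines'
    # Note: urljoin() drops the final path segment of the base url unless it
    # ends with a trailing slash, so DEFAULT_BASE_URL should end in '/'.
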
+ def _retrieve_and_cache_config(key: str,
+                                admin_api_endpoint: str,
+                                access_key: str,
+                                refresh_rate: int = CONFIG_REFRESH_RATE) -> Any:
+     """ Attempt to load a configuration (pipeline/schedule) from cache. If
+     not available, retrieve the API response from the Sermos Config Server
+     and cache the response for CONFIG_REFRESH_RATE seconds in local Redis.
+     """
+     conf = load_json_config_from_redis(key)
+     if conf is not None:
+         return conf
+
+     # Ask Sermos Cloud (Note: Sermos Cloud's API expects `apikey`)
+     headers = {
+         'apikey': access_key,
+     }
+
+     params = {
+         'page_size': DEFAULT_CONFIG_RETRIEVAL_PAGE_SIZE,
+         'page': 1
+     }
+
+     r = requests.get(admin_api_endpoint, headers=headers, verify=True,
+                      params=params)
+
+     data = None
+     if r.status_code == 200:
+         data = r.json()
+     else:
+         logger.warning(f"Non-200 response retrieving {admin_api_endpoint}: "
+                        f"{r.status_code}, {r.reason}")
+
+     # There's a chance we need to request ALL schedule configs from Sermos
+     # Cloud for the scheduled tasks. Loop and grab all of them. Guard
+     # against `data` being None if the initial request failed.
+     while key == SCHEDULE_CONFIG_CACHE_KEY and data is not None and \
+             len(data['data']['results']) < data['data']['count']:
+         params['page'] += 1
+         r = requests.get(admin_api_endpoint, headers=headers, verify=True,
+                          params=params)
+         if r.status_code == 200:
+             paginated_data = r.json()
+             data['data']['results'] = data['data']['results'] + \
+                 paginated_data['data']['results']
+         else:
+             logger.warning(
+                 f"Non-200 response retrieving {admin_api_endpoint}: "
+                 f"{r.status_code}, {r.reason}")
+             break
+
+     # Cache result
+     if data is not None:
+         set_json_config_to_redis(key, data, refresh_rate)
+
+     return data
+
+
+ def retrieve_latest_pipeline_config(
+         pipeline_id: Union[str, None] = None,
+         access_key: Union[str, None] = None,
+         refresh_rate: int = CONFIG_REFRESH_RATE) -> Union[dict, list]:
+     """ Retrieve the 'latest' pipeline configuration.
+
+     Sermos can be deployed in 'local' mode by setting DEFAULT_BASE_URL=local
+     in your environment. In this case, Sermos will retrieve the latest
+     configuration from the local filesystem, specifically looking inside
+     the sermos.yaml file.
+
+     If the DEFAULT_BASE_URL is anything else, this will assume that it is a
+     valid API base url and make a request. The request will be formatted to
+     match what Sermos Cloud expects for seamless Sermos Cloud deployments.
+     However, you can provide any base url and stand up your own API if
+     desired!
+
+     This utilizes redis (required for Sermos-based pipelines/scheduled
+     tasks) to cache the result for a predetermined amount of time before
+     requesting an update. This is because pipelines/tasks can be invoked
+     rapidly but do not change frequently.
+     """
+     # If this is a LOCAL deployment, look to sermos.yaml directly
+     if not USING_SERMOS_CLOUD:
+         sermos_config = load_sermos_config()
+         if 'pipelines' in sermos_config:
+             pipelines = []
+             found_pipeline = None
+             for p_id, config in sermos_config['pipelines'].items():
+                 config['sermosPipelineId'] = p_id
+                 if pipeline_id == p_id:
+                     found_pipeline = config
+                     break
+                 pipelines.append(config)
+
+             if pipeline_id:
+                 if found_pipeline:
+                     return found_pipeline
+                 raise ValueError(f'Invalid pipeline {pipeline_id}')
+
+             return pipelines
+         return None
+
+     # If this is a CLOUD deployment, generate a valid API url and ask the
+     # API service for pipeline configuration. If this deployment is set up
+     # to cache results, do so.
+     cache_key = PIPELINE_CONFIG_CACHE_KEY.format(pipeline_id)
+     access_key = get_access_key(access_key)  # From env if None
+
+     # Generate pipeline specific API endpoint. If pipeline_id
+     # is None, then we're asking for 'all' pipelines.
+     api_url = _generate_api_url('pipelines')
+     if pipeline_id is not None:
+         api_url = urljoin(api_url + '/', pipeline_id)  # Add pipeline ID
+
+     # Retrieve (and cache) result - this will be the exact result from the
+     # API response.
+     data = _retrieve_and_cache_config(cache_key, api_url, access_key,
+                                       refresh_rate)
+     if data:
+         if pipeline_id:
+             return data['data']
+         return data['data']['results']
+     return None
+
+
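A local-mode sketch (assuming DEFAULT_BASE_URL=local and a sermos.yaml containing a hypothetical `demo-pipeline` entry under `pipelines:`):

    config = retrieve_latest_pipeline_config(pipeline_id='demo-pipeline')
    config['sermosPipelineId']                        # -> 'demo-pipeline'
    all_configs = retrieve_latest_pipeline_config()   # list of every pipeline config
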
+ def retrieve_latest_schedule_config(access_key: Union[str, None] = None,
+                                     refresh_rate: int = CONFIG_REFRESH_RATE):
+     """ Retrieve the 'latest' scheduled tasks configuration.
+
+     Sermos can be deployed in 'local' mode by setting DEFAULT_BASE_URL=local
+     in your environment. In this case, Sermos will retrieve the latest
+     configuration from the local filesystem, specifically looking inside
+     the sermos.yaml file.
+
+     If the DEFAULT_BASE_URL is anything else, this will assume that it is a
+     valid API base url and make a request. The request will be formatted to
+     match what Sermos Cloud expects for seamless Sermos Cloud deployments.
+     However, you can provide any base url and stand up your own API if
+     desired!
+
+     This utilizes redis (required for Sermos-based pipelines/scheduled
+     tasks) to cache the result for a predetermined amount of time before
+     requesting an update. This is because pipelines/tasks can be invoked
+     rapidly but do not change frequently.
+     """
+     if not USING_SERMOS_CLOUD:
+         sermos_config = load_sermos_config()
+         if 'scheduledTasks' in sermos_config:
+             tasks = []
+             for task_id, config in sermos_config['scheduledTasks'].items():
+                 config['sermosScheduledTasksId'] = task_id
+                 tasks.append(config)
+             return tasks
+         return None
+
+     cache_key = SCHEDULE_CONFIG_CACHE_KEY
+     access_key = get_access_key(access_key)  # From env if None
+
+     api_url = _generate_api_url('scheduled_tasks')
+
+     data = _retrieve_and_cache_config(cache_key, api_url, access_key,
+                                       refresh_rate)
+
+     schedules = []
+     if data is None:  # e.g. the API request failed and nothing was cached
+         return schedules
+     for schedule in data['data']['results']:
+         ScheduleSchema = \
+             BaseScheduleSchema.get_by_version(schedule['schemaVersion'])
+         schema = ScheduleSchema()
+         _schedule = schema.load(schedule)
+         _schedule['id'] = schedule['id']
+         schedules.append(_schedule)
+
+     return schedules
+
+
+ def update_schedule_config(new_schedule_config: dict,
+                            access_key: Union[str, None] = None,
+                            schedule_config_endpoint: Union[str, None] = None):
+     """ Tell Sermos to update a deployment's schedule with a new version.
+     """
+     # Don't send status to Sermos Cloud if we're running in local mode
+     if not USING_SERMOS_CLOUD:
+         return True
+
+     access_key = get_access_key(access_key)  # From env if None
+     api_url = _generate_api_url('scheduled_tasks')
+
+     # Ask Sermos Cloud (Note: Sermos Cloud's API expects `apikey`)
+     headers = {'apikey': access_key}
+
+     for scheduled_task in new_schedule_config['schedules']:
+         copy_task = dict(scheduled_task)
+         task_id = copy_task.pop('id')
+         url = f"{api_url}/{task_id}"
+         r = requests.put(url, json=copy_task, headers=headers, verify=True)
+         if r.status_code != 200:
+             logger.error("Unable to update scheduled task in Sermos Cloud")
+             logger.error(r.json())
+             return False
+
+     return True
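The expected payload shape, sketched with hypothetical field names (only `id` is required by this helper, which pops it off to build the PUT url):

    schedule_update = {
        'schedules': [
            {'id': 'task-uuid-123', 'name': 'nightly-job', 'enabled': True}
        ]
    }
    update_schedule_config(schedule_update)  # -> True on success (or in local mode)
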
@@ -0,0 +1,144 @@
+ import logging
+ import networkx as nx
+ from typing import List, Union
+
+ logger = logging.getLogger(__name__)
+
+
+ def get_execution_graph(
+         config: dict,
+         adjacency_key: str = 'dagAdjacency',
+         task_definitions_key: str = 'taskDefinitions') -> nx.DiGraph:
+     """ Generate a directed graph based on a pipeline config's adjacency
+     list and task definitions.
+
+     `dagAdjacency` is a dictionary containing all nodes and their
+     downstream nodes.
+
+     `taskDefinitions` is a dictionary containing the metadata required for
+     each node, such as the worker, model version, etc. This metadata is
+     attached to each node so it can be retrieved directly from the graph.
+     """
+     G = nx.DiGraph()
+
+     # Get our adjacency list and task definitions
+     adjacency_dict = config.get(adjacency_key, {})
+     task_definitions = config.get(task_definitions_key, {})
+     if len(adjacency_dict.keys()) == 0:
+         logger.warning('Adjacency definition `{}` was not found ...'.format(
+             adjacency_key))
+
+     # Build the graph
+     for node in adjacency_dict.keys():
+         adjacent_nodes = adjacency_dict[node]
+
+         # If there are no adjacent nodes, then this is a terminal node
+         if len(adjacent_nodes) == 0:
+             G.add_node(node, attr_dict=task_definitions.get(node, {}))
+             continue
+
+         # Otherwise, add an edge from this node to all adjacent nodes
+         # and attach the task definition metadata to each edge
+         G.add_edges_from([(node, n, task_definitions.get(n, {}))
+                           for n in adjacent_nodes])
+     return G
+
+
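A small sketch of how a pipeline config translates into a graph (the config values here are hypothetical):

    config = {
        'dagAdjacency': {'t1': ['t3'], 't2': ['t3', 't4'], 't4': ['t5']},
        'taskDefinitions': {'t3': {'worker': 'worker-a'}}
    }
    G = get_execution_graph(config)
    list(G.edges)  # -> [('t1', 't3'), ('t2', 't3'), ('t2', 't4'), ('t4', 't5')]
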
+ def find_entry_points(G: nx.DiGraph) -> List[str]:
+     """ Find the entrypoint(s) for this graph.
+
+     An entrypoint is a node for which no predecessors exist.
+     """
+     result = []
+     for node in G.nodes:
+         if len(list(G.predecessors(node))) == 0:
+             result.append(node)
+     return result
+
+
+ def find_successors(G: nx.DiGraph,
+                     nodes: Union[List[str], str],
+                     dedup: bool = True) -> Union[List[str], List[List[str]]]:
+     """ Find the next node(s) for the given graph node(s).
+
+     If dedup is True (default), return a single list of deduplicated
+     values. This is useful when creating a task chain that is comprised
+     of groups that can execute concurrently. If two upstream tasks in the
+     chain each invoke the same downstream task later in the chain, then
+     there is no reason to run that downstream task twice.
+
+     Examples:
+         `G`:
+             t1:
+                 - t3
+             t2:
+                 - t3
+                 - t4
+             t4:
+                 - t5
+         `nodes`: [t1, t2]
+
+         Return with dedup==True: [t3, t4]
+         Return with dedup==False: [[t3], [t3, t4]]
+     """
+     if not isinstance(nodes, list):
+         nodes = [nodes]
+
+     successors = []
+     for node in nodes:
+         successors.append(list(G.successors(node)))
+
+     # Return as-is if we're not deduplicating.
+     if not dedup:
+         return successors
+
+     # Deduplicate the list of successors while preserving order.
+     deduped_successors = []
+     for group in successors:
+         for node in group:
+             if node not in deduped_successors:
+                 deduped_successors.append(node)
+     return deduped_successors
+
+
+ def get_chainable_tasks(G: nx.DiGraph,
+                         starting_nodes: Union[List[str], None] = None,
+                         graph_tasks: Union[List[List[str]], None] = None
+                         ) -> List[List[str]]:
+     """ Recursive function to get a list of grouped nodes that can be used
+     in a task chain.
+
+     On the first call, the entrypoint(s) form the initial group. Each
+     recursive call then uses the successors of the previous group as the
+     next group of starting nodes, so the return value is something like:
+         [
+             [t1, t2],
+             [t3, t4],
+             [t5]
+         ]
+
+     Note: `graph_tasks` defaults to None rather than a mutable `[]`
+     default so that repeated calls do not share state.
+     """
+     if graph_tasks is None:
+         graph_tasks = []
+     if starting_nodes is None:
+         starting_nodes = find_entry_points(G)
+         graph_tasks.append(starting_nodes)
+
+     successors = find_successors(G, starting_nodes)
+     if len(successors) == 0:
+         return graph_tasks
+
+     graph_tasks.append(successors)
+     return get_chainable_tasks(G, successors, graph_tasks)
+
+
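Continuing the hypothetical graph sketch from above, the grouped chain resolves as:

    find_entry_points(G)    # -> ['t1', 't2']
    get_chainable_tasks(G)  # -> [['t1', 't2'], ['t3', 't4'], ['t5']]
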
+ def find_all_nodes(G: nx.DiGraph) -> List[str]:
+     """ Get a list of all nodes in the graph.
+     """
+     return list(G.nodes)
+
+
+ def find_all_edges(G: nx.DiGraph) -> List[str]:
+     """ Get a list of all edges in the graph.
+     """
+     return list(G.edges)
@@ -0,0 +1,119 @@
+ """ Utilities for loading modules/callables based on strings.
+ """
+ import os
+ import re
+ import logging
+ import importlib
+ from typing import Callable
+ from pypeline.constants import SERMOS_ACCESS_KEY, SERMOS_CLIENT_PKG_NAME
+
+ logger = logging.getLogger(__name__)
+
+
+ class SermosModuleLoader(object):
+     """ Helper class to load modules / classes / methods based on a path
+     string.
+     """
+     def get_module(self, resource_dot_path: str):
+         """ Retrieve the module based on a 'resource dot path'.
+             e.g. package.subdir.feature_file.MyCallable
+         """
+         module_path = '.'.join(resource_dot_path.split('.')[:-1])
+         module = importlib.import_module(module_path)
+         return module
+
+     def get_callable_name(self, resource_dot_path: str) -> str:
+         """ Retrieve the callable's name based on a config string.
+             e.g. package.subdir.feature_file.MyCallable
+         """
+         callable_name = resource_dot_path.split('.')[-1]
+         return callable_name
+
+     def get_callable(self, resource_dot_path: str) -> Callable:
+         """ Retrieve the actual handler class based on a config string.
+             e.g. package.subdir.feature_file.MyCallable
+         """
+         module = self.get_module(resource_dot_path)
+         callable_name = self.get_callable_name(resource_dot_path)
+         return getattr(module, callable_name)
+
+
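A brief usage sketch, using a stdlib dot path purely for illustration:

    loader = SermosModuleLoader()
    fn = loader.get_callable('json.dumps')  # imports `json`, returns `json.dumps`
    fn({'a': 1})                            # -> '{"a": 1}'
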
+ def normalized_pkg_name(pkg_name: str, dashed: bool = False):
+     """ We maintain consistency by always specifying the package name as
+     the "dashed version".
+
+     Python/setuptools will replace "_" with "-" but resource_filename()
+     expects the exact directory name, essentially. In order to keep it
+     simple upstream and *always* provide the package name as the dashed
+     version, we do the replacement here to 'normalize' both versions to
+     whichever convention you need at the time.
+
+     if `dashed`:
+         my-package-name --> my-package-name
+         my_package_name --> my-package-name
+
+     else:
+         my-package-name --> my_package_name
+         my_package_name --> my_package_name
+     """
+     if dashed:
+         return str(pkg_name).replace('_', '-')
+     return str(pkg_name).replace('-', '_')
+
+
+ def get_client_pkg_name(pkg_name: str = None):
+     """ Verify the package name provided; get it from the environment if
+     None.
+
+     Raise if neither provided nor found.
+
+     Arguments:
+         pkg_name (optional): Directory name for your Python
+             package. e.g. my_package_name . If none provided, will check
+             environment for `SERMOS_CLIENT_PKG_NAME`. If not found,
+             will raise a ValueError.
+     """
+     pkg_name = pkg_name if pkg_name else SERMOS_CLIENT_PKG_NAME
+     if pkg_name is None:
+         msg = "Unable to find `pkg-name` in CLI arguments nor in "\
+               "environment under `{}`".format('SERMOS_CLIENT_PKG_NAME')
+         logger.error(msg)
+         raise ValueError(msg)
+     return pkg_name
+
+
+ def match_prefix(string: str, prefix_p: str) -> bool:
+     """ For a given string, determine whether it begins with the provided
+     prefix_p.
+     """
+     pattern = re.compile('^(' + prefix_p + ').*')
+     if pattern.match(string):
+         return True
+     return False
+
+
+ def match_suffix(string: str, suffix_p: str) -> bool:
+     """ For a given string, determine whether it ends with the provided
+     suffix_p.
+     """
+     pattern = re.compile('.*(' + suffix_p + ')$')
+     if pattern.match(string):
+         return True
+     return False
+
+
+ def match_prefix_suffix(string: str, prefix_p: str, suffix_p: str) -> bool:
+     """ For a given string, determine whether it starts with prefix_p and
+     ends with suffix_p.
+     """
+     if match_prefix(string, prefix_p) and match_suffix(string, suffix_p):
+         return True
+     return False
+
+
+ def find_from_environment(prefix_p: str, suffix_p: str) -> list:
+     """ Find all environment variables that match the prefix and suffix.
+
+     Can provide any regex compatible string as values.
+     """
+     matching_vars = []
+     environment_vars = os.environ
+     for var in environment_vars:
+         if match_prefix_suffix(var, prefix_p, suffix_p):
+             matching_vars.append(var)
+     return matching_vars
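A short sketch (the variable names are hypothetical; ordering follows the process environment):

    import os
    os.environ['MY_WORKER_NAME'] = 'w1'
    os.environ['MY_QUEUE_NAME'] = 'q1'
    find_from_environment('MY_', '_NAME')  # -> ['MY_WORKER_NAME', 'MY_QUEUE_NAME']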