scalable-pypeline 1.2.3__py2.py3-none-any.whl → 2.0.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. pypeline/__init__.py +1 -1
  2. pypeline/barrier.py +34 -0
  3. pypeline/composition.py +349 -0
  4. pypeline/constants.py +51 -84
  5. pypeline/dramatiq.py +470 -0
  6. pypeline/extensions.py +9 -8
  7. pypeline/flask/__init__.py +3 -5
  8. pypeline/flask/api/pipelines.py +109 -148
  9. pypeline/flask/api/schedules.py +14 -39
  10. pypeline/flask/decorators.py +18 -53
  11. pypeline/flask/flask_pypeline.py +156 -0
  12. pypeline/middleware.py +61 -0
  13. pypeline/pipeline_config_schema.py +105 -92
  14. pypeline/pypeline_yaml.py +458 -0
  15. pypeline/schedule_config_schema.py +35 -120
  16. pypeline/utils/config_utils.py +52 -310
  17. pypeline/utils/module_utils.py +35 -71
  18. pypeline/utils/pipeline_utils.py +161 -0
  19. scalable_pypeline-2.0.2.dist-info/METADATA +217 -0
  20. scalable_pypeline-2.0.2.dist-info/RECORD +27 -0
  21. scalable_pypeline-2.0.2.dist-info/entry_points.txt +3 -0
  22. tests/fixtures/__init__.py +0 -1
  23. pypeline/celery.py +0 -206
  24. pypeline/celery_beat.py +0 -254
  25. pypeline/flask/api/utils.py +0 -35
  26. pypeline/flask/flask_sermos.py +0 -156
  27. pypeline/generators.py +0 -196
  28. pypeline/logging_config.py +0 -171
  29. pypeline/pipeline/__init__.py +0 -0
  30. pypeline/pipeline/chained_task.py +0 -70
  31. pypeline/pipeline/generator.py +0 -254
  32. pypeline/sermos_yaml.py +0 -442
  33. pypeline/utils/graph_utils.py +0 -144
  34. pypeline/utils/task_utils.py +0 -552
  35. scalable_pypeline-1.2.3.dist-info/METADATA +0 -163
  36. scalable_pypeline-1.2.3.dist-info/RECORD +0 -33
  37. scalable_pypeline-1.2.3.dist-info/entry_points.txt +0 -2
  38. tests/fixtures/s3_fixtures.py +0 -52
  39. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/LICENSE +0 -0
  40. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/WHEEL +0 -0
  41. {scalable_pypeline-1.2.3.dist-info → scalable_pypeline-2.0.2.dist-info}/top_level.txt +0 -0
pypeline/sermos_yaml.py DELETED
@@ -1,442 +0,0 @@
1
- """ Definition of the `sermos.yaml` file. This is only relevant/used for
2
- managed deployments through Sermos.ai. If self-hosting, safely disregard this
3
- yaml format, no `sermos.yaml` is required for your application.
4
-
5
- If using, a basic file may look like::
6
- serviceConfig:
7
- - name: sermos-worker
8
- registeredTasks:
9
- - handler: sermos_demo_client.workers.demo_worker.demo_worker_task
10
- - handler: sermos_demo_client.workers.demo_worker.demo_model_task
11
-
12
- pipelines:
13
- demo-pipeline:
14
- name: demo-pipeline
15
- description: Demo Pipeline.
16
- schemaVersion: 1
17
- config:
18
- dagAdjacency:
19
- node_a:
20
- - node_b
21
- - node_c
22
- metadata:
23
- maxRetry: 3
24
- maxTtl: 60
25
- queue: default-task-queue
26
- taskDefinitions:
27
- node_a:
28
- handler: sermos_demo_client.workers.demo_pipeline.demo_pipeline_node_a
29
- node_b:
30
- handler: sermos_demo_client.workers.demo_pipeline.demo_pipeline_node_b
31
- queue: node-b-queue
32
- node_c:
33
- handler: sermos_demo_client.workers.demo_pipeline.demo_pipeline_node_c
34
-
35
- scheduledTasks:
36
- demo-model-task:
37
- name: Demo Model Task
38
- enabled: true
39
- config:
40
- scheduleType: interval
41
- task: sermos_demo_client.workers.demo_worker.demo_model_task
42
- queue: default-task-queue
43
- schedule:
44
- every: 60
45
- period: seconds
46
- schemaVersion: 1
47
-
48
- """
49
- import re
50
- import os
51
- import logging
52
- import pkg_resources
53
- import yaml
54
- from yaml.loader import SafeLoader
55
- from marshmallow import Schema, fields, pre_load, EXCLUDE, INCLUDE,\
56
- validates_schema
57
- from marshmallow.validate import OneOf
58
- from marshmallow.exceptions import ValidationError
59
- from pypeline.utils.module_utils import SermosModuleLoader, normalized_pkg_name
60
- from pypeline.constants import SERMOS_YAML_PATH, SERMOS_CLIENT_PKG_NAME
61
- from pypeline.pipeline_config_schema import BasePipelineSchema
62
- from pypeline.schedule_config_schema import BaseScheduleSchema
63
-
64
- logger = logging.getLogger(__name__)
65
-
66
-
67
- class InvalidPackagePath(Exception):
68
- pass
69
-
70
-
71
- class InvalidSermosConfig(Exception):
72
- pass
73
-
74
-
75
- class MissingSermosConfig(Exception):
76
- pass
77
-
78
-
79
- class ExcludeUnknownSchema(Schema):
80
- class Meta:
81
- unknown = EXCLUDE
82
-
83
-
84
- class NameSchema(Schema):
85
- """ Validated name string field.
86
- """
87
- name = fields.String(
88
- required=True,
89
- description="Name for service or image. Must include "
90
- "only alphanumeric characters along with `_` and `-`.",
91
- example="my-service-name")
92
-
93
- @pre_load
94
- def validate_characters(self, item, **kwargs):
95
- """ Ensure name field conforms to allowed characters
96
- """
97
- valid_chars = r'^[\w\d\-\_]+$'
98
- if not bool(re.match(valid_chars, item['name'])):
99
- raise ValueError(
100
- f"Invalid name: {item['name']}. Only alphanumeric characters "
101
- "allowed along with `-` and `_`.")
102
- return item
103
-
104
-
105
- class SermosRegisteredTaskDetailConfigSchema(Schema):
106
- handler = fields.String(
107
- required=True,
108
- description="Full path to the Method handles work / pipeline tasks.",
109
- example="sermos_customer_client.workers.worker_group.useful_worker")
110
-
111
- event = fields.Raw(
112
- required=False,
113
- unknown=INCLUDE,
114
- description="Arbitrary user data, passed through `event` arg in task.")
115
-
116
-
117
- class SermosCeleryWorkerConfigSchema(Schema):
118
- """ Attributes for a celery worker. This worker will run all of the
119
- pipelines and scheduled tasks.
120
- """
121
- registeredTasks = fields.List(
122
- fields.Nested(SermosRegisteredTaskDetailConfigSchema, required=True),
123
- required=False,
124
- _required=True,
125
- description="List of task handlers to register for to your Sermos app."
126
- )
127
-
128
-
129
- class SermosServiceConfigSchema(ExcludeUnknownSchema,
130
- SermosCeleryWorkerConfigSchema, NameSchema):
131
- """ Base service config object definition for workers.
132
- """
133
- pass
134
-
135
-
136
- class SermosYamlSchema(ExcludeUnknownSchema):
137
- """ The primary `sermos.yaml` file schema. This defines all available
138
- properties in a valid Sermos configuration file.
139
- """
140
- serviceConfig = fields.List(
141
- fields.Nested(SermosServiceConfigSchema,
142
- required=True,
143
- description="Core service configuration."),
144
- description="List of workers for Sermos to manage.",
145
- required=True)
146
-
147
- pipelines = fields.Dict(keys=fields.String(),
148
- values=fields.Nested(BasePipelineSchema),
149
- description="List of pipelines",
150
- required=False)
151
-
152
- scheduledTasks = fields.Dict(keys=fields.String(),
153
- values=fields.Nested(BaseScheduleSchema),
154
- description="List of scheduled tasks",
155
- required=False)
156
-
157
- def validate_errors(self, schema: Schema, value: dict):
158
- """ Run Marshmallow validate() and raise if any errors
159
- """
160
- schema = schema()
161
- errors = schema.validate(value)
162
- if len(errors.keys()) > 0:
163
- raise ValidationError(errors)
164
-
165
- @validates_schema
166
- def validate_schema(self, data, **kwargs):
167
- """ Additional validation.
168
-
169
- Nested fields that are not required are not validated by Marshmallow
170
- by default. Do a single level down of validation for now.
171
-
172
- imageConfig can provide *either* an install command for Sermos
173
- to use to build the image for customer *or* a Docker repository
174
- for Sermos to pull.
175
- """
176
- # Validate nested
177
- key_schema_pairs = (
178
- ('serviceConfig', SermosServiceConfigSchema),
179
- )
180
- for k_s in key_schema_pairs:
181
- val = data.get(k_s[0], None)
182
- if val is not None:
183
- if type(val) == list:
184
- for v in val:
185
- self.validate_errors(k_s[1], v)
186
- else:
187
- self.validate_errors(k_s[1], val)
188
-
189
- # Validate the services. We list every service schema field as not
190
- # required in order to use them as mixins for a generic service object,
191
- # however, they ARE required, so validate here using the custom
192
- # metadata property `_required`. Default to value of `required`.
193
- for service in data.get('serviceConfig'):
194
- schema = SermosCeleryWorkerConfigSchema
195
- for field in schema().fields:
196
- try:
197
- if schema().fields[field].metadata.get(
198
- '_required',
199
- getattr(schema().fields[field], 'required')):
200
- assert field in service
201
- except AssertionError:
202
- raise ValidationError(
203
- f"`{field}` missing in worker definition.")
204
-
205
- # Validate unique pipeline ids
206
- if 'pipelines' in data:
207
- pipeline_ids = set()
208
- for pipeline_id, pipeline_data in data['pipelines'].items():
209
- if pipeline_id in pipeline_ids:
210
- raise ValidationError("All pipeline ids must be unique!")
211
- pipeline_ids.add(pipeline_id)
212
- schema_version = pipeline_data['schemaVersion']
213
- PipelineSchema = \
214
- BasePipelineSchema.get_by_version(schema_version)
215
- self.validate_errors(PipelineSchema, pipeline_data)
216
-
217
- # Validate unique scheduled tasks names
218
- if 'scheduledTasks' in data:
219
- task_ids = set()
220
- for task_id, task_data in data['scheduledTasks'].items():
221
- if task_id in task_ids:
222
- raise ValidationError("All schedule ids must be unique!")
223
- task_ids.add(task_id)
224
- schema_version = task_data['schemaVersion']
225
- TaskSchema = BaseScheduleSchema.get_by_version(schema_version)
226
- self.validate_errors(TaskSchema, task_data)
227
-
228
-
229
- class YamlPatternConstructor():
230
- """ Adds a pattern resolver + constructor to PyYaml.
231
-
232
- Typical/default usage is for parsing environment variables
233
- in a yaml file but this can be used for any pattern you provide.
234
-
235
- See: https://pyyaml.org/wiki/PyYAMLDocumentation
236
- """
237
- def __init__(self,
238
- env_var_pattern: str = None,
239
- add_constructor: bool = True):
240
- self.env_var_pattern = env_var_pattern
241
- if self.env_var_pattern is None:
242
- # Default pattern is: ${VAR:default}
243
- self.env_var_pattern = r'^\$\{(.*)\}$'
244
- self.path_matcher = re.compile(self.env_var_pattern)
245
-
246
- if add_constructor:
247
- self.add_constructor()
248
-
249
- def _path_constructor(self, loader, node):
250
- """ Extract the matched value, expand env variable,
251
- and replace the match
252
-
253
- TODO: Would need to update this (specifically the parsing) if any
254
- pattern other than our default (or a highly compatible variation)
255
- is provided.
256
- """
257
- # Try to match the correct env variable pattern in this node's value
258
- # If the value does not match the pattern, return None (which means
259
- # this node will not be parsed for ENV variables and instead just
260
- # returned as-is).
261
- env_var_name = re.match(self.env_var_pattern, node.value)
262
- try:
263
- env_var_name = env_var_name.group(1)
264
- except AttributeError:
265
- return None
266
-
267
- # If we get down here, then the 'node.value' matches our specified
268
- # pattern, so try to parse. env_var_name is the value inside ${...}.
269
- # Split on `:`, which is our delimiter for default values.
270
- env_var_name_split = env_var_name.split(':')
271
-
272
- # Attempt to retrieve the environment variable...from the environment
273
- env_var = os.environ.get(env_var_name_split[0], None)
274
-
275
- if env_var is None: # Nothing found in environment
276
- # If a default was provided (e.g. VAR:default), return that.
277
- # We join anything after first element because the default
278
- # value might be a URL or something with a colon in it
279
- # which would have 'split' above
280
- if len(env_var_name_split) > 1:
281
- return ":".join(env_var_name_split[1:])
282
- return 'unset' # Return 'unset' if not in environ nor default
283
- return env_var
284
-
285
- def add_constructor(self):
286
- """ Initialize PyYaml with ability to resolve/load environment
287
- variables defined in a yaml template when they exist in
288
- the environment.
289
-
290
- Add to SafeLoader in addition to standard Loader.
291
- """
292
- # Add the `!env_var` tag to any scalar (value) that matches the
293
- # pattern self.path_matcher. This allows the template to be much more
294
- # intuitive vs needing to add !env_var to the beginning of each value
295
- yaml.add_implicit_resolver('!env_var', self.path_matcher)
296
- yaml.add_implicit_resolver('!env_var',
297
- self.path_matcher,
298
- Loader=SafeLoader)
299
-
300
- # Add constructor for the tag `!env_var`, which is a function that
301
- # converts a node of a YAML representation graph to a native Python
302
- # object.
303
- yaml.add_constructor('!env_var', self._path_constructor)
304
- yaml.add_constructor('!env_var',
305
- self._path_constructor,
306
- Loader=SafeLoader)
307
-
308
-
309
- def parse_config_file(sermos_yaml: str):
310
- """ Parse the `sermos.yaml` file when it's been loaded.
311
-
312
- Arguments:
313
- sermos_yaml (required): String of loaded sermos.yaml file.
314
- """
315
- YamlPatternConstructor() # Add our env variable parser
316
- try:
317
- sermos_yaml_schema = SermosYamlSchema()
318
- # First suss out yaml issues
319
- sermos_config = yaml.safe_load(sermos_yaml)
320
- # Then schema issues
321
- sermos_config = sermos_yaml_schema.load(sermos_config)
322
- except ValidationError as e:
323
- msg = "Invalid Sermos configuration due to {}"\
324
- .format(e.messages)
325
- logger.error(msg)
326
- raise InvalidSermosConfig(msg)
327
- except Exception as e:
328
- msg = "Invalid Sermos configuration, likely due to invalid "\
329
- "YAML formatting ..."
330
- logger.exception("{} {}".format(msg, e))
331
- raise InvalidSermosConfig(msg)
332
- return sermos_config
333
-
334
-
335
- def _get_pkg_name(pkg_name: str) -> str:
336
- """ Retrieve the normalized package name.
337
- """
338
- if pkg_name is None:
339
- pkg_name = SERMOS_CLIENT_PKG_NAME # From environment
340
- if pkg_name is None:
341
- return None
342
- return normalized_pkg_name(pkg_name)
343
-
344
-
345
- def load_sermos_config(pkg_name: str = None,
346
- sermos_yaml_filename: str = None,
347
- as_dict: bool = True):
348
- """ Load and parse the `sermos.yaml` file. Issue usable exceptions for
349
- known error modes so bootstrapping can handle appropriately.
350
-
351
- Arguments:
352
- pkg_name (required): Directory name for your Python
353
- package. e.g. my_package_name . If none provided, will check
354
- environment for `SERMOS_CLIENT_PKG_NAME`. If not found,
355
- will exit.
356
- sermos_yaml_filename (optional): Relative path to find your
357
- `sermos.yaml` configuration file. Defaults to `sermos.yaml`
358
- which should be found inside your `pkg_name`
359
- as_dict (optional): If true (default), return the loaded sermos
360
- configuration as a dictionary. If false, return the loaded
361
- string value of the yaml file.
362
- """
363
- if sermos_yaml_filename is None:
364
- sermos_yaml_filename = SERMOS_YAML_PATH
365
-
366
- logger.info(f"Loading `sermos.yaml` from package `{pkg_name}` "
367
- f"and file location `{sermos_yaml_filename}` ...")
368
- sermos_config = None
369
-
370
- pkg_name = _get_pkg_name(pkg_name)
371
- if pkg_name is None: # Nothing to retrieve at this point
372
- logger.warning("Unable to retrieve sermos.yaml configuration ...")
373
- return sermos_config
374
-
375
- try:
376
- sermos_config_path = pkg_resources.resource_filename(
377
- pkg_name, sermos_yaml_filename)
378
- except Exception as e:
379
- msg = "Either pkg_name ({}) or sermos_yaml_filename ({}) is "\
380
- "invalid ...".format(pkg_name, sermos_yaml_filename)
381
- logger.error("{} ... {}".format(msg, e))
382
- raise InvalidPackagePath(e)
383
-
384
- try:
385
- with open(sermos_config_path, 'r') as f:
386
- sermos_yaml = f.read()
387
- sermos_config = parse_config_file(sermos_yaml)
388
- except InvalidSermosConfig as e:
389
- raise
390
- except FileNotFoundError as e:
391
- msg = "Sermos config file could not be found at path {} ...".format(
392
- sermos_config_path)
393
- raise MissingSermosConfig(msg)
394
- except Exception as e:
395
- raise e
396
- if as_dict:
397
- return sermos_config
398
- return yaml.safe_dump(sermos_config)
399
-
400
-
401
- def load_client_config_and_version(pkg_name: str = None,
402
- sermos_yaml_filename: str = None):
403
- """ Load and parse the `sermos.yaml` file and a client package's version.
404
-
405
- Arguments:
406
- pkg_name (required): Directory name for your Python
407
- package. e.g. my_package_name . If none provided, will check
408
- environment for `SERMOS_CLIENT_PKG_NAME`. If not found,
409
- will exit.
410
- sermos_yaml_filename (optional): Relative path to find your
411
- `sermos.yaml` configuration file. Defaults to `sermos.yaml`
412
- which should be found inside your `pkg_name`
413
- as_dict (optional): If true (default), return the loaded sermos
414
- configuration as a dictionary. If false, return the loaded
415
- string value of the yaml file.
416
-
417
- For this to work properly, the provided package must be installed in the
418
- same environment as this Sermos package and it must have a `__version__`
419
- variable inside its `__init__.py` file, e.g. `__version__ = '0.0.0'`
420
- """
421
- sermos_config = None
422
- client_version = None
423
-
424
- pkg_name = _get_pkg_name(pkg_name)
425
-
426
- try:
427
- loader = SermosModuleLoader()
428
- pkg = loader.get_module(pkg_name + '.__init__')
429
- client_version = getattr(pkg, '__version__', '0.0.0')
430
- sermos_config = load_sermos_config(pkg_name, sermos_yaml_filename)
431
- except MissingSermosConfig as e:
432
- logger.error(e)
433
- except InvalidSermosConfig as e:
434
- logger.error(e)
435
- except InvalidPackagePath as e:
436
- logger.error(e)
437
- except Exception as e:
438
- logger.error("Unable to load client's pkg __version__ or "
439
- "{} config file for package: {} ... {}".format(
440
- sermos_yaml_filename, pkg_name, e))
441
-
442
- return sermos_config, client_version
@@ -1,144 +0,0 @@
1
- import logging
2
- import networkx as nx
3
- from typing import List, Union
4
-
5
- logger = logging.getLogger(__name__)
6
-
7
-
8
- def get_execution_graph(
9
- config: dict,
10
- adjacency_key: str = 'dagAdjacency',
11
- task_definitions_key: str = 'taskDefinitions') -> nx.DiGraph:
12
- """ Generate a directed graph based on a pipeline config's adjacency list
13
- and task definitions.
14
-
15
- `dagAdjacency` is a dictionary containing all nodes and downstream
16
- nodes.
17
-
18
- `taskDefinitions` is a dictionary containing metadata required for
19
- each node such as the worker, model version, etc. This metadata is
20
- attached to each node so it can be retrieved directly from the graph.
21
- """
22
- G = nx.DiGraph()
23
-
24
- # Get our adjacency list and task definitions
25
- adjacency_dict = config.get(adjacency_key, {})
26
- task_definitions = config.get(task_definitions_key, {})
27
- if len(adjacency_dict.keys()) == 0:
28
- logger.warning('Adjacency definition `{}` was not found ...'.format(
29
- adjacency_key))
30
-
31
- # Build the graph
32
- for node in adjacency_dict.keys():
33
- adjacent_nodes = adjacency_dict[node]
34
-
35
- # If no adjacent nodes, then this is a terminal node
36
- if len(adjacent_nodes) == 0:
37
- G.add_node(node, attr_dict=task_definitions.get(node, {}))
38
- continue
39
-
40
- # Otherwise, we'll add an edge from this node to all adjacent nodes
41
- # and add the task definition metadata to the edge
42
- G.add_edges_from([(node, n, task_definitions.get(n, {}))
43
- for n in adjacent_nodes])
44
- return G
45
-
46
-
47
- def find_entry_points(G: nx.DiGraph) -> List[str]:
48
- """ Find the entrypoint(s) for this graph.
49
-
50
- An entrypoint is one for which no predecessors exist.
51
- """
52
- result = []
53
- for node in G.nodes:
54
- if len(list(G.predecessors(node))) == 0:
55
- result.append(node)
56
- return result
57
-
58
-
59
- def find_successors(G: nx.DiGraph,
60
- nodes: Union[List[str], str],
61
- dedup: bool = True) -> Union[List[str], List[List[str]]]:
62
- """ Find the next point(s) for graph node(s).
63
-
64
- If dedup is True (default), return a single list of deduplicated
65
- values. This is useful when creating a task chain that is comprised
66
- of groups that can execute concurrently. If two upstream tasks in the
67
- chain each invoke the same downstream task later in the chain, then
68
- there is no reason to run that downstream task twice.
69
-
70
- Examples:
71
- `G`:
72
- t1:
73
- - t3
74
- t2:
75
- - t3
76
- - t4
77
- t4:
78
- - t5
79
- `nodes`: [t1, t2]
80
-
81
- Return with dedup==True: [t3, t4]
82
- Return with dedup==False: [[t3], [t3, t4]]
83
- """
84
- if type(nodes) != list:
85
- nodes = [nodes]
86
-
87
- successors = []
88
- for node in nodes:
89
- successors.append(list(G.successors(node)))
90
-
91
- # Return as-is if we're not deduplicating.
92
- if not dedup:
93
- return successors
94
-
95
- # Deduplicate the list of successors.
96
- deduped_successors = []
97
- for group in successors:
98
- group = [group] if type(group) != list else group
99
- for node in group:
100
- if node not in deduped_successors:
101
- deduped_successors.append(node)
102
- successors = deduped_successors
103
- return successors
104
-
105
-
106
- def get_chainable_tasks(G: nx.DiGraph,
107
- starting_nodes: List[str] = None,
108
- graph_tasks: list = []) -> List[str]:
109
- """ Recursive function to get a list of grouped nodes that can be used
110
- in a task chain.
111
-
112
- Recursive portion is for everything other than first entrypoint(s)
113
- wherein we can re-call this method with the starting node(s) being the
114
- nodes in the graph that are successors to the entrypoint(s), each
115
- batch of starting nodes is a group, essentially, so return value is
116
- something like:
117
- [
118
- [t1, t2],
119
- [t3, t4],
120
- [t5]
121
- ]
122
- """
123
- if starting_nodes is None:
124
- starting_nodes = find_entry_points(G)
125
- graph_tasks.append(starting_nodes)
126
-
127
- successors = find_successors(G, starting_nodes)
128
- if len(successors) == 0:
129
- return graph_tasks
130
-
131
- graph_tasks.append(successors)
132
- return get_chainable_tasks(G, successors, graph_tasks)
133
-
134
-
135
- def find_all_nodes(G: nx.DiGraph) -> List[str]:
136
- """ Get a list of all nodes in the graph.
137
- """
138
- return list(G.nodes)
139
-
140
-
141
- def find_all_edges(G: nx.DiGraph) -> List[str]:
142
- """ Get a list of all edges in the graph.
143
- """
144
- return list(G.edges)